From b360d8cace6b44a9cb9cc7c3da084b0f7422710d Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 9 Aug 2018 14:21:05 -0700
Subject: [PATCH 001/529] [TEST] force openblas threads to be 1 (#1580)

---
 tests/scripts/task_python_nnvm.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh
index 2fc41980fb3d..790073a2fe8b 100755
--- a/tests/scripts/task_python_nnvm.sh
+++ b/tests/scripts/task_python_nnvm.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
 export PYTHONPATH=nnvm/python:python:topi/python
+# to avoid openblas threading error
+export OMP_NUM_THREADS=1
 
 echo "Running unittest..."
 python -m nose -v nnvm/tests/python/unittest || exit -1

From 0688ceb3deeb9b1fe656ac5cff36f50e1d02b3b1 Mon Sep 17 00:00:00 2001
From: Dayananda V <dayanandasiet@gmail.com>
Date: Fri, 10 Aug 2018 07:11:49 +0530
Subject: [PATCH 002/529] Vulkan TVM Android Support (#1571)

---
 apps/android_rpc/README.md                    | 25 +++--
 .../app/src/main/jni/Application.mk           | 16 ++--
 apps/android_rpc/tests/android_rpc_test.py    | 91 +++++++++++++------
 .../src/main/java/ml/dmlc/tvm/TVMContext.java | 15 +++
 .../main/java/ml/dmlc/tvm/rpc/RPCSession.java | 18 ++++
 python/tvm/rpc/client.py                      |  4 +
 web/tvm_runtime.js                            |  2 +
 7 files changed, 125 insertions(+), 46 deletions(-)

diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 41d361c823ed..eef22f3c7010 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
 python android_rpc_test.py
 ```
 
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = False'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in  [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute.
+On my test device, it gives following results.
 
 ```bash
-TVM: Initializing cython mode...
-[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple'
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500
-[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-Run GPU test ...
-0.000155807 secs/op
 Run CPU test ...
-0.00139824 secs/op
+0.000962932 secs/op
+
+Run GPU(OpenCL Flavor) test ...
+0.000155807 secs/op
+
+[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER
+
+Stack trace returned 10 entries:
+[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3]
+.........
+
+You can still compile vulkan module but cannot run locally
+Run GPU(Vulkan Flavor) test ...
+0.000225198 secs/op
 ```
 
 You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 5bf52bdaffc0..f142e2995777 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -1,9 +1,9 @@
 ifndef config
-	ifneq ("$(wildcard ./config.mk)","")
-	  config ?= config.mk
-	else
-	  config ?= make/config.mk
-	endif
+    ifneq ("$(wildcard ./config.mk)","")
+        config ?= config.mk
+    else
+        config ?= make/config.mk
+    endif
 endif
 
 include $(config)
@@ -16,10 +16,10 @@ APP_STL := c++_static
 
 APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
-	APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+    APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
 
 ifeq ($(USE_VULKAN), 1)
-	APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
-	APP_LDFLAGS += -lvulkan
+    APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+    APP_LDFLAGS += -lvulkan
 endif
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index cfb04c1ca9a9..44618efd45c1 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -21,59 +21,92 @@
 arch = "arm64"
 target = "llvm -target=%s-linux-android" % arch
 
+# whether enable to execute test on OpenCL target
+test_opencl = False
+# whether enable to execute test on Vulkan target
+test_vulkan = False
+
 def test_rpc_module():
     # graph
     n = tvm.convert(1024)
     A = tvm.placeholder((n,), name='A')
     B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    a_np = np.random.uniform(size=1024).astype(A.dtype)
     temp = util.tempdir()
-    s = tvm.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
-    # Build the dynamic lib.
-    # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
-    path_dso1 = temp.relpath("dev_lib2.so")
-    f.export_library(path_dso1, ndk.create_shared)
 
+    # Establish remote connection with target hardware
+    tracker = rpc.connect_tracker(tracker_host, tracker_port)
+    remote = tracker.request(key, priority=0,
+                             session_timeout=60)
+
+    # Compile the Graph for CPU target
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=64)
     s[B].parallel(xi)
     s[B].pragma(xo, "parallel_launch_point")
     s[B].pragma(xi, "parallel_barrier_when_finish")
     f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso2, ndk.create_shared)
-
-    tracker = rpc.connect_tracker(tracker_host, tracker_port)
-    remote = tracker.request(key, priority=0,
-                             session_timeout=60)
+    path_dso_cpu = temp.relpath("cpu_lib.so")
+    f.export_library(path_dso_cpu, ndk.create_shared)
 
+    # Execute the portable graph on cpu target
     print('Run CPU test ...')
     ctx = remote.cpu(0)
-    remote.upload(path_dso2)
+    remote.upload(path_dso_cpu)
     f2 = remote.load_module("cpu_lib.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
     cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
+    print('%g secs/op\n' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    # Compile the Graph for OpenCL target
+    if test_opencl:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
+        path_dso_cl = temp.relpath("dev_lib_cl.so")
+        f.export_library(path_dso_cl, ndk.create_shared)
+
+        print('Run GPU(OpenCL Flavor) test ...')
+        ctx = remote.cl(0)
+        remote.upload(path_dso_cl)
+        f1 = remote.load_module("dev_lib_cl.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+    # Compile the Graph for Vulkan target
+    if test_vulkan:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
+        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
+        f.export_library(path_dso_vulkan, ndk.create_shared)
+
+        print('Run GPU(Vulkan Flavor) test ...')
+        ctx = remote.vulkan(0)
+        remote.upload(path_dso_vulkan)
+        f1 = remote.load_module("dev_lib_vulkan.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    print('Run GPU test ...')
-    ctx = remote.cl(0)
-    remote.upload(path_dso1)
-    f1 = remote.load_module("dev_lib2.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
-    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
-    cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
index 0d108e0a2943..d9051f0d9d4d 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
@@ -30,6 +30,7 @@ public class TVMContext {
     MASK2STR.put(1, "cpu");
     MASK2STR.put(2, "gpu");
     MASK2STR.put(4, "opencl");
+    MASK2STR.put(7, "vulkan");
     MASK2STR.put(8, "metal");
     MASK2STR.put(9, "vpi");
 
@@ -38,6 +39,7 @@ public class TVMContext {
     STR2MASK.put("cuda", 2);
     STR2MASK.put("cl", 4);
     STR2MASK.put("opencl", 4);
+    STR2MASK.put("vulkan", 7);
     STR2MASK.put("metal", 8);
     STR2MASK.put("vpi", 9);
   }
@@ -81,6 +83,19 @@ public static TVMContext opencl() {
     return opencl(0);
   }
 
+  /**
+   * Construct a Vulkan device.
+   * @param devId The device id
+   * @return The created context
+   */
+  public static TVMContext vulkan(int devId) {
+    return new TVMContext(7, devId);
+  }
+
+  public static TVMContext vulkan() {
+    return vulkan(0);
+  }
+
   /**
    * Construct a metal device.
    * @param devId The device id
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
index 0eec9224a40c..8ebf188b0667 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
@@ -143,6 +143,24 @@ public TVMContext cl() {
     return cl(0);
   }
 
+  /**
+   * Construct remote OpenCL device.
+   * @param devId device id.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan(int devId) {
+    return context(7, devId);
+  }
+
+  /**
+   * Construct remote OpenCL device.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan() {
+    return vulkan(0);
+  }
+
+
   /**
    * Construct remote Metal device.
    * @param devId device id.
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index 57f368b0e660..ffbe6eeab6ee 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -130,6 +130,10 @@ def cl(self, dev_id=0):
         """Construct OpenCL device."""
         return self.context(4, dev_id)
 
+    def vulkan(self, dev_id=0):
+        """Construct Vulkan device."""
+        return self.context(7, dev_id)
+
     def metal(self, dev_id=0):
         """Construct Metal device."""
         return self.context(8, dev_id)
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index ef594e9433fb..786745d3ce88 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {};
       1 : "cpu",
       2 : "gpu",
       4 : "opencl",
+      7 : "vulkan",
       8 : "metal",
       9 : "vpi",
       11 : "opengl",
@@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {};
       "cuda": 2,
       "cl": 4,
       "opencl": 4,
+      "vulkan": 7,
       "metal": 8,
       "vpi": 9,
       "opengl": 11,

From ddadde8987aea401950692c570b16a421f47d680 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 9 Aug 2018 18:55:48 -0700
Subject: [PATCH 003/529] [TEAM] merrymercy->code owner (#1581)

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6e3cf55b94b0..2d571ba668ea 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,6 +18,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Yuwei Hu](https://github.com/Huyuwei) TOPI
 - [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
 - [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
 - [Masahiro Masuda](https://github.com/masahi)
@@ -27,7 +28,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
-- [Lianmin Zheng](https://github.com/merrymercy)
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)

From a6ec4faf9628362d9c092933f515c3ca751efe50 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 10 Aug 2018 11:45:09 -0700
Subject: [PATCH 004/529] [AUTOTVM] API change (#1583)

---
 python/tvm/autotvm/__init__.py                |  2 +-
 python/tvm/autotvm/measure/__init__.py        |  2 +-
 python/tvm/autotvm/measure/measure.py         | 10 ++++----
 python/tvm/autotvm/measure/measure_methods.py | 24 +++++++++----------
 python/tvm/autotvm/tuner/tuner.py             |  4 ++--
 tutorials/autotvm/tune_conv2d_cuda.py         |  2 +-
 tutorials/autotvm/tune_nnvm_arm.py            |  6 ++---
 7 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 20426be84aa1..5b312d93d288 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -22,7 +22,7 @@
 from . import tophub
 
 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
     ApplyHistoryBest as apply_history_best
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index b9bd3c37b01d..880dfd1ffe29 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -1,7 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""
 
 from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
-from .measure_methods import request_remote, check_remote, create_measure_batch, use_rpc
+from .measure_methods import request_remote, check_remote, create_measure_batch, rpc
 
 from .local_executor import LocalExecutor
 from .executor import Future, Executor
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 6a05e1a6a349..2325a970bc45 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -49,7 +49,7 @@ def measure_option(measure_func,
                    number=1,
                    repeat=1,
                    timeout=60,
-                   parallel_num=1,
+                   n_parallel=1,
                    do_fork=True,
                    build_func='default',
                    check_correctness=False,
@@ -63,7 +63,7 @@ def measure_option(measure_func,
         and a RPC server silently for the user.
 
         callable: It is a callable function for measurement.
-                  See the return value of measure/measure_methods.py::use_rpc for example.
+                  See the return value of measure/measure_methods.py::rpc for example.
     number : int, optional
         Number of times to do the measurement for average
     repeat : int, optional
@@ -74,7 +74,7 @@ def measure_option(measure_func,
     timeout: int, optional
         Timeout for a whole batch. TimeoutError will be returned as the result if a
         task timeouts.
-    parallel_num: int, optional
+    n_parallel: int, optional
         The number of measurement task that can run in parallel.
         Set this according to the number of cpu cores (for compilation) and
         the number of devices you have (for measuring generate code).
@@ -106,7 +106,7 @@ def measure_option(measure_func,
     and handle the logic of measurement.
 
     Signature:
-    * measure_func (see the return value of measure/measure_methods.py::use_rpc for example)
+    * measure_func (see the return value of measure/measure_methods.py::rpc for example)
     def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
         return measure_results
 
@@ -119,7 +119,7 @@ def build_func(inp, tmp_dir, **kwargs):
         'number': number,
         'repeat': repeat,
         'timeout': timeout,
-        'parallel_num': parallel_num,
+        'n_parallel': n_parallel,
         'do_fork': do_fork,
         'build_func': build_func,
         'check_correctness': check_correctness,
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 6e95a6e435d0..e192ee26ee3e 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -13,8 +13,8 @@
 
 import numpy as np
 
-from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \
-    target as _target
+from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \
+    target as _target, rpc as _rpc
 from ...contrib import nvcc, util, ndk
 
 from ..util import get_const_tuple
@@ -60,7 +60,7 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
         host = os.environ['TVM_TRACKER_HOST']
         port = int(os.environ['TVM_TRACKER_PORT'])
 
-    tracker = rpc.connect_tracker(host, port)
+    tracker = _rpc.connect_tracker(host, port)
     remote = tracker.request(device_key, priority=priority,
                              session_timeout=timeout)
     return remote
@@ -113,7 +113,7 @@ def create_measure_batch(task, option):
 
     measure_func = option['measure_func']
     number, repeat = option['number'], option['repeat']
-    timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork']
+    timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork']
     build_func = option['build_func']
     check_correctness = option['check_correctness']
     replay_db = option['replay_db']
@@ -134,7 +134,7 @@ def create_measure_batch(task, option):
                         use_popen=True, silent=True,
                         tracker_addr=(tracker.host, tracker.port))
 
-        measure_func = use_rpc(device_key, tracker.host, tracker.port)
+        measure_func = rpc(device_key, tracker.host, tracker.port)
         attach_objects = (server, tracker)
 
     build_kwargs = {}
@@ -218,18 +218,18 @@ def measure_batch(measure_inputs):
             return partial_results
         return results
 
-    measure_batch.parallel_num = parallel_num
+    measure_batch.n_parallel = n_parallel
     # attach server and tracker object to avoid them of being garbage-collected
     measure_batch.attach_objects = attach_objects
     return measure_batch
 
 
-def use_rpc(key,
-            host=None,
-            port=None,
-            priority=1,
-            session_timeout=60,
-            pack_size=1):
+def rpc(key,
+        host=None,
+        port=None,
+        priority=1,
+        session_timeout=60,
+        pack_size=1):
     """
     Create a standard measure_func which uses RPC Tracker for measurement.
     This measure_func will request a device from the RPC Tracker and
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 5d1fc1507e58..91004cba4603 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -85,7 +85,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             every measurement pair. See autotvm/tuner/callback.py for some examples.
         """
         measure_batch = create_measure_batch(self.task, measure_option)
-        parallel_num = getattr(measure_batch, 'parallel_num', 1)
+        n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
         old_level = logger.level
 
@@ -95,7 +95,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             if not self.has_next():
                 break
 
-            configs = self.next_batch(min(parallel_num, n_trial - i))
+            configs = self.next_batch(min(n_parallel, n_trial - i))
 
             inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
             results = measure_batch(inputs)
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 179ac811ab70..375d1a9b755e 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -168,7 +168,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 # run 8 parallel threads for compilation
 measure_option = autotvm.measure_option('local',
                                         number=5,
-                                        parallel_num=8,
+                                        n_parallel=8,
                                         timeout=20)
 
 # begin tuning, log records to file `conv2d.log`
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index d11823f204e1..f3d1c62bdaf2 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -191,9 +191,9 @@ def get_network(name, batch_size):
    'early_stopping': 250,
 
    'measure_option': autotvm.measure_option(
-       autotvm.use_rpc(device_key, host='localhost', port=9190),
+       autotvm.measure.rpc(device_key, host='localhost', port=9190),
        number=4,
-       parallel_num=1,
+       n_parallel=1,
        timeout=10,
        build_func='ndk' if use_android else 'default',
    ),
@@ -205,7 +205,7 @@ def get_network(name, batch_size):
 #
 #   In general, the default value provided here works well. It is the same
 #   value that we used to generate pre-tuned parameters.
-#   If you have multiple devices, you can set :code:`parallel_num` to
+#   If you have multiple devices, you can set :code:`n_parallel` to
 #   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
 #   boards to the tracker).
 #   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,

From e571a80f584f007695a217638f124a568010314f Mon Sep 17 00:00:00 2001
From: Hao Jin <haojin2@users.noreply.github.com>
Date: Fri, 10 Aug 2018 18:01:55 -0400
Subject: [PATCH 005/529] update dmlc-core for security reason (#1584)

---
 dmlc-core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dmlc-core b/dmlc-core
index e864aa6757cd..4f0564ec7694 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3
+Subproject commit 4f0564ec769477c66d480dd966088f172050c874

From 545d10c617d8f0eb082ddb0854edb700a82495db Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Fri, 10 Aug 2018 15:02:10 -0700
Subject: [PATCH 006/529] DLPack Conversion API (#1573)

---
 include/tvm/runtime/c_runtime_api.h | 26 ++++++++++++
 include/tvm/runtime/ndarray.h       |  2 +-
 python/tvm/_ffi/ndarray.py          | 62 ++++++++++++++++++++++++++++-
 python/tvm/contrib/dlpack.py        | 43 ++++++++++++++++++++
 python/tvm/ndarray.py               |  2 +-
 src/runtime/ndarray.cc              | 36 +++++++++++++----
 tests/python/contrib/test_dlpack.py | 44 ++++++++++++++++++++
 7 files changed, 205 insertions(+), 10 deletions(-)
 create mode 100644 python/tvm/contrib/dlpack.py
 create mode 100644 tests/python/contrib/test_dlpack.py

diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 17d00bf479aa..dca0d5ed4a30 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -445,6 +445,32 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from,
                                TVMArrayHandle to,
                                TVMStreamHandle stream);
 
+/*!
+ * \brief Produce an array from the DLManagedTensor that shares data memory
+ * with the DLManagedTensor.
+ * \param from The source DLManagedTensor.
+ * \param out The output array handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayFromDLPack(DLManagedTensor* from,
+                               TVMArrayHandle* out);
+
+/*!
+ * \brief Produce a DLMangedTensor from the array that shares data memory with
+ * the array.
+ * \param from The source array.
+ * \param out The DLManagedTensor handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from,
+                             DLManagedTensor** out);
+
+/*!
+ * \brief Delete (free) a DLManagedTensor's data.
+ * \param dltensor Pointer to the DLManagedTensor. 
+ */
+TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
+
 /*!
  * \brief Create a new runtime stream.
  *
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 2b51b2e0fcfe..d3ecce8ba9d0 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -155,7 +155,7 @@ class NDArray {
    * that is DLPack compatible.
    *
    * The memory is retained until the NDArray went out of scope.
-   *
+   * \param tensor The DLPack tensor to copy from.
    * \return The created NDArray view.
    */
   TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor);
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index 3788c07ac440..d994d7c2e4a5 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -5,7 +5,7 @@
 import sys
 import ctypes
 import numpy as np
-from .base import _LIB, check_call, c_array, string_types, _FFI_MODE
+from .base import _LIB, check_call, c_array, string_types, _FFI_MODE, c_str
 from .runtime_ctypes import TVMType, TVMContext, TVMArray, TVMArrayHandle
 from .runtime_ctypes import TypeCode, tvm_shape_index_t
 
@@ -28,6 +28,17 @@
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
 
 
+TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+_c_str_dltensor = c_str('dltensor')
+
+
+# used for PyCapsule manipulation
+if hasattr(ctypes, 'pythonapi'):
+    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
+
+
 def context(dev_type, dev_id=0):
     """Construct a TVM context with given device type and id.
 
@@ -62,6 +73,7 @@ def context(dev_type, dev_id=0):
         dev_type = TVMContext.STR2MASK[dev_type]
     return TVMContext(dev_type, dev_id)
 
+
 def numpyasarray(np_data):
     """Return a TVMArray representation of a numpy array.
     """
@@ -112,6 +124,42 @@ def empty(shape, dtype="float32", ctx=context(1, 0)):
         ctypes.byref(handle)))
     return _make_array(handle, False)
 
+
+def from_dlpack(dltensor):
+    """Produce an array from a DLPack tensor without memory copy.
+    Retreives the underlying DLPack tensor's pointer to create an array from the
+    data. Removes the original DLPack tensor's destructor as now the array is
+    responsible for destruction.
+
+    Parameters
+    ----------
+    dltensor : DLPack tensor
+
+    Returns
+    -------
+    arr: tvm.nd.NDArray
+        The array view of the tensor data.
+    """
+    dltensor = ctypes.py_object(dltensor)
+    name = ctypes.pythonapi.PyCapsule_GetName(dltensor)
+    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name)
+    handle = TVMArrayHandle()
+    check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
+    ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None)
+    return _make_array(handle, False)
+
+
+def _dlpack_deleter(pycapsule):
+    pycapsule = ctypes.py_object(pycapsule)
+    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        _LIB.TVMDLManagedTensorCallDeleter(ptr)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+
+
+_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+
+
 class NDArrayBase(_NDArrayBase):
     """A simple Device/CPU Array object in runtime."""
     @property
@@ -260,6 +308,18 @@ def copyto(self, target):
             raise ValueError("Unsupported target type %s" % str(type(target)))
         return target
 
+    def to_dlpack(self):
+        """Produce an array from a DLPack Tensor without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        handle = ctypes.c_void_p()
+        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
+        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
+
+
 def free_extension_handle(handle, type_code):
     """Free c++ extension type handle
 
diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py
new file mode 100644
index 000000000000..11db29f98b3e
--- /dev/null
+++ b/python/tvm/contrib/dlpack.py
@@ -0,0 +1,43 @@
+"""Wrapping functions to bridge frameworks with DLPack support to TVM"""
+from .. import ndarray
+
+def convert_func(tvm_func, tensor_type, to_dlpack_func):
+    """Convert a tvm function into one that accepts a tensor from another
+       framework, provided the other framework supports DLPACK
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    tensor_type: Type
+        Type of the tensors of the target framework
+
+    to_dlpack_func: Function
+        Function to convert the source tensors to DLPACK
+    """
+    assert callable(tvm_func)
+
+    def _wrapper(*args):
+        args = tuple(ndarray.from_dlpack(to_dlpack_func(arg))\
+            if isinstance(arg, tensor_type) else arg for arg in args)
+        return tvm_func(*args)
+
+    return _wrapper
+
+def to_pytorch_func(tvm_func):
+    """Convert a tvm function into one that accepts PyTorch tensors
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    Returns
+    -------
+    wrapped_func: Function
+        Wrapped tvm function that operates on PyTorch tensors
+    """
+    import torch
+    import torch.utils.dlpack
+    return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack)
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index 18e958973d94..448e5f6d8bdb 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -8,7 +8,7 @@
 import numpy as _np
 
 from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase
-from ._ffi.ndarray import context, empty
+from ._ffi.ndarray import context, empty, from_dlpack
 from ._ffi.ndarray import _set_class_ndarray
 from ._ffi.ndarray import register_extension, free_extension_handle
 
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index f862f32f6e99..424a2b09cb15 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -93,6 +93,16 @@ struct NDArray::Internal {
     arr.data_ = nullptr;
     return tensor;
   }
+  // Container to DLManagedTensor
+  static DLManagedTensor* ToDLPack(NDArray::Container* from) {
+    CHECK(from != nullptr);
+    DLManagedTensor* ret = new DLManagedTensor();
+    ret->dl_tensor = from->dl_tensor;
+    ret->manager_ctx = from;
+    from->IncRef();
+    ret->deleter = NDArrayDLPackDeleter;
+    return ret;
+  }
 };
 
 NDArray NDArray::CreateView(std::vector<int64_t> shape,
@@ -115,13 +125,7 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape,
 }
 
 DLManagedTensor* NDArray::ToDLPack() const {
-  CHECK(data_ != nullptr);
-  DLManagedTensor* ret = new DLManagedTensor();
-  ret->dl_tensor = data_->dl_tensor;
-  ret->manager_ctx = const_cast<NDArray*>(this);
-  data_->IncRef();
-  ret->deleter = NDArrayDLPackDeleter;
-  return ret;
+  return Internal::ToDLPack(data_);
 }
 
 NDArray NDArray::Empty(std::vector<int64_t> shape,
@@ -213,6 +217,24 @@ int TVMArrayCopyFromTo(TVMArrayHandle from,
   API_END();
 }
 
+int TVMArrayFromDLPack(DLManagedTensor* from,
+                       TVMArrayHandle* out) {
+  API_BEGIN();
+  *out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from));
+  API_END();
+}
+
+int TVMArrayToDLPack(TVMArrayHandle from,
+                     DLManagedTensor** out) {
+  API_BEGIN();
+  *out = NDArray::Internal::ToDLPack(reinterpret_cast<NDArray::Container*>(from));
+  API_END();
+}
+
+void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) {
+  (*(dltensor->deleter))(dltensor);
+}
+
 int TVMArrayCopyFromBytes(TVMArrayHandle handle,
                           void* data,
                           size_t nbytes) {
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
new file mode 100644
index 000000000000..9a8ea34e69d5
--- /dev/null
+++ b/tests/python/contrib/test_dlpack.py
@@ -0,0 +1,44 @@
+import tvm
+import numpy as np
+from tvm.contrib.dlpack import to_pytorch_func
+
+def test():
+    a = np.random.randn(1337)
+    tvm_a = tvm.nd.array(a)
+    np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).asnumpy(), a)
+
+    try:
+        import torch
+        import torch.utils.dlpack
+
+        x = torch.rand(56, 56)
+        tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x))
+        np.testing.assert_equal(x.numpy(), tvm_x.asnumpy())
+        y = tvm.nd.from_dlpack(tvm_x.to_dlpack())
+        np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy())
+        np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.asnumpy())
+
+        n = tvm.convert(137)
+        xx = torch.rand(137,137)
+        yy = torch.rand(137,137)
+        zz2 = torch.empty(137,137)
+        zz = xx.mm(yy)
+        XX = tvm.placeholder((n,n), name='X')
+        YY = tvm.placeholder((n,n), name='Y')
+
+        k = tvm.reduce_axis((0, n), name='k')
+        ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k))
+        s = tvm.create_schedule(ZZ.op)
+        f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f')
+
+        f_pytorch = to_pytorch_func(f)
+        zz2 = torch.empty(137,137)
+        f_pytorch(xx, yy, zz2)
+        np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
+
+    except ImportError:
+        pass
+
+
+if __name__ ==  '__main__':
+    test()

From 7009295e538a1775b3b4b34d871232132cd0479b Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Fri, 10 Aug 2018 19:04:46 -0700
Subject: [PATCH 007/529] use phone EditText for numerical fields (#1587)

---
 apps/android_rpc/app/src/main/res/layout/content_main.xml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 0f2564833ecd..82be44d98451 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -20,6 +20,7 @@
             android:hint="@string/input_address"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 
@@ -37,6 +38,7 @@
             android:minWidth="100dip"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 

From 77dc1c446832a8c70b005690c744e00ff9bcf00a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 11 Aug 2018 09:15:05 -0700
Subject: [PATCH 008/529] [RUNTIME] Refactor to enable stackvm in runtime.
 (#1588)

---
 CMakeLists.txt                                |  12 +-
 Jenkinsfile                                   |   1 +
 cmake/config.cmake                            |   3 +
 include/tvm/ir.h                              |  20 +--
 include/tvm/runtime/util.h                    |  28 +++-
 python/tvm/module.py                          |   9 +-
 src/codegen/codegen.cc                        |   1 -
 src/codegen/stack_vm/stack_vm_module.cc       |  71 ----------
 .../codegen_stackvm.cc}                       |  23 +++-
 .../codegen_stackvm.h}                        |  10 +-
 .../stackvm/stackvm.cc}                       |  71 +++++++---
 .../stack_vm.h => runtime/stackvm/stackvm.h}  |  55 +++++---
 src/runtime/stackvm/stackvm_module.cc         | 128 ++++++++++++++++++
 src/runtime/stackvm/stackvm_module.h          |  27 ++++
 tests/python/unittest/test_module_load.py     |  22 ++-
 15 files changed, 337 insertions(+), 144 deletions(-)
 delete mode 100644 src/codegen/stack_vm/stack_vm_module.cc
 rename src/codegen/{stack_vm/codegen_stack_vm.cc => stackvm/codegen_stackvm.cc} (95%)
 rename src/codegen/{stack_vm/codegen_stack_vm.h => stackvm/codegen_stackvm.h} (95%)
 rename src/{codegen/stack_vm/stack_vm.cc => runtime/stackvm/stackvm.cc} (90%)
 rename src/{codegen/stack_vm/stack_vm.h => runtime/stackvm/stackvm.h} (89%)
 create mode 100644 src/runtime/stackvm/stackvm_module.cc
 create mode 100644 src/runtime/stackvm/stackvm_module.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39776d53d1f1..572f4aef1432 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ tvm_option(USE_ROCM "Build with ROCM" OFF)
 tvm_option(ROCM_PATH "The path to rocm" /opt/rocm)
 tvm_option(USE_RPC "Build with RPC" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
+tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
 tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
@@ -97,7 +98,6 @@ file(GLOB COMPILER_SRCS
     src/arithmetic/*.cc
     src/autotvm/*.cc
     src/codegen/*.cc
-    src/codegen/stack_vm/*.cc
     src/lang/*.cc
     src/pass/*.cc
     src/op/*.cc
@@ -135,6 +135,16 @@ if(USE_RPC)
   list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS})
 endif(USE_RPC)
 
+file(GLOB STACKVM_RUNTIME_SRCS src/runtime/stackvm/*.cc)
+file(GLOB STACKVM_CODEGEN_SRCS src/codegen/stackvm/*.cc)
+list(APPEND COMPILER_SRCS ${STACKVM_CODEGEN_SRCS})
+if(USE_STACKVM_RUNTIME)
+  message(STATUS "Build with stackvm support in runtime...")
+  list(APPEND RUNTIME_SRCS ${STACKVM_RUNTIME_SRCS})
+else()
+  list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS})
+endif(USE_STACKVM_RUNTIME)
+
 if(USE_GRAPH_RUNTIME)
   message(STATUS "Build with Graph runtime support...")
   file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
diff --git a/Jenkinsfile b/Jenkinsfile
index bec0d2be5df8..2ecf3c59f8aa 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -96,6 +96,7 @@ stage('Build') {
            echo set\\(USE_RPC ON\\) >> config.cmake
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
+           echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
            echo set\\(USE_BLAS openblas\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 85c5102169a9..c364a88cce11 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -65,6 +65,9 @@ set(USE_OPENGL OFF)
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
+# Whether embed stackvm into the runtime
+set(USE_STACKVM_RUNTIME OFF)
+
 # Whether enable tiny embedded graph runtime.
 set(USE_GRAPH_RUNTIME ON)
 
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 9ea16131188d..646824332902 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -12,6 +12,7 @@
 #include <string>
 #include "./base.h"
 #include "./expr.h"
+#include "./runtime/util.h"
 
 namespace tvm {
 namespace ir {
@@ -449,25 +450,6 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit";
  */
 constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce";
 
-/*! \brief The kind of structure field info */
-enum TVMStructFieldKind : int {
-  // array head address
-  kArrAddr,
-  kArrData,
-  kArrShape,
-  kArrStrides,
-  kArrNDim,
-  kArrTypeCode,
-  kArrTypeBits,
-  kArrTypeLanes,
-  kArrByteOffset,
-  kArrDeviceId,
-  kArrDeviceType,
-  kArrKindBound_,
-  // TVMValue field
-  kTVMValueContent,
-  kTVMValueKindBound_
-};
 }   // namespace intrinsic
 
 // Reuse IR node defintiion from HalideIR
diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 160642ffcc85..7fa62be912be 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -21,7 +21,33 @@ namespace runtime {
 inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) {
   return t.code == code && t.bits == bits && t.lanes == lanes;
 }
-
 }  // namespace runtime
 }  // namespace tvm
+// Forward declare the intrinsic id we need
+// in structure fetch to enable stackvm in runtime
+namespace tvm {
+namespace ir {
+namespace intrinsic {
+/*! \brief The kind of structure field info used in intrinsic */
+enum TVMStructFieldKind : int {
+  // array head address
+  kArrAddr,
+  kArrData,
+  kArrShape,
+  kArrStrides,
+  kArrNDim,
+  kArrTypeCode,
+  kArrTypeBits,
+  kArrTypeLanes,
+  kArrByteOffset,
+  kArrDeviceId,
+  kArrDeviceType,
+  kArrKindBound_,
+  // TVMValue field
+  kTVMValueContent,
+  kTVMValueKindBound_
+};
+}  // namespace intrinsic
+}  // namespace ir
+}  // namespace tvm
 #endif  // TVM_RUNTIME_UTIL_H_
diff --git a/python/tvm/module.py b/python/tvm/module.py
index 1b83c9b26243..6cca6fb0f722 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -90,9 +90,12 @@ def export_library(self,
         kwargs : dict, optiona;
             Additional arguments passed to fcompile
         """
-        if self.type_key == "stacktvm":
-            raise ValueError("Module[%s]: export_library requires llvm module,"
-                             " did you build with LLVM enabled?" % self.type_key)
+        if self.type_key == "stackvm":
+            if not file_name.endswith(".stackvm"):
+                raise ValueError("Module[%s]: can only be saved as stackvm format."
+                                 "did you build with LLVM enabled?" % self.type_key)
+            self.save(file_name)
+            return
 
         if self.type_key != "llvm":
             raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key)
diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc
index 8bc7d238a866..12570e5881a9 100644
--- a/src/codegen/codegen.cc
+++ b/src/codegen/codegen.cc
@@ -40,7 +40,6 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) {
     CHECK_EQ(im->imports().size(), 0U)
         << "Only support simply one-level hierarchy";
     std::string tkey = im->type_key();
-    std::string bin;
     stream->Write(tkey);
     im->SaveToBinary(stream);
   }
diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc
deleted file mode 100644
index 731663deb448..000000000000
--- a/src/codegen/stack_vm/stack_vm_module.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file stack_vm_module.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/runtime/module.h>
-#include <tvm/codegen.h>
-#include "./codegen_stack_vm.h"
-
-namespace tvm {
-namespace codegen {
-
-class StackVMModuleNode : public runtime::ModuleNode {
- public:
-  const char* type_key() const {
-    return "stackvm";
-  }
-
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
-    if (name == runtime::symbol::tvm_module_main) {
-      return GetFunction(entry_func_, sptr_to_self);
-    }
-    auto it = fmap_.find(name);
-    if (it == fmap_.end()) return PackedFunc();
-    const StackVM& vm = it->second;
-    // capture sptr_to_self to keep module node alive.
-    return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
-        vm(args);
-      });
-  }
-
-  std::string GetSource(const std::string& format) final {
-    std::ostringstream os;
-    for (const auto& kv : fmap_) {
-      os << "Function: " << kv.first << '\n';
-      os << kv.second;
-    }
-    return os.str();
-  }
-
-  static runtime::Module Build(const Array<LoweredFunc>& funcs) {
-    CHECK_NE(funcs.size(), 0U);
-    std::shared_ptr<StackVMModuleNode> n =
-        std::make_shared<StackVMModuleNode>();
-    for (LoweredFunc f : funcs) {
-      StackVM vm = codegen::CodeGenStackVM().Compile(f);
-      CHECK(!n->fmap_.count(f->name))
-          << "Function name " << f->name << "already exist in list";
-      vm.mod_ctx = n.get();
-      n->fmap_[f->name] = std::move(vm);
-    }
-    n->entry_func_ = funcs[0]->name;
-    return runtime::Module(n);
-  }
-
- private:
-  // entry function.
-  std::string entry_func_;
-  // internal function map
-  std::unordered_map<std::string, StackVM> fmap_;
-};
-
-TVM_REGISTER_API("codegen.build_stackvm")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = StackVMModuleNode::Build(args[0]);
-  });
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stackvm/codegen_stackvm.cc
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.cc
rename to src/codegen/stackvm/codegen_stackvm.cc
index 168e411fa6e2..517793ff14a3 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.cc
+++ b/src/codegen/stackvm/codegen_stackvm.cc
@@ -1,11 +1,12 @@
 /*!
  *  Copyright (c) 2017 by Contributors
- * \file codegen_stack_vm.cc
+ * \file codegen_stackvm.cc
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/packed_func_ext.h>
 #include <limits>
-#include "./codegen_stack_vm.h"
+#include "./codegen_stackvm.h"
+#include "../../runtime/stackvm/stackvm_module.h"
 
 namespace tvm {
 namespace codegen {
@@ -19,6 +20,7 @@ StackVM CodeGenStackVM::Compile(LoweredFunc f) {
     CHECK_EQ(static_cast<size_t>(vid), i);
   }
   this->Push(f->body);
+  vm_.InitCache();
   return std::move(vm_);
 }
 
@@ -486,5 +488,22 @@ void CodeGenStackVM::VisitExpr_(const Let *op) {
   this->PushOp(StackVM::STORE_HEAP, static_cast<int>(vid));
   this->Push(op->body);
 }
+
+runtime::Module BuildStackVM(const Array<LoweredFunc>& funcs) {
+  CHECK_NE(funcs.size(), 0U);
+  std::unordered_map<std::string, StackVM> fmap;
+  for (LoweredFunc f : funcs) {
+    StackVM vm = codegen::CodeGenStackVM().Compile(f);
+    CHECK(!fmap.count(f->name))
+        << "Function name " << f->name << "already exist in list";
+    fmap[f->name] = std::move(vm);
+  }
+  return runtime::StackVMModuleCreate(fmap, funcs[0]->name);
+}
+
+TVM_REGISTER_API("codegen.build_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildStackVM(args[0]);
+  });
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.h b/src/codegen/stackvm/codegen_stackvm.h
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.h
rename to src/codegen/stackvm/codegen_stackvm.h
index 089284529242..23bd61dcb4c2 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.h
+++ b/src/codegen/stackvm/codegen_stackvm.h
@@ -3,8 +3,8 @@
  * \file codegen_stack_vm.h
  * \brief Codegen into Simple Stack VM.
  */
-#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#ifndef TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
+#define TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
 
 #include <tvm/ir.h>
 #include <tvm/ir_functor_ext.h>
@@ -14,12 +14,14 @@
 #include <vector>
 #include <unordered_map>
 
-#include "./stack_vm.h"
+#include "../../runtime/stackvm/stackvm.h"
 
 namespace tvm {
 namespace codegen {
 
 using namespace ir;
+using runtime::StackVM;
+
 /*!
  * \brief A base class to generate a stack VM.
  *  This module is used to generate host wrapper
@@ -145,4 +147,4 @@ class CodeGenStackVM
 
 }  // namespace codegen
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#endif  // TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/runtime/stackvm/stackvm.cc
similarity index 90%
rename from src/codegen/stack_vm/stack_vm.cc
rename to src/runtime/stackvm/stackvm.cc
index 95feeae3679e..f86bfec087e4 100644
--- a/src/codegen/stack_vm/stack_vm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -1,15 +1,16 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * Implementation stack VM.
- * \file stack_vm.cc
+ * \file stackvm.cc
  */
 #include <dmlc/thread_local.h>
-#include <tvm/ir.h>
+#include <tvm/runtime/util.h>
 #include <tvm/runtime/c_backend_api.h>
-#include "./stack_vm.h"
+#include <algorithm>
+#include "./stackvm.h"
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 typedef dmlc::ThreadLocalStore<StackVM::State> StackVMStateStore;
 
@@ -172,28 +173,64 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) {  // NOLINT(*)
   return os;
 }
 
-void StackVM::operator()(const runtime::TVMArgs& args) const {
+void StackVM::Run(const runtime::TVMArgs& args,
+                  runtime::ModuleNode* mod_ctx) const {
   StackVM::State* s = StackVM::ThreadLocalState();
+  if (s->heap.size() < heap_size) {
+    s->heap.resize(heap_size);
+  }
   s->sp = 0;
   s->pc = 0;
-  if (s->heap.size() < this->heap_size) {
-    s->heap.resize(this->heap_size);
-  }
-
+  s->mod_ctx = mod_ctx;
   s->heap[0].v_handle = (void*)args.values;  // NOLINT(*)
   s->heap[1].v_handle = (void*)args.type_codes;  // NOLINT(*)
   s->heap[2].v_int64 = args.num_args;
   this->Run(s);
 }
 
+void StackVM::InitCache() {
+  extern_func_cache_.clear();
+  extern_func_cache_.resize(
+      extern_func_name.size(), PackedFunc(nullptr));
+}
+
+void StackVM::Save(dmlc::Stream* strm) const {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy(code.size());
+  std::transform(code.begin(), code.end(), code_copy.begin(), [](Code c) {
+      return c.v_int;
+    });
+  strm->Write(code_copy);
+  strm->Write(str_data);
+  strm->Write(extern_func_name);
+  strm->Write(heap_id_name);
+  strm->Write(heap_size);
+  strm->Write(stack_size);
+}
+
+bool StackVM::Load(dmlc::Stream* strm)  {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy;
+  if (!strm->Read(&code_copy)) return false;
+  code.resize(code_copy.size());
+  std::transform(code_copy.begin(), code_copy.end(), code.begin(), [](int v) {
+      Code code; code.v_int = v; return code;
+    });
+  if (!strm->Read(&str_data)) return false;
+  if (!strm->Read(&extern_func_name)) return false;
+  if (!strm->Read(&heap_id_name)) return false;
+  if (!strm->Read(&heap_size)) return false;
+  if (!strm->Read(&stack_size)) return false;
+  this->InitCache();
+  return true;
+}
+
 void StackVM::Run(State* s) const {
   int64_t sp = s->sp;
   int64_t pc = s->pc;
   int64_t alloca_sp = s->sp;
   std::vector<TVMValue>& stack = s->stack;
   std::vector<TVMValue>& heap = s->heap;
-  s->extern_func.clear();
-  s->extern_func.resize(extern_func_name.size());
   if (stack.size() < stack_size) {
     stack.resize(stack_size);
   }
@@ -488,17 +525,19 @@ void StackVM::Run(State* s) const {
 }
 
 const PackedFunc& StackVM::GetExtern(State* s, int fid) const {
-  PackedFunc& f = s->extern_func[fid];
+  CHECK_LT(static_cast<size_t>(fid), extern_func_cache_.size());
+  // allow race write in this, since write is idempotent
+  PackedFunc& f = extern_func_cache_[fid];
   if (f == nullptr) {
-    CHECK(mod_ctx != nullptr)
+    CHECK(s->mod_ctx != nullptr)
         << "No local context is set in stackvm";
-    const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
+    CHECK(s->mod_ctx != nullptr);
+    const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
     CHECK(pf != nullptr);
     f = *pf;
-    CHECK(f != nullptr);
   }
   return f;
 }
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/stack_vm.h b/src/runtime/stackvm/stackvm.h
similarity index 89%
rename from src/codegen/stack_vm/stack_vm.h
rename to src/runtime/stackvm/stackvm.h
index 54972d39a5df..b2ce975b2c73 100644
--- a/src/codegen/stack_vm/stack_vm.h
+++ b/src/runtime/stackvm/stackvm.h
@@ -1,36 +1,36 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file stack_vm.h
+ * \file stackvm.h
  * \brief A simple stack-based virtual machine.
  *
  *  This can be used to interepret host side code
  *  to setup calls into device functions
  *  when only Runtime compilation for device is available(via NVRTC or OpenCL).
  */
-#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_STACK_VM_H_
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_H_
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/module.h>
-#include <tvm/packed_func_ext.h>
 #include <string>
 #include <vector>
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 using runtime::operator<<;
 /*!
- * \brief A simple stack-based virtual machine.
+ * \brief A simple stack-based virtual machine program.
  */
 class StackVM {
  public:
   /*!
-   * \brief Invoke the StackVM as PackedFunc
+   * \brief Invoke the StackVM program.
    * \param args The arguments to the StackVM.
+   * \param mod_ctx The module context used in running.
    */
-  void operator()(const TVMArgs& args) const;
+  void Run(const TVMArgs& args, runtime::ModuleNode* mod_ctx) const;
   /*!
    * \brief The opcode of stack vm
    * \note Notation
@@ -276,21 +276,25 @@ class StackVM {
     std::vector<TVMValue> stack;
     /*! \brief The global heap space */
     std::vector<TVMValue> heap;
-    /*! \brief extern functions */
-    std::vector<PackedFunc> extern_func;
     /*! \brief stack pointer  */
     int64_t sp{0};
     /*! \brief program counter */
     int64_t pc{0};
+    /*! \brief The current module context of stackvm */
+    runtime::ModuleNode* mod_ctx{nullptr};
   };
-  /*! \brief The external function entries. */
-  struct ExternFuncEntry {
-    std::string name;
-    runtime::PackedFunc func;
-  };
-
-  /*! \brief execute the stack vm with given state */
-  void Run(State* state) const;
+  /*! \brief Initialize local cache*/
+  void InitCache();
+  /*!
+   * \brief Save stackvm program to an output stream
+   * \param strm The output stream
+   */
+  void Save(dmlc::Stream* strm) const;
+  /*!
+   * \brief Load stackvm program from output stream
+   * \param strm The output stream
+   */
+  bool Load(dmlc::Stream* strm);
   /*!
    * \brief Print instruction at location pc
    * \param os The ostream
@@ -300,12 +304,11 @@ class StackVM {
   int64_t PrintCode(std::ostream&os, int64_t pc) const;  // NOLINT(*)
   /*! \brief Get thread local state of the stack VM */
   static State* ThreadLocalState();
+  // The code below are programs
   /*! \brief The instructions */
   std::vector<Code> code;
   /*! \brief constant error messages */
   std::vector<std::string> str_data;
-  /*! \brief The current module context of stackvm */
-  runtime::ModuleNode* mod_ctx{nullptr};
   /*! \brief Extern functions */
   std::vector<std::string> extern_func_name;
   /*! \brief name of each heap id */
@@ -385,10 +388,18 @@ class StackVM {
   friend std::ostream& operator<<(std::ostream& os, const StackVM& vm);  // NOLINT(*)
 
  private:
+  //  execute the stack vm with given state
+  void Run(State* state) const;
   // get extern function.
   const PackedFunc& GetExtern(State* s, int fid) const;
+  // cached extern function
+  mutable std::vector<PackedFunc> extern_func_cache_;
 };
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_STACK_VM_H_
+
+namespace dmlc {
+DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::StackVM, true);
+}
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_H_
diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc
new file mode 100644
index 000000000000..71ca9ba6c09a
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.cc
@@ -0,0 +1,128 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file stackvm_module.cc
+ */
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/module.h>
+#include <dmlc/memory_io.h>
+#include "./stackvm_module.h"
+#include "../file_util.h"
+#include "../module_util.h"
+
+namespace tvm {
+namespace runtime {
+
+class StackVMModuleNode : public runtime::ModuleNode {
+ public:
+  const char* type_key() const {
+    return "stackvm";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    if (name == runtime::symbol::tvm_module_main) {
+      return GetFunction(entry_func_, sptr_to_self);
+    }
+    auto it = fmap_.find(name);
+    if (it == fmap_.end()) return PackedFunc();
+    const StackVM& vm = it->second;
+    // capture sptr_to_self to keep module node alive.
+    return PackedFunc([vm, sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        vm.Run(args, this);
+      });
+  }
+
+  std::string GetSource(const std::string& format) final {
+    std::ostringstream os;
+    for (const auto& kv : fmap_) {
+      os << "Function: " << kv.first << '\n';
+      os << kv.second;
+    }
+    return os.str();
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string data, mblob;
+    dmlc::MemoryStringStream writer(&data);
+    dmlc::Stream* strm = &writer;
+    strm->Write(fmap_);
+    strm->Write(entry_func_);
+    // also save imports
+    uint64_t num_imports = static_cast<uint64_t>(imports_.size());
+    strm->Write(num_imports);
+
+    for (runtime::Module im : imports_) {
+      CHECK_EQ(im->imports().size(), 0U)
+          << "Only support simply one-level hierarchy";
+      std::string tkey = im->type_key();
+      strm->Write(tkey);
+      LOG(INFO) << "save " << tkey;
+      im->SaveToBinary(strm);
+      LOG(INFO) << "FInish save " << tkey;
+    }
+    SaveBinaryToFile(file_name, data);
+  }
+
+  static Module Create(std::unordered_map<std::string, StackVM> fmap,
+                       std::string entry_func) {
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    return Module(n);
+  }
+
+  static Module Load(dmlc::Stream* strm) {
+    std::unordered_map<std::string, StackVM> fmap;
+    std::string entry_func, data;
+    strm->Read(&fmap);
+    strm->Read(&entry_func);
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    uint64_t num_imports;
+    strm->Read(&num_imports);
+    for (uint64_t i = 0; i < num_imports; ++i) {
+      std::string tkey;
+      CHECK(strm->Read(&tkey));
+      std::string fkey = "module.loadbinary_" + tkey;
+      const PackedFunc* f = Registry::Get(fkey);
+      CHECK(f != nullptr)
+          << "Loader of " << tkey << "("
+          << fkey << ") is not presented.";
+      Module m = (*f)(static_cast<void*>(strm));
+      n->imports_.emplace_back(std::move(m));
+    }
+    return Module(n);
+  }
+
+  static Module LoadFromFile(std::string file_name,
+                             std::string format) {
+    std::string data;
+    LoadBinaryFromFile(file_name, &data);
+    dmlc::MemoryStringStream reader(&data);
+    return Load(&reader);
+  }
+
+ private:
+  // internal function map
+  std::unordered_map<std::string, StackVM> fmap_;
+  // entry function.
+  std::string entry_func_;
+};
+
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func) {
+  return StackVMModuleNode::Create(fmap, entry_func);
+}
+
+TVM_REGISTER_GLOBAL("module.loadfile_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]);
+  });
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h
new file mode 100644
index 000000000000..fcd51a64f870
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.h
@@ -0,0 +1,27 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file stackvm_module.h
+ * \brief StackVM module
+ */
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+
+#include <tvm/runtime/packed_func.h>
+#include <string>
+#include "./stackvm.h"
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief create a stackvm module
+ *
+ * \param fmap The map from name to function
+ * \param entry_func The entry function name.
+ * \return The created module
+ */
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func);
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py
index 1b239a357f66..8ee3ea5e06c0 100644
--- a/tests/python/unittest/test_module_load.py
+++ b/tests/python/unittest/test_module_load.py
@@ -109,11 +109,25 @@ def check_device(device):
             f2[name](a, b)
             np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    check_device("cuda")
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("metal")
+    def check_stackvm(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        temp = util.tempdir()
+        name = "myadd_%s" % device
+        f = tvm.build(s, [A, B], device, "stackvm", name=name)
+        path_dso = temp.relpath("dev_lib.stackvm")
+        #f.export_library(path_dso)
+        #f1 = tvm.module.load(path_dso)
+        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        f(a, b)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    for device in ["cuda", "vulkan", "opencl", "metal"]:
+        check_device(device)
+        check_stackvm(device)
 
 def test_combine_module_llvm():
     """Test combine multiple module into one shared lib."""

From 5e5aec0a310de0646cd7c76c930456aeaad81f0e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 11 Aug 2018 15:31:15 -0700
Subject: [PATCH 009/529] [DLPACK] Enable cython support (#1589)

---
 HalideIR                            |  2 +-
 include/tvm/runtime/c_runtime_api.h |  2 +-
 python/tvm/_ffi/_ctypes/ndarray.py  | 49 ++++++++++++++++++++++++++++-
 python/tvm/_ffi/_cython/base.pxi    | 13 +++++++-
 python/tvm/_ffi/_cython/ndarray.pxi | 38 ++++++++++++++++++++++
 python/tvm/_ffi/ndarray.py          | 48 +++-------------------------
 tests/scripts/task_python_nnvm.sh   |  4 +++
 tests/scripts/task_python_topi.sh   |  4 +++
 8 files changed, 113 insertions(+), 47 deletions(-)

diff --git a/HalideIR b/HalideIR
index a5a80bdc8232..a0b9563f4571 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit a5a80bdc8232c9dbfe508bb5c46e8f58cdf7ec20
+Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index dca0d5ed4a30..32d574340052 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -467,7 +467,7 @@ TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from,
 
 /*!
  * \brief Delete (free) a DLManagedTensor's data.
- * \param dltensor Pointer to the DLManagedTensor. 
+ * \param dltensor Pointer to the DLManagedTensor.
  */
 TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
 
diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py
index df877679fc7d..8b88e7dc98ea 100644
--- a/python/tvm/_ffi/_ctypes/ndarray.py
+++ b/python/tvm/_ffi/_ctypes/ndarray.py
@@ -1,11 +1,47 @@
+# pylint: disable=invalid-name
 """Runtime NDArray api"""
 from __future__ import absolute_import
 
 import ctypes
-from ..base import _LIB, check_call
+from ..base import _LIB, check_call, c_str
 from ..runtime_ctypes import TVMArrayHandle
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle
 
+
+TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+_c_str_dltensor = c_str('dltensor')
+_c_str_used_dltensor = c_str('used_dltensor')
+
+
+# used for PyCapsule manipulation
+if hasattr(ctypes, 'pythonapi'):
+    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
+
+
+def _from_dlpack(dltensor):
+    dltensor = ctypes.py_object(dltensor)
+    if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        handle = TVMArrayHandle()
+        check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
+        ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+        return _make_array(handle, False)
+    raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once")
+
+
+def _dlpack_deleter(pycapsule):
+    pycapsule = ctypes.cast(pycapsule, ctypes.py_object)
+    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        _LIB.TVMDLManagedTensorCallDeleter(ptr)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+
+_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+
+
 class NDArrayBase(object):
     """A simple Device/CPU Array object in runtime."""
     __slots__ = ["handle", "is_view"]
@@ -29,6 +65,17 @@ def __del__(self):
     def _tvm_handle(self):
         return ctypes.cast(self.handle, ctypes.c_void_p).value
 
+    def to_dlpack(self):
+        """Produce an array from a DLPack Tensor without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        handle = ctypes.c_void_p()
+        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
+        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
+
 
 def _make_array(handle, is_view):
     handle = ctypes.cast(handle, TVMArrayHandle)
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index 50a99245f793..00173c431bb7 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -1,6 +1,7 @@
 from ..base import TVMError
 from libcpp.vector cimport vector
 from cpython.version cimport PY_MAJOR_VERSION
+from cpython cimport pycapsule
 from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t
 import ctypes
 
@@ -40,6 +41,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         int64_t* strides
         uint64_t byte_offset
 
+    ctypedef struct DLManagedTensor:
+        DLTensor dl_tensor
+        void* manager_ctx
+        void (*deleter)(DLManagedTensor* self)
+
     ctypedef struct TVMValue:
         int64_t v_int64
         double v_float64
@@ -49,7 +55,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         DLContext v_ctx
 
 ctypedef int64_t tvm_index_t
-ctypedef void* DLTensorHandle
+ctypedef DLTensor* DLTensorHandle
 ctypedef void* TVMStreamHandle
 ctypedef void* TVMRetValueHandle
 ctypedef void* TVMFunctionHandle
@@ -92,6 +98,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
     int TVMArrayCopyFromTo(DLTensorHandle src,
                            DLTensorHandle to,
                            TVMStreamHandle stream)
+    int TVMArrayFromDLPack(DLManagedTensor* arr_from,
+                           DLTensorHandle* out)
+    int TVMArrayToDLPack(DLTensorHandle arr_from,
+                         DLManagedTensor** out)
+    void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor)
 
 cdef extern from "tvm/c_dsl_api.h":
     int TVMNodeFree(NodeHandle handle)
diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi
index 44b0a544609d..0a507affec1c 100644
--- a/python/tvm/_ffi/_cython/ndarray.pxi
+++ b/python/tvm/_ffi/_cython/ndarray.pxi
@@ -1,5 +1,29 @@
 from ..runtime_ctypes import TVMArrayHandle
 
+cdef const char* _c_str_dltensor = "dltensor"
+cdef const char* _c_str_used_dltensor = "used_dltensor"
+
+
+cdef void _c_dlpack_deleter(object pycaps):
+    cdef DLManagedTensor* dltensor
+    if pycapsule.PyCapsule_IsValid(pycaps, _c_str_dltensor):
+        dltensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(pycaps, _c_str_dltensor)
+        TVMDLManagedTensorCallDeleter(dltensor)
+
+
+def _from_dlpack(object dltensor):
+    cdef DLManagedTensor* ptr
+    cdef DLTensorHandle chandle
+    if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        CALL(TVMArrayFromDLPack(ptr, &chandle))
+        # set name and destructor to be empty
+        pycapsule.PyCapsule_SetDestructor(dltensor, NULL)
+        pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        return c_make_array(chandle, 0)
+    raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only be consumed once")
+
+
 cdef class NDArrayBase:
     cdef DLTensor* chandle
     cdef int c_is_view
@@ -35,12 +59,26 @@ cdef class NDArrayBase:
         if self.c_is_view == 0:
             CALL(TVMArrayFree(self.chandle))
 
+    def to_dlpack(self):
+        """Produce an array from a DLPack Tensor without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        cdef DLManagedTensor* dltensor
+        if self.c_is_view != 0:
+            raise ValueError("to_dlpack do not work with memory views")
+        CALL(TVMArrayToDLPack(self.chandle, &dltensor))
+        return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter)
+
 
 cdef c_make_array(void* chandle, is_view):
     ret = _CLASS_NDARRAY(None, is_view)
     (<NDArrayBase>ret).chandle = <DLTensor*>chandle
     return ret
 
+
 cdef _TVM_COMPATS = ()
 
 cdef _TVM_EXT_RET = {}
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index d994d7c2e4a5..e49c3b62f473 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -17,28 +17,17 @@
     if _FFI_MODE == "ctypes":
         raise ImportError()
     if sys.version_info >= (3, 0):
-        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy3.core import NDArrayBase as _NDArrayBase
     else:
-        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy2.core import NDArrayBase as _NDArrayBase
 except IMPORT_EXCEPT:
     # pylint: disable=wrong-import-position
-    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array
+    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
 
 
-TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-_c_str_dltensor = c_str('dltensor')
-
-
-# used for PyCapsule manipulation
-if hasattr(ctypes, 'pythonapi'):
-    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
-    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
-    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
-
-
 def context(dev_type, dev_id=0):
     """Construct a TVM context with given device type and id.
 
@@ -134,30 +123,14 @@ def from_dlpack(dltensor):
     Parameters
     ----------
     dltensor : DLPack tensor
+        Input DLManagedTensor, can only be consumed once.
 
     Returns
     -------
     arr: tvm.nd.NDArray
         The array view of the tensor data.
     """
-    dltensor = ctypes.py_object(dltensor)
-    name = ctypes.pythonapi.PyCapsule_GetName(dltensor)
-    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name)
-    handle = TVMArrayHandle()
-    check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
-    ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None)
-    return _make_array(handle, False)
-
-
-def _dlpack_deleter(pycapsule):
-    pycapsule = ctypes.py_object(pycapsule)
-    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
-        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
-        _LIB.TVMDLManagedTensorCallDeleter(ptr)
-        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
-
-
-_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+    return _from_dlpack(dltensor)
 
 
 class NDArrayBase(_NDArrayBase):
@@ -308,17 +281,6 @@ def copyto(self, target):
             raise ValueError("Unsupported target type %s" % str(type(target)))
         return target
 
-    def to_dlpack(self):
-        """Produce an array from a DLPack Tensor without copying memory
-
-        Returns
-        -------
-        dlpack : DLPack tensor view of the array data
-        """
-        handle = ctypes.c_void_p()
-        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
-        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
-
 
 def free_extension_handle(handle, type_code):
     """Free c++ extension type handle
diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh
index 790073a2fe8b..cf6039d58416 100755
--- a/tests/scripts/task_python_nnvm.sh
+++ b/tests/scripts/task_python_nnvm.sh
@@ -4,6 +4,10 @@ export PYTHONPATH=nnvm/python:python:topi/python
 # to avoid openblas threading error
 export OMP_NUM_THREADS=1
 
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
+
 echo "Running unittest..."
 python -m nose -v nnvm/tests/python/unittest || exit -1
 python3 -m nose -v nnvm/tests/python/unittest || exit -1
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 13a324d79b1f..6842ddaae13a 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -1,4 +1,8 @@
 export PYTHONPATH=python:topi/python
 
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
+
 python -m nose -v topi/tests/python || exit -1
 python3 -m nose -v topi/tests/python || exit -1

From 0cdc7b3f2421378746f6a68b8db4ddf0f18ebf01 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 13 Aug 2018 09:35:46 -0700
Subject: [PATCH 010/529] Fixed bugs for SSD sorting and multbox detection
 (#1578)

---
 topi/python/topi/cuda/nms.py          | 480 ++++++++++++++++++++------
 topi/python/topi/cuda/ssd/multibox.py | 225 ++++++++----
 2 files changed, 534 insertions(+), 171 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 4d4e402de5c2..361208bf1cfb 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -7,19 +7,155 @@
 from topi.vision import nms
 
 
-def sort_ir(data, index, output, axis, is_descend):
-    """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU.
+def sort_pre_ir(index, sizes_out, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 1/4 for computing segments' staring locatons.
+
+    Parameters
+    ----------
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    sizes_out : Buffer
+        Output buffer of start locations of each sorting segment.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    p_index = ib.buffer_ptr(index)
+    dshape = sizes_out.shape
+    sizes = ib.buffer_ptr(sizes_out)
+    nthread_tx = max_threads
+    nthread_bx = dshape[0] // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+        sizes[tid] = p_index[tid]
+
+    # scan
+    with ib.if_scope(tid < 1):
+        with ib.for_range(0, axis_mul_before * axis_mul_after - 1, name="k") as k:
+            sizes[k + 1] += sizes[k]
+    body = ib.get()
+    return body
+
+
+def sort_pre_ir_data(data, index, sizes_in, data_out, index_out, \
+                     axis, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 2/4 for flattening data and indices into segmented format.
 
     Parameters
     ----------
     data: Buffer
-        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+        Buffer of output boxes with class and score.
 
     index : Buffer
-        Buffer of number of valid number of boxes.
+        Buffer of number of valid output boxes.
 
-    output : Buffer
-        Output buffer of indicies of sorted tensor.
+    sizes_in : Buffer
+        Buffer of start locations of each sorting segment.
+
+    data_out : Buffer
+        Buffer of flattened segmented data.
+
+    index_out : Buffer
+        Buffer of flattened segmented indices.
+
+    axis : int
+        The axis used for sorting.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.ir_builder.create()
+    sizes = ib.buffer_ptr(sizes_in)
+    p_index = ib.buffer_ptr(index)
+    p_data = ib.buffer_ptr(data)
+    data_new = ib.buffer_ptr(data_out)
+    index_new = ib.buffer_ptr(index_out)
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    dshape = tvm.max(sizes_in.shape[0], p_index[0])
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            current_sort_num = p_index[tid]
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, current_sort_num, name="k") as k:
+                full_idx = base_idx + k * axis_mul_after
+                with ib.if_scope(tid == 0):
+                    start = 0
+                with ib.else_scope():
+                    start = sizes[tid-1]
+                index_new[start + k] = k
+                data_new[start + k] = p_data[full_idx]
+    with ib.else_scope():
+        with ib.if_scope(tid == 0):
+            with ib.for_range(0, p_index[0], name="k") as k:
+                index_new[k] = k
+
+    body = ib.get()
+    return body
+
+def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \
+                axis_mul_after, axis, is_descend):
+    """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_data : Buffer
+        Buffer of flattened segmented data.
+
+    new_index : Buffer
+        Buffer of flattened segmented indices.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    out_index : Buffer
+        Output buffer of output box indexes sorted by score in a flattened segmented format.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
 
     axis : int
         The axis used for sorting.
@@ -32,15 +168,197 @@ def sort_ir(data, index, output, axis, is_descend):
     stmt : Stmt
         The result IR statement.
     """
-
     max_threads = int(
         tvm.target.current_target(allow_none=False).max_num_threads)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     ib = tvm.ir_builder.create()
+    dshape = loc.shape
+    fshape = data.shape[axis] * dshape[0]
+    temp_data = ib.allocate(
+        "float32", dshape, name="temp_data", scope="local")
     p_data = ib.buffer_ptr(data)
     p_index = ib.buffer_ptr(index)
+    data_new = ib.buffer_ptr(new_data)
+    index_new = ib.buffer_ptr(new_index)
+    index_out = ib.buffer_ptr(out_index)
+    sizes = ib.buffer_ptr(loc)
+    nthread_tx = max_threads
+    nthread_bx = fshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            with ib.if_scope(tid == 0):
+                start = 0
+            with ib.else_scope():
+                start = sizes[tid-1]
+            # OddEvenTransposeSort
+            with ib.for_range(0, p_index[tid], name="k") as k:
+                with ib.for_range(0, p_index[tid] - 1, name="i") as i:
+                    with ib.if_scope(i % 2 == k % 2):
+                        with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)):
+                            temp_data[tid] = data_new[i+start]
+                            data_new[i+start] = data_new[i+start+1]
+                            data_new[i+start+1] = temp_data[tid]
+                            index_out[tid] = index_new[i+start]
+                            index_new[i+start] = index_new[i+start+1]
+                            index_new[i+start+1] = index_out[tid]
+        with ib.if_scope(tid < 1):
+            with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i:
+                index_out[i] = index_new[i]
+    with ib.else_scope():
+        with ib.for_range(0, fshape, name="k", for_type="unroll") as k:
+            with ib.if_scope(tvm.all(k % 2 == tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \
+                                             == is_descend)):
+                        data_new[tid] = p_data[tid+1]
+                        index_out[tid] = index_new[tid+1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid+1]
+                        index_new[tid] = index_out[tid+1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+            with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)):
+                        data_new[tid] = p_data[tid-1]
+                        index_out[tid] = index_new[tid-1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid-1]
+                        index_new[tid] = index_out[tid-1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+        with ib.if_scope(fshape % 2 == 1):
+            with ib.if_scope(tid < 1):
+                with ib.for_range(0, fshape, name="k") as k:
+                    index_out[tid] = index_new[tid]
+    body = ib.get()
+    return body
+
+
+def sort_ir_out(data, index, new_index, loc, output, axis_mul_before, axis_mul_after, axis):
+    """Low level IR routing subfunction 4/4 for writing sorted indices to output format.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_index : Buffer
+        Buffer of sorted indices in a flatten format.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    output : Buffer
+        Output buffer of output box indexes sorted by score.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    axis : int
+        The axis used for sorting.
+
+    is_descend : bool
+        If the sorted data is in descending order.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    dshape = tvm.max(loc.shape[0], data.shape[axis])
+    p_index = ib.buffer_ptr(index)
+    index_new = ib.buffer_ptr(new_index)
+    sizes = ib.buffer_ptr(loc)
     p_out = ib.buffer_ptr(output)
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, data.shape[axis], name="k") as k:
+                with ib.if_scope(tid == 0):
+                    start = 0
+                with ib.else_scope():
+                    start = sizes[tid-1]
+                p_out[base_idx + k * axis_mul_after] = tvm.select(
+                    k < p_index[tid], index_new[k+start], k)
+    with ib.else_scope():
+        with ib.if_scope(tid < data.shape[axis]):
+            p_out[tid] = tvm.select(tid < p_index[0], index_new[tid], tid)
+
+    body = ib.get()
+    return body
+
+
+def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
+    """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu.
+
+    Parameters
+    ----------
+    data: tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors, 6].
+        The last dimension should be in format of
+        [class_id, score, box_left, box_top, box_right, box_bottom].
+
+    data_buf: Buffer
+        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+
+    index : tvm.Tensor
+        1-D tensor for valid number of boxes.
+
+    index_buf : Buffer
+        Buffer of number of valid number of boxes.
+
+    output_buf : Buffer
+        Output buffer of indicies of sorted tensor.
+
+    axis : int
+        The axis used for sorting.
+
+    is_descend : bool
+        If the sorted data is in descending order.
+
+    Returns
+    -------
+    out : tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors].
+    """
+
     ndim = len(data.shape)
     assert data.dtype == "float32", "Currently only supports input dtype to be float32"
     assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim
@@ -55,89 +373,60 @@ def sort_ir(data, index, output, axis, is_descend):
         elif i > axis:
             axis_mul_after *= data.shape[i]
 
-    dshape = 0
-    for i in range(0, len(index.shape)):
-        dshape += index.shape[i]
-    dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape,
-                        axis_mul_before*axis_mul_after)
-
-    sizes_temp = ib.allocate(
-        "int32", dshape, name="sizes_temp", scope="global")
-    sizes = ib.allocate("int32", dshape, name="sizes", scope="global")
-    temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local")
-    temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local")
-    data_new = ib.allocate("float32", dshape, name="data_new", scope="global")
-    index_new = ib.allocate("int32", dshape, name="index_new", scope="global")
-    nthread_tx = max_threads
-    nthread_bx = dshape // max_threads + 1
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        sizes[tid] = p_index[tid]
-        sizes_temp[tid] = p_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \
-             .astype("float32"))) + 1, name="k") as k:
-            with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0):
-                with ib.if_scope(k % 2 == 0):
-                    sizes[tid] += sizes_temp[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes_temp[tid] = sizes[tid]
-                with ib.else_scope():
-                    sizes_temp[tid] += sizes[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes[tid] = sizes_temp[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, current_sort_num, name="k") as k:
-            full_idx = base_idx + k * axis_mul_after
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            index_new[start + k] = k
-            data_new[start + k] = p_data[full_idx]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.if_scope(tid == 0):
-            start = 0
-        with ib.else_scope():
-            start = sizes[tid-1]
-        # OddEvenTransposeSort
-        with ib.for_range(0, p_index[tid], name="k") as k:
-            with ib.for_range(0, p_index[tid] - 1, name="i") as i:
-                with ib.if_scope(i % 2 == (k & 1)):
-                    with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^
-                                      is_descend) == False):
-                        temp_data[tid] = data_new[i+start]
-                        data_new[i+start] = data_new[i+start+1]
-                        data_new[i+start+1] = temp_data[tid]
-                        temp_index[tid] = index_new[i+start]
-                        index_new[i+start] = index_new[i+start+1]
-                        index_new[i+start+1] = temp_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, data.shape[axis], name="k") as k:
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            p_out[base_idx + k * axis_mul_after] = tvm.select(
-                k < current_sort_num,
-                index_new[k+start], k)
-    body = ib.get()
-    return body
+    dshape = axis_mul_before*axis_mul_after
+    fshape = data.shape[axis] * dshape
+
+    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
+    new_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_new", data_alignment=8)
+    out_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_out", data_alignment=8)
+    new_data_buf = api.decl_buffer(
+        dshape, data.dtype, "data_new", data_alignment=8)
+
+    loc = \
+        tvm.extern([(dshape,)],
+                   [index],
+                   lambda ins, outs: sort_pre_ir(
+                       ins[0], outs[0], axis_mul_before, axis_mul_after),
+                   dtype=[index.dtype],
+                   in_buffers=index_buf,
+                   out_buffers=[loc_buf],
+                   tag="sorting_prepare")
+
+    data_new, index_new = \
+        tvm.extern([(dshape,), (fshape,)],
+                   [data, index, loc],
+                   lambda ins, outs: sort_pre_ir_data(
+                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
+                       axis_mul_before, axis_mul_after),
+                   dtype=[data.dtype, index.dtype],
+                   in_buffers=[data_buf, index_buf, loc_buf],
+                   out_buffers=[new_data_buf, new_index_buf],
+                   tag="sorting_data")
+
+    index_out = \
+        tvm.extern([(fshape,)],
+                   [data, index, data_new, index_new, loc],
+                   lambda ins, outs: sort_oet_ir(
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
+                       axis_mul_before, axis_mul_after, axis, is_descend),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf,
+                               new_data_buf, new_index_buf, loc_buf],
+                   out_buffers=[out_index_buf],
+                   tag="sorting_oet")
+    out = \
+        tvm.extern([data.shape],
+                   [data, index, index_out, loc],
+                   lambda ins, outs: sort_ir_out(
+                       ins[0], ins[1], ins[2], ins[3], outs[0],
+                       axis_mul_before, axis_mul_after, axis),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
+                   out_buffers=output_buf,
+                   tag="sorting_output")
+    return out
 
 
 def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
@@ -333,15 +622,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk
     sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                       "sort_tensor_buf", data_alignment=8)
 
-    sort_tensor = \
-        tvm.extern(score_shape,
-                   [score_tensor, valid_count],
-                   lambda ins, outs: sort_ir(
-                       ins[0], ins[1], outs[0], score_axis, True),
-                   dtype=sort_tensor_dtype,
-                   in_buffers=[score_tensor_buf, valid_count_buf],
-                   out_buffers=sort_tensor_buf,
-                   name="nms_sort")
+    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
+                           valid_count_buf, sort_tensor_buf, score_axis, True)
     out = \
         tvm.extern(data.shape,
                    [data, sort_tensor, valid_count],
diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py
index c22e7a513d7d..3c013c4d1605 100644
--- a/topi/python/topi/cuda/ssd/multibox.py
+++ b/topi/python/topi/cuda/ssd/multibox.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements
+# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args
 """SSD multibox operators"""
 from __future__ import absolute_import as _abs
 import math
@@ -13,6 +13,7 @@
 from topi.vision.ssd import multibox_transform_loc
 from ..nms import nms
 
+
 def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     """Low level IR routing for multibox_prior operator.
 
@@ -41,7 +42,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     stmt : Stmt
         The result IR statement.
     """
-    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
+    max_threads = int(math.sqrt(
+        tvm.target.current_target(allow_none=False).max_num_threads))
     tx = tvm.thread_axis("threadIdx.x")
     ty = tvm.thread_axis("threadIdx.y")
     bx = tvm.thread_axis("blockIdx.x")
@@ -76,7 +78,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
             for k in range(num_sizes + num_ratios - 1):
                 w = tvm.select(k < num_sizes,
-                               size_ratio_concat[k] * in_height / in_width / 2.0,
+                               size_ratio_concat[
+                                   k] * in_height / in_width / 2.0,
                                size_ratio_concat[0] * in_height / in_width *
                                math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                 h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0,
@@ -93,7 +96,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
 
 @multibox_prior.register(["cuda", "gpu"])
-def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
+def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                        offsets=(0.5, 0.5), clip=False):
     """Generate prior(anchor) boxes from data, sizes and ratios.
 
@@ -124,31 +127,114 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
     """
     num_sizes = len(sizes)
     num_ratios = len(ratios)
-    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
+    oshape = (
+        1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
     out = tvm.extern(oshape, [data], lambda ins, outs:
-                     multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets),
+                     multibox_prior_ir(
+                         ins[0], outs[0], sizes, ratios, steps, offsets),
                      tag="multibox_prior")
     if clip:
         out = topi.clip(out, 0, 1)
     return out
 
 
-def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances):
-    """Low level IR routing for transform location in multibox_detection operator.
+def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold):
+    """Low level IR routing for transform location data preparation.
 
     Parameters
     ----------
     cls_prob : Buffer
         Buffer of class probabilities.
 
+    valid_count : Buffer
+        Buffer of number of valid output boxes.
+
+    temp_flag : Buffer
+        Output intermediate result buffer
+
+    temp_id : Buffer
+        Output intermediate result buffer
+
+    temp_score_out : Buffer
+        Output buffer
+
+    threshold : float
+        Threshold to be a positive prediction.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    batch_size = cls_prob.shape[0]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
+
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    ib = tvm.ir_builder.create()
+    score = ib.buffer_ptr(temp_score_out)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    nthread_tx = max_threads
+    nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    p_cls_prob = ib.buffer_ptr(cls_prob)
+    p_valid_count = ib.buffer_ptr(valid_count)
+
+    with ib.if_scope(tid < batch_size * num_anchors):
+        n = tid / num_anchors  # number of batches
+        i = tid % num_anchors  # number of anchors
+        score[i] = -1.0
+        cls_id[i] = 0
+        p_valid_count[n] = 0
+        with ib.for_range(0, num_classes-1, name="k") as k:
+            temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i]
+            with ib.if_scope(temp > score[i]):
+                cls_id[i] = k + 1
+                score[i] = temp
+        with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)):
+            cls_id[i] = 0
+        with ib.if_scope(cls_id[i] > 0):
+            flag[i] = 1
+        with ib.else_scope():
+            flag[i] = 0
+
+        with ib.if_scope(tid < batch_size):
+            with ib.for_range(0, num_anchors, name="k") as k:
+                with ib.if_scope(k > 0):
+                    flag[tid * num_anchors +
+                         k] += flag[tid * num_anchors + k - 1]
+            p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1]
+
+    body = ib.get()
+    return body
+
+
+def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \
+                     out, clip, variances, batch_size, num_classes, num_anchors):
+    """Low level IR routing for transform location in multibox_detection operator.
+
+    Parameters
+    ----------
     loc_pred : Buffer
         Buffer of location regression predictions.
 
     anchor : Buffer
         Buffer of prior anchor boxes.
 
-    valid_count : Buffer
-        Buffer of number of valid output boxes.
+    temp_flag : Buffer
+        Intermediate result buffer.
+
+    temp_id : Buffer
+        Intermediate result buffer.
+
+    temp_score_in : Buffer
+        Input buffer which stores intermediate results.
 
     out : Buffer
         Output buffer.
@@ -156,12 +242,18 @@ def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, thresho
     clip : boolean
         Whether to clip out-of-boundary boxes.
 
-    threshold : float
-        Threshold to be a positive prediction.
-
     variances : tuple of float
         Variances to be decoded from box regression output.
 
+    batch_size : int
+        Batch size
+
+    num_classes : int
+        Number of classes
+
+    num_anchors : int
+        Number of anchors
+
     Returns
     -------
     stmt : Stmt
@@ -187,21 +279,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
         ow = tvm.exp(pw * vw) * aw / 2.0
         oh = tvm.exp(ph * vh) * ah / 2.0
         return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
-
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
 
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
     ib = tvm.ir_builder.create()
-    temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \
-                 ), name="temp_score", scope="global")
-    score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local")
-    cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local")
-    flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global")
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
+    score = ib.buffer_ptr(temp_score_in)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     nthread_tx = max_threads
@@ -209,42 +296,13 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
-    p_cls_prob = ib.buffer_ptr(cls_prob)
     p_loc_pred = ib.buffer_ptr(loc_pred)
     p_anchor = ib.buffer_ptr(anchor)
-    p_valid_count = ib.buffer_ptr(valid_count)
     p_out = ib.buffer_ptr(out)
-    with ib.if_scope(tid < batch_size * num_anchors * num_classes):
-        n = tid / (num_anchors * num_classes)
-        j = (tid % (num_anchors * num_classes)) / num_anchors
-        i = tid % num_anchors
-        with ib.if_scope(j > 0):
-            temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \
-            p_cls_prob[tid]
-        p_valid_count[n] = 0
-    with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
-        score[tid] = -1.0
-        cls_id[tid] = 0
-        with ib.for_range(0, num_classes-1, name="k") as k:
-            temp = temp_score[tid * (num_classes-1) + k]
-            cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid])
-            score[tid] = tvm.make.Max(temp, score[tid])
-        with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)):
-            cls_id[tid] = 0
-        with ib.if_scope(cls_id[tid] > 0):
-            flag[tid] = 1
-        with ib.else_scope():
-            flag[tid] = 0
-    with ib.if_scope(tid < batch_size):
-        with ib.for_range(0, num_anchors, name="k") as k:
-            with ib.if_scope(k > 0):
-                flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1]
-        p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1]
+
     with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
+        n = tid / num_anchors  # number of batches
+        i = tid % num_anchors  # number of anchors
         with ib.if_scope(cls_id[tid] > 0):
             with ib.if_scope(tid == 0):
                 out_base_idx = n * num_anchors * 6
@@ -253,17 +311,17 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
             p_out[out_base_idx] = cls_id[tid] - 1.0
             p_out[out_base_idx + 1] = score[tid]
             p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \
-            p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4,
-                                                    clip, variances[0], variances[1],
-                                                    variances[2], variances[3])
+                p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4,
+                                                        p_anchor, i*4, clip, variances[0],
+                                                        variances[1], variances[2], variances[3])
 
     body = ib.get()
     return body
 
 
 @multibox_transform_loc.register(["cuda", "gpu"])
-def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
-                               variances=(0.1, 0.1, 0.2, 0.2)):
+def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \
+                               threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
     """Location transformation for multibox detection
 
     Parameters
@@ -297,20 +355,42 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=
         1-D tensor with shape (batch_size,), number of valid anchor boxes.
     """
     batch_size = cls_prob.shape[0]
-    num_anchors = anchor.shape[1]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
     oshape = (batch_size, num_anchors, 6)
     # Define data alignment for intermediate buffer
     valid_count_dtype = "int32"
     valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                       "valid_count_buf", data_alignment=4)
-    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
-    valid_count, out = \
-        tvm.extern([(batch_size,), oshape],
-                   [cls_prob, loc_pred, anchor],
+    out_buf = api.decl_buffer(
+        oshape, cls_prob.dtype, "out_buf", data_alignment=8)
+    size = num_anchors
+    temp_flag_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "flag", data_alignment=8)
+    temp_id_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "cls_id", data_alignment=8)
+    temp_score_buf = api.decl_buffer(
+        (size,), cls_prob.dtype, "score", data_alignment=8)
+
+    valid_count, temp_flag, temp_id, temp_score = \
+        tvm.extern([(batch_size,), (size,), (size,), (size,)],
+                   [cls_prob],
+                   lambda ins, outs: transform_loc_pre(
+                       ins[0], outs[0], outs[1], outs[2], outs[3], threshold),
+                   dtype=[valid_count_dtype,
+                          valid_count_dtype, valid_count_dtype, cls_prob.dtype],
+                   out_buffers=[valid_count_buf,
+                                temp_flag_buf, temp_id_buf, temp_score_buf],
+                   tag="multibox_transform_loc_first_step")
+
+    out = \
+        tvm.extern([oshape],
+                   [loc_pred, anchor, temp_flag, temp_id, temp_score],
                    lambda ins, outs: transform_loc_ir(
-                       ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances),
-                   dtype=[valid_count_dtype, cls_prob.dtype],
-                   out_buffers=[valid_count_buf, out_buf],
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \
+                       variances, batch_size, num_classes, num_anchors),
+                   dtype=[cls_prob.dtype],
+                   out_buffers=[out_buf],
                    tag="multibox_transform_loc")
     return [out, valid_count]
 
@@ -356,5 +436,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01
     """
     inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor,
                                        clip, threshold, variances)
-    out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
+    out = nms(
+        inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
     return out

From 3900392ccaa881c88398acd64844d36c794cd0bd Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Tue, 14 Aug 2018 01:43:09 +0530
Subject: [PATCH 011/529] Split_indices negative axis added (#1595)

---
 topi/include/topi/transform.h                | 5 +++++
 topi/tests/python_cpp/test_topi_transform.py | 1 +
 2 files changed, 6 insertions(+)

diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 09af612b957b..245b38cfb63d 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -475,6 +475,11 @@ inline Array<Tensor> split_sections(const Tensor& x,
                            int axis,
                            std::string name = "tensor",
                            std::string tag = kInjective) {
+  if (axis < 0) {
+    axis += static_cast<int>(x->shape.size());
+  }
+  CHECK_LT(axis, x->shape.size()) << "axis out of bounds";
+
   auto src_axis_size = static_cast<int>(GetConstInt(x->shape[axis]));
 
   CHECK_GT(num_sections, 0) << "Slice count must be > 0";
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index c8b7c3906caa..3f7bdbfdd499 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -340,6 +340,7 @@ def test_concatenate():
 
 def test_split():
     verify_split((2, 12, 3), 3, 1)
+    verify_split((2, 12, 3), 3, -1)
     verify_split((2, 12, 3), [2, 4], 1)
     verify_split((10, 12, 24), [5, 7, 9], -1)
 

From 4daa9ee8ada2b99688ccb01f7a9257f58ae483ee Mon Sep 17 00:00:00 2001
From: Albin Joy <albin.joy@huawei.com>
Date: Tue, 14 Aug 2018 02:44:26 +0530
Subject: [PATCH 012/529] [FRONTEND][TENSORFLOW] Optimized tensorflow testcases
 (#1546)

* [NNVM][TENSORFLOW] Optimized tensorflow testcases

* Replace Constants with Placeholder

* Review comment fix
---
 .../frontend/tensorflow/test_forward.py       | 457 ++++++------------
 1 file changed, 136 insertions(+), 321 deletions(-)

diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 495852f9e5d6..64c57c126f8d 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -83,6 +83,34 @@ def run_tf_graph(sess, input_data, input_node, output_node):
     output_data = sess.run(tensor, input_dict)
     return output_data
 
+
+def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False):
+    """Generic function to generate and compare tensorflow and TVM output"""
+
+    out_node = out_name.split(':')[0] if ":" in out_name else out_name
+
+    if isinstance(in_name, list):
+        in_node = [0]*len(in_name)
+        for i in range(len(in_name)):
+            in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i]
+    else:
+        in_node = in_name.split(':')[0] if ":" in in_name else in_name
+
+    with tf.Session() as sess:
+        if init_global_variables:
+            sess.run(variables.global_variables_initializer())
+        final_graph_def = tf.graph_util.convert_variables_to_constants(
+            sess,
+            sess.graph.as_graph_def(add_shapes=True),
+            [out_node],
+            )
+
+        tf_output = run_tf_graph(sess, in_data, in_name, out_name)
+        tvm_output = run_tvm_graph(final_graph_def, in_data,
+                                   in_node, tf_output.shape, tf_output.dtype)
+        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+        sess.close()
+
 #######################################################################
 # Pooling
 # -------
@@ -93,31 +121,15 @@ def _test_pooling(input_shape, **kwargs):
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
-        # pylint: disable=unused-variable
-        pool = nn_ops.pool(in_data, **kwargs)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
+        nn_ops.pool(in_data, **kwargs)
 
         if kwargs['pooling_type'] == 'MAX':
-            out_node = 'max_pool'
             out_name = 'max_pool:0'
         else:
-            out_node = 'avg_pool'
             out_name = 'avg_pool:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
-
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-
-            sess.close()
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
 def test_forward_pooling():
     """ Pooling """
@@ -195,35 +207,19 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes,
     filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)]
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data_array, shape=tensor_in_sizes, dtype='float32')
+        in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
         in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32')
         strides = [1] + strides + [1]
         dilations = [1] + dilations + [1]
 
-        # pylint: disable=unused-variable
-        conv = nn_ops.conv2d(in_data,
-                             in_filter,
-                             strides=strides,
-                             padding=padding,
-                             data_format=data_format)
-        # pylint: enable=unused-variable
+        nn_ops.conv2d(in_data,
+                      in_filter,
+                      strides=strides,
+                      padding=padding,
+                      data_format=data_format)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Conv2D'],
-                )
-
-            tf_output = run_tf_graph(sess, np.reshape(data_array, tensor_in_sizes),
-                                     'Const:0', 'Conv2D:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       np.reshape(data_array, tensor_in_sizes).astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-
-            sess.close()
+        compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'),
+                            'Placeholder:0', 'Conv2D:0')
 
 def test_forward_convolution():
     _test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC')
@@ -239,28 +235,10 @@ def _test_reshape(data, out_shape):
     """ One iteration of reshape operation with given data and out shape """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
-        reshape_out = array_ops.reshape(in_data, out_shape)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
+        array_ops.reshape(in_data, out_shape)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Reshape'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Reshape:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Reshape:0')
 
 def test_forward_reshape():
     _test_reshape(np.arange(6.0), [2, 3])
@@ -279,31 +257,14 @@ def _test_squeeze(data, squeeze_dims=None):
         squeeze_dims = []
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
 
-        # pylint: disable=unused-variable
         if squeeze_dims:
-            squeeze_out = array_ops.squeeze(in_data, squeeze_dims)
+            array_ops.squeeze(in_data, squeeze_dims)
         else:
-            squeeze_out = array_ops.squeeze(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Squeeze'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Squeeze:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
+            array_ops.squeeze(in_data)
 
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Squeeze:0')
 
 def test_forward_squeeze():
     """ Squeeze """
@@ -336,28 +297,10 @@ def _test_concat_v2(data, dim):
     """ One iteration of ConcatV2 """
 
     with tf.Graph().as_default():
+        gen_array_ops._concat_v2(data, dim)
 
-        # pylint: disable=unused-variable
-        concat_out = gen_array_ops._concat_v2(data, dim)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ConcatV2'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], 'ConcatV2:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       ["ConcatV2/values_0", 'ConcatV2/values_1'],
-                                       tf_output.shape, tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'],
+                            'ConcatV2:0')
 
 def _test_forward_concat_v2():
     t1 = np.array([])
@@ -377,28 +320,10 @@ def _test_sigmoid(data):
     """ One iteration of sigmoid """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         sigmoid_out = math_ops.sigmoid(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Sigmoid'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Sigmoid:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Sigmoid:0')
 
 def test_forward_sigmoid():
     """ Sigmoid """
@@ -412,24 +337,10 @@ def test_forward_sigmoid():
 def _test_argx(func, data, **kwargs):
 
     with tf.Graph().as_default():
-        inp = constant_op.constant(data, shape=data.shape, dtype=data.dtype, name="c0")
-
-        # pylint: disable=unused-variable
-        out = func(inp, name="argx0", **kwargs)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess=sess,
-                input_graph_def=sess.graph.as_graph_def(add_shapes=True),
-                output_node_names=["argx0"])
-
-            tf_output = run_tf_graph(sess, data, input_node="c0:0", output_node="argx0:0")
-            tvm_output = run_tvm_graph(graph_def, data, "c0", tf_output.shape, output_dtype='int32')
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="argx0", **kwargs, output_type=tf.int32)
 
-            sess.close()
+        compare_tf_with_tvm(data, 'c0:0', 'argx0:0')
 
 def test_argmin_argmax():
     for axis in [None,0,1,2]:
@@ -442,6 +353,8 @@ def test_argmin_argmax():
 # --------
 
 def _test_variable(data):
+    """ One iteration of a variable """
+
     tf.reset_default_graph()
     input_op = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
     input_tensor = array_ops.reshape(input_op, data.shape)
@@ -450,84 +363,15 @@ def _test_variable(data):
     with variable_scope.variable_scope("linear", reuse=None):
         w = variable_scope.get_variable(
             "w", shape=[size, size], dtype=input_tensor.dtype)
-    # pylint: disable=unused-variable
-    output_op = math_ops.matmul(input_tensor, w)
-    # pylint: enable=unused-variable
-
-    with tf.Session() as sess:
-        sess.run(variables.global_variables_initializer())
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['MatMul'],
-            )
-
-        tf_output = run_tf_graph(sess, data, 'Placeholder:0', 'MatMul:0')
-        tvm_output = run_tvm_graph(final_graph_def, data,
-                                   "Placeholder", tf_output.shape, data.dtype)
+    math_ops.matmul(input_tensor, w)
 
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(data, 'Placeholder:0', 'MatMul:0', init_global_variables=True)
 
 def test_forward_variable():
     """Variable type op test"""
     _test_variable(np.random.uniform(size=(32, 100)).astype('float32'))
 
 
-#######################################################################
-# LSTM
-# ----
-def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
-    tf.reset_default_graph()
-    input_size = num_hidden
-    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
-    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-
-    def _get_tensorflow_output():
-        with tf.Session() as sess:
-            with variable_scope.variable_scope(
-                "root", initializer=init_ops.constant_initializer(0.5)):
-                m0 = array_ops.zeros([batch_size, num_hidden])
-                m1 = array_ops.zeros([batch_size, num_hidden])
-                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
-                g, ((out_m0, out_m1)) = \
-                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
-                                                  forget_bias=forget_bias)(x, ((m0, m1)))
-                sess.run([variables.global_variables_initializer()])
-                res = sess.run([g, out_m0, out_m1], {
-                    x.name: np.array([[1., 1.]]),
-                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
-                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
-                })
-            graph_def = sess.graph.as_graph_def(add_shapes=True)
-            final_graph_def = graph_util.convert_variables_to_constants(
-                sess,
-                graph_def,
-                ['root/lstm_cell/LSTMBlockCell'])
-            return final_graph_def, res
-
-    graph_def, tf_out = _get_tensorflow_output()
-    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
-                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
-                                'root/lstm_cell/LSTMBlockCell_h'],
-                               [tf_out[0].shape, (2, batch_size, num_hidden)],
-                               [tf_out[0].dtype, tf_out[1].dtype])
-
-    if isinstance(tvm_output, list):
-        out = tvm_output[0]
-        out_state = tvm_output[1]
-        out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
-        out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
-        out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
-        tvm_out = [out, out_state_c, out_state_h]
-        np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
-
-def test_forward_lstm():
-    '''test LSTM block cell'''
-    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
-
-
 #######################################################################
 # StridedSlice
 # ------------
@@ -535,6 +379,8 @@ def test_forward_lstm():
 def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                              begin_mask=0, end_mask=0, new_axis_mask=0,
                              shrink_axis_mask=0, ellipsis_mask=0):
+    """ One iteration of a Stridedslice """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     tf.strided_slice(in_data, begin, end, stride, begin_mask=begin_mask,
@@ -543,17 +389,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                          ellipsis_mask=ellipsis_mask, name="strided_slice")
     np_data = np.random.uniform(size=ip_shape).astype(dtype)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['strided_slice'])
-        tf_output = run_tf_graph(sess, np_data,
-                                 'in_data:0', 'strided_slice:0')
-        tvm_output = run_tvm_graph(final_graph_def, np_data,
-                                   "in_data", tf_output.shape, np_data.dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(np_data, 'in_data:0', 'strided_slice:0')
 
 def test_forward_stridedslice():
     '''test StridedSlice'''
@@ -586,6 +422,8 @@ def test_forward_stridedslice():
 # ------
 
 def _test_gather(ip_shape, indice_shape, indice_value, axis, dtype):
+    """ One iteration of a Gather """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     indices = tf.placeholder("int32", indice_shape, name="indices")
@@ -601,17 +439,7 @@ def _fill_indices(indice_value):
         return indices
     np_indices = _fill_indices(indice_value)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['GatherV2'])
-        tf_output = run_tf_graph(sess, [np_data, np_indices], ['in_data:0',
-                                 'indices:0'], 'GatherV2:0')
-        tvm_output = run_tvm_graph(final_graph_def, [np_data, np_indices],
-                                   ['in_data', 'indices'], tf_output.shape, dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm([np_data, np_indices], ['in_data:0', 'indices:0'], 'GatherV2:0')
 
 def test_forward_gather():
     '''test gather layer'''
@@ -640,28 +468,11 @@ def test_forward_multi_input():
 
         out1 = tf.add(in1, in2, name='out1')
         out2 = tf.subtract(in3, in4, name='out2')
-
         out = tf.multiply(out1, out2, name='out')
+        in_data = np.arange(9, dtype='int32').reshape([3, 3])
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['out'],
-                )
-
-            in_data = np.arange(9, dtype='int32').reshape([3, 3])
-
-            tf_output = run_tf_graph(sess, [in_data, in_data, in_data, in_data ],
-                                     ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       [in_data, in_data, in_data, in_data ],
-                                       ['in1', 'in2', 'in3', 'in4'],
-                                       tf_output.shape, tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm([in_data, in_data, in_data, in_data],
+                            ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
 
 #######################################################################
 # Resize Bilinear
@@ -674,36 +485,75 @@ def _test_resize_bilinear(in_shape, to_shape, align_corners):
     shape_data = np.array(to_shape).astype('int32')
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         shape_data = constant_op.constant(shape_data, shape=shape_data.shape, dtype=shape_data.dtype)
+        tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
 
-        # pylint: disable=unused-variable
-        resize_out = tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
-        # pylint: enable=unused-variable
+        compare_tf_with_tvm(data, 'Placeholder:0', 'ResizeBilinear:0')
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ResizeBilinear'],
-                )
+def test_forward_resize_bilinear():
+    """ Resize Bilinear """
 
-            tf_output = run_tf_graph(sess, data,
-                    'Const:0', 'ResizeBilinear:0')
+    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
+    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
 
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
+#######################################################################
+# LSTM
+# ----
 
-            sess.close()
+def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
+    """ One iteration of a LSTM cell """
 
-def test_forward_resize_bilinear():
-    """ Resize Bilinear """
+    tf.reset_default_graph()
+    input_size = num_hidden
+    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
+    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
+    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
 
-    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
-    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
+    def _get_tensorflow_output():
+        with tf.Session() as sess:
+            with variable_scope.variable_scope(
+                "root", initializer=init_ops.constant_initializer(0.5)):
+                m0 = array_ops.zeros([batch_size, num_hidden])
+                m1 = array_ops.zeros([batch_size, num_hidden])
+                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
+                g, ((out_m0, out_m1)) = \
+                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
+                                                  forget_bias=forget_bias)(x, ((m0, m1)))
+                sess.run([variables.global_variables_initializer()])
+                res = sess.run([g, out_m0, out_m1], {
+                    x.name: np.array([[1., 1.]]),
+                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
+                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
+                })
+            graph_def = sess.graph.as_graph_def(add_shapes=True)
+            final_graph_def = graph_util.convert_variables_to_constants(
+                sess,
+                graph_def,
+                ['root/lstm_cell/LSTMBlockCell'])
+            return final_graph_def, res
+
+    graph_def, tf_out = _get_tensorflow_output()
+    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
+                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
+                                'root/lstm_cell/LSTMBlockCell_h'],
+                               [tf_out[0].shape, (2, batch_size, num_hidden)],
+                               [tf_out[0].dtype, tf_out[1].dtype])
+    assert isinstance(tvm_output, list)
+
+    out = tvm_output[0]
+    out_state = tvm_output[1]
+    out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
+    out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
+    out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
+    tvm_out = [out, out_state_c, out_state_h]
+    np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
+
+def test_forward_lstm():
+    '''test LSTM block cell'''
+
+    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
 #######################################################################
 # Pad
@@ -714,30 +564,17 @@ def _test_pad(input_shape, paddings, mode, **kwargs):
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
         pad_values = constant_op.constant(paddings)
         pad = tf.pad(in_data, paddings=pad_values, mode=mode, **kwargs)
 
         if mode == 'CONSTANT':
             if 'constant_values' in kwargs:
-                out_node = 'PadV2'
                 out_name = 'PadV2:0'
             else:
-                out_node = 'Pad'
                 out_name = 'Pad:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
-
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output)
-            sess.close()
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
 def test_forward_pad():
     """ Pad """
@@ -944,17 +781,7 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta):
                                             alpha=alpha,
                                             beta=beta)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['lrn'],)
-            tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "lrn0_data", tf_output.shape, tf_output.dtype)
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'lrn0_data:0', 'lrn:0')
 
 def test_forward_lrn():
     _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -962,38 +789,26 @@ def test_forward_lrn():
 #######################################################################
 # l2_normalize
 # ------------
+
 def _test_l2_normalize(ishape, eps, axis):
     """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)"""
 
     inp_array = np.random.uniform(size=ishape).astype(np.float32)
-    inp_array.fill(1)
 
     with tf.Graph().as_default():
-        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder")
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
         nn.l2_normalize(in1,
                         axis=axis,
                         epsilon=eps,
                         name=None,
                         dim=None)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['l2_normalize'],
-                )
-            tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "Placeholder",
-                                       tf_output.shape,
-                                       tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'l2_normalize:0')
+
 def test_forward_l2_normalize():
     _test_l2_normalize((1, 3, 20, 20), 0.001, (0,))
 
+
 #######################################################################
 # Main
 # ----
@@ -1011,7 +826,7 @@ def test_forward_l2_normalize():
     test_forward_mobilenet()
     test_forward_variable()
     test_forward_resize_bilinear()
-    test_forward_pad()    
+    test_forward_pad()
     test_forward_lstm()
     test_forward_stridedslice()
     test_forward_gather()

From f2814fc14eeaa12886810c90b29c9c834cb5f102 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 14 Aug 2018 21:28:34 +0530
Subject: [PATCH 013/529] [NNVM][DOC] Update NNVM symbol documentation to
 latest. Ref. 1591 (#1599)

---
 docs/nnvm_top.rst | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 96a37b779e1e..927990647a69 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -29,6 +29,7 @@ This level enables fully connected multi-layer perceptron.
 
    nnvm.symbol.dense
    nnvm.symbol.relu
+   nnvm.symbol.prelu
    nnvm.symbol.tanh
    nnvm.symbol.sigmoid
    nnvm.symbol.exp
@@ -39,6 +40,8 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.elemwise_mul
    nnvm.symbol.elemwise_div
    nnvm.symbol.elemwise_sum
+   nnvm.symbol.elemwise_mod
+   nnvm.symbol.elemwise_pow
    nnvm.symbol.flatten
    nnvm.symbol.concatenate
    nnvm.symbol.expand_dims
@@ -50,6 +53,14 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.log_softmax
    nnvm.symbol.pad
    nnvm.symbol.block_grad
+   nnvm.symbol.matmul
+   nnvm.symbol.resize
+   nnvm.symbol.upsampling
+   nnvm.symbol.take
+   nnvm.symbol.l2_normalize
+   nnvm.symbol.flip
+   nnvm.symbol.lrn
+   nnvm.symbol.where
 
 
 **Level 2: Convolutions**
@@ -92,6 +103,7 @@ This level enables typical convnet models.
    nnvm.symbol.__lshift_scalar__
    nnvm.symbol.__rshift_scalar__
 
+
 **Level 4: Broadcast and Reductions**
 
 .. autosummary::
@@ -117,11 +129,43 @@ This level enables typical convnet models.
    nnvm.symbol.ones_like
    nnvm.symbol.zeros
    nnvm.symbol.zeros_like
+   nnvm.symbol.slice_like
+   nnvm.symbol.strided_slice
+   nnvm.symbol.argmax
+   nnvm.symbol.argmin
+   nnvm.symbol.collapse_sum
+   nnvm.symbol.broadcast_equal
+   nnvm.symbol.broadcast_greater_equal
+   nnvm.symbol.broadcast_greater_equal
+   nnvm.symbol.broadcast_greater
+   nnvm.symbol.broadcast_left_shift
+   nnvm.symbol.broadcast_less_equal
+   nnvm.symbol.broadcast_less_equal
+   nnvm.symbol.broadcast_less
+   nnvm.symbol.broadcast_max
+   nnvm.symbol.broadcast_min
+   nnvm.symbol.broadcast_mod
+   nnvm.symbol.broadcast_not_equal
+   nnvm.symbol.broadcast_pow
+   nnvm.symbol.broadcast_right_shift
+
+
+**Level 5: Vision Operators**
+
+.. autosummary::
+   :nosignatures:
+
+   nnvm.symbol.multibox_prior
+   nnvm.symbol.multibox_transform_loc
+   nnvm.symbol.nms
+   nnvm.symbol.yolo_region
+   nnvm.symbol.yolo_reorg
 
 Detailed Definitions
 --------------------
 .. autofunction:: nnvm.symbol.dense
 .. autofunction:: nnvm.symbol.relu
+.. autofunction:: nnvm.symbol.prelu
 .. autofunction:: nnvm.symbol.tanh
 .. autofunction:: nnvm.symbol.sigmoid
 .. autofunction:: nnvm.symbol.exp
@@ -132,6 +176,8 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.elemwise_mul
 .. autofunction:: nnvm.symbol.elemwise_div
 .. autofunction:: nnvm.symbol.elemwise_sum
+.. autofunction:: nnvm.symbol.elemwise_mod
+.. autofunction:: nnvm.symbol.elemwise_pow
 .. autofunction:: nnvm.symbol.flatten
 .. autofunction:: nnvm.symbol.concatenate
 .. autofunction:: nnvm.symbol.expand_dims
@@ -143,6 +189,14 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.log_softmax
 .. autofunction:: nnvm.symbol.pad
 .. autofunction:: nnvm.symbol.block_grad
+.. autofunction:: nnvm.symbol.matmul
+.. autofunction:: nnvm.symbol.resize
+.. autofunction:: nnvm.symbol.upsampling
+.. autofunction:: nnvm.symbol.take
+.. autofunction:: nnvm.symbol.l2_normalize
+.. autofunction:: nnvm.symbol.flip
+.. autofunction:: nnvm.symbol.lrn
+.. autofunction:: nnvm.symbol.where
 
 .. autofunction:: nnvm.symbol.conv2d
 .. autofunction:: nnvm.symbol.conv2d_transpose
@@ -191,3 +245,28 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.ones_like
 .. autofunction:: nnvm.symbol.zeros
 .. autofunction:: nnvm.symbol.zeros_like
+.. autofunction:: nnvm.symbol.slice_like
+.. autofunction:: nnvm.symbol.strided_slice
+.. autofunction:: nnvm.symbol.argmax
+.. autofunction:: nnvm.symbol.argmin
+.. autofunction:: nnvm.symbol.collapse_sum
+.. autofunction:: nnvm.symbol.broadcast_equal
+.. autofunction:: nnvm.symbol.broadcast_greater_equal
+.. autofunction:: nnvm.symbol.broadcast_greater_equal
+.. autofunction:: nnvm.symbol.broadcast_greater
+.. autofunction:: nnvm.symbol.broadcast_left_shift
+.. autofunction:: nnvm.symbol.broadcast_less_equal
+.. autofunction:: nnvm.symbol.broadcast_less_equal
+.. autofunction:: nnvm.symbol.broadcast_less
+.. autofunction:: nnvm.symbol.broadcast_max
+.. autofunction:: nnvm.symbol.broadcast_min
+.. autofunction:: nnvm.symbol.broadcast_mod
+.. autofunction:: nnvm.symbol.broadcast_not_equal
+.. autofunction:: nnvm.symbol.broadcast_pow
+.. autofunction:: nnvm.symbol.broadcast_right_shift
+
+.. autofunction:: nnvm.symbol.multibox_prior
+.. autofunction:: nnvm.symbol.multibox_transform_loc
+.. autofunction:: nnvm.symbol.nms
+.. autofunction:: nnvm.symbol.yolo_region
+.. autofunction:: nnvm.symbol.yolo_reorg

From e9f942ab3aa2b386304ba05f5e72affd8e5b2260 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 14 Aug 2018 22:14:33 +0530
Subject: [PATCH 014/529] [NNVM][POOL] bug fix. Remove the hardcode. (#1600)

---
 nnvm/src/top/nn/pooling.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc
index cccd5b1c710b..8b9b7a64aa0d 100644
--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -77,7 +77,7 @@ inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs,
   } else {
     oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] +
                     param.strides[0] - 1) / param.strides[0]) + 1;
-    oshape[widx] = ((dshape[3] + pad_w - param.pool_size[1] +
+    oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] +
                     param.strides[1] - 1) / param.strides[1]) + 1;
   }
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);

From 9c8d9d2c8f367c8a2249b5e783213ab41db2cfd8 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 14 Aug 2018 23:18:02 +0530
Subject: [PATCH 015/529] [FRONTEND][DARKNET]LSTM and GRU support (#1576)

---
 nnvm/python/nnvm/frontend/darknet.py          | 140 ++++++++++++++++--
 nnvm/python/nnvm/testing/darknet.py           |   3 +
 .../python/frontend/darknet/test_forward.py   |  44 +++++-
 3 files changed, 175 insertions(+), 12 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 3a197a416219..3aa36b7e7ef9 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -412,7 +412,12 @@ def __init__(self, net, dtype='float32'):
         self._sym_array = {}
         self._tvmparams = {}
         self._outs = []
-        self._rnn_state_ctr = 0
+        self._state_ctr = {}
+        self._state_ctr['rnn'] = 0
+        self._state_ctr['crnn'] = 0
+        self._state_ctr['lstm'] = 0
+        self._state_ctr['cell_state'] = 0
+        self._state_ctr['gru'] = 0
 
     def _read_memory_buffer(self, shape, data):
         length = 1
@@ -623,16 +628,16 @@ def _get_opname(self, layer):
         """Returs the layer name."""
         return layer.type
 
-    def _new_rnn_state_sym(self, state=None):
+    def _new_rnn_state_sym(self, state=None, name='rnn'):
         """Returs a symbol for state"""
-        name = "rnn%d_state" % (self._rnn_state_ctr)
-        self._rnn_state_ctr += 1
-        return _sym.Variable(name=name, init=state)
+        sym_name = name + "%d_state" % self._state_ctr[name]
+        self._state_ctr[name] += 1
+        return _sym.Variable(name=sym_name, init=state)
 
-    def _get_rnn_state_buffer(self, layer):
+    def _get_rnn_state_buffer(self, layer, name):
         """Get the state buffer for rnn."""
         buffer = np.zeros((1, layer.outputs), self.dtype)
-        return self._new_rnn_state_sym(buffer)
+        return self._new_rnn_state_sym(buffer, name)
 
     def _get_darknet_rnn_attrs(self, layer, sym):
         """Get the rnn converted symbol from attributes."""
@@ -653,7 +658,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'rnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -678,7 +683,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'crnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -698,6 +703,123 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             self._sym_array[layer_num] = sym
             processed = True
 
+        elif LAYERTYPE.LSTM == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step GRU")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            h_state = self._get_rnn_state_buffer(layer, 'lstm')
+            c_state = self._get_rnn_state_buffer(layer, 'cell_state')
+            for _ in range(layer.steps):
+                sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state)
+                sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state)
+                sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state)
+                sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state)
+
+                input_sym = sym
+                sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym)
+                sym_ui = self._get_darknet_rnn_attrs(layer.ui, input_sym)
+                sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym)
+                sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym)
+
+                new_inputs = _as_list([sym_wf, sym_uf])
+                add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wi, sym_ui])
+                add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wg, sym_ug])
+                add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wo, sym_uo])
+                add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_f, _ = _darknet_activations(_as_list(add_f), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_i, _ = _darknet_activations(_as_list(add_i), act_attr)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                act_g, _ = _darknet_activations(_as_list(add_g), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_o, _ = _darknet_activations(_as_list(add_o), act_attr)
+
+                new_inputs = _as_list([act_i, act_g])
+                mul_t = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([act_f, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([mul_t, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                h_state, _ = _darknet_activations(_as_list(c_state), act_attr)
+
+                new_inputs = _as_list([act_o, h_state])
+                h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+                self._outs = self._outs + [c_state, h_state]
+                sym = h_state
+            self._sym_array[layer_num] = sym
+            processed = True
+
+        elif LAYERTYPE.GRU == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step GRU")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            state = self._get_rnn_state_buffer(layer, "gru")
+            for _ in range(layer.steps):
+                sym_wz = self._get_darknet_rnn_attrs(layer.wz, state)
+                sym_wr = self._get_darknet_rnn_attrs(layer.wr, state)
+
+                input_sym = sym
+                sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym)
+                sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym)
+                sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym)
+
+                new_inputs = _as_list([sym_uz, sym_wz])
+                add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_ur, sym_wr])
+                add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_z, _ = _darknet_activations(_as_list(add_z), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_r, _ = _darknet_activations(_as_list(add_r), act_attr)
+
+                new_inputs = _as_list([act_r, state])
+                forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot)
+
+                new_inputs = _as_list([sym_uh, sym_wh])
+                h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                if layer.tanh == 1:
+                    act_attr['activation'] = ACTIVATION.TANH
+                else:
+                    act_attr['activation'] = ACTIVATION.LOGISTIC
+                h_state, _ = _darknet_activations(_as_list(h_state), act_attr)
+
+                sym = act_z * state + (1 - act_z) * h_state
+
+                self._outs = self._outs + [sym]
+            self._sym_array[layer_num] = sym
+            processed = True
+
         return processed, sym
 
     def from_darknet(self):
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index 362fd3058954..e3d110e9605e 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -491,6 +491,9 @@ class ACTIVATION(object):
 layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
 layer make_softmax_layer(int batch, int inputs, int groups);
 layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
+layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 void free_network(network *net);
 """
                    )
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index e68aed085664..5fc71a86211e 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -306,7 +306,7 @@ def test_forward_softmax_temperature():
     LIB.free_network(net)
 
 def test_forward_rnn():
-    '''test softmax layer'''
+    '''test RNN layer'''
     net = LIB.make_network(1)
     batch = 1
     inputs = 256
@@ -325,7 +325,7 @@ def test_forward_rnn():
     LIB.free_network(net)
 
 def test_forward_crnn():
-    '''test softmax layer'''
+    '''test CRNN layer'''
     net = LIB.make_network(1)
     batch = 1
     c = 3
@@ -349,6 +349,42 @@ def test_forward_crnn():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_lstm():
+    '''test LSTM layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
+def test_forward_gru():
+    '''test GRU layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
 def test_forward_activation_logistic():
     '''test logistic activation layer'''
     net = LIB.make_network(1)
@@ -395,4 +431,6 @@ def test_forward_activation_logistic():
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()
-    test_forward_activation_logistic()
\ No newline at end of file
+    test_forward_lstm()
+    test_forward_gru()
+    test_forward_activation_logistic()

From d05026a24a2682b11fbe4eaf58790ad9dcbed41a Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 15 Aug 2018 15:34:05 -0700
Subject: [PATCH 016/529] [AUTOTVM] Fix GATuner and improve error message
 (#1605)

---
 include/tvm/operation.h                       |  2 ++
 python/tvm/autotvm/measure/measure_methods.py |  2 ++
 python/tvm/autotvm/task/nnvm_integration.py   | 15 ++++++++++++++-
 python/tvm/autotvm/task/task.py               |  2 +-
 python/tvm/autotvm/task/topi_integration.py   |  2 +-
 python/tvm/autotvm/tuner/callback.py          |  8 +++++++-
 python/tvm/autotvm/tuner/ga_tuner.py          | 14 ++++++++++----
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index d13680531af9..ed8be6e4a7c0 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -366,6 +366,8 @@ class ExternOpNode : public OperationNode {
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
     v->Visit("inputs", &inputs);
+    v->Visit("input_placeholders", &input_placeholders);
+    v->Visit("output_placeholders", &output_placeholders);
     v->Visit("body", &body);
   }
   EXPORT static Operation make(std::string name,
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index e192ee26ee3e..d845cc1f88fd 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -394,6 +394,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
             msg = str(exc)
             if "Stack trace returned" in msg:
                 msg = msg[:msg.index("Stack trace returned")]
+            if "CUDA Source" in msg:
+                msg = msg[:msg.index("CUDA Source")]
             costs = (RuntimeError(msg),)
             errno = MeasureErrorNo.RUNTIME_DEVICE
         tstamp = time.time()
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 338b46784a75..1b50869fc378 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -4,12 +4,16 @@
 
 """
 import warnings
+import logging
+
 
 from ... import tensor, placeholder, target as _target
 
 from ..util import get_const_tuple
 from .task import create, register
+from .dispatcher import ApplyHistoryBest
 
+logger = logging.getLogger('autotvm')
 
 def serialize_args(args):
     """serialize arguments of a topi function to a hashable tuple.
@@ -176,8 +180,17 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
 
     # run compiler to collect all TOPI calls during compilation
     env.reset()
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a dummy target to do a fake compile for collecting topi calls
     dummy_target = _target.create("opencl -device=dummy")
-    nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+    with ApplyHistoryBest([], allow_fallback=True):
+        nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+
+    logger.disabled = old_state
 
     tasks = []
     for task_name, args in env.get_tasks():
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index 7a386f1f9e67..f8923fca56e3 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -368,7 +368,7 @@ def traverse(ops):
                 pass
             else:
                 raise FlopCalculationError("Only support tvm.compute currently. "
-                                           "Other ops like tvm.scan is not supported")
+                                           "Other ops like tvm.scan/tvm.extern is not supported")
         return ret
 
     try:
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 012ca4a214e9..18f45f8d6708 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -62,7 +62,7 @@ def _decorator(f):
         for target_key in targets:
             if target_key not in _REGISTED_DISPATHCER:
                 _REGISTED_DISPATHCER[target_key] = {}
-            if topi_compute not in _REGISTED_DISPATHCER:
+            if topi_compute not in _REGISTED_DISPATHCER[target_key]:
                 @topi_compute.register(target_key)
                 @dispatcher
                 def config_dispatcher(*args, **kwargs):
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index 15d5ac1c9689..6f66871f671c 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -101,11 +101,17 @@ def __init__(self):
             self.total = total
 
         def __del__(self):
-            sys.stdout.write(' Done.\n')
+            if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+                sys.stdout.write(' Done.\n')
 
     ctx = _Context()
     tic = time.time()
 
+    if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+        sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+                         '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic))
+        sys.stdout.flush()
+
     def _callback(tuner, inputs, results):
         ctx.ct += len(inputs)
 
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index 916bd4ee68c6..b92737ed5317 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -47,6 +47,7 @@ def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
 
         # random initialization
         self.pop_size = min(self.pop_size, len(self.space))
+        self.elite_num = min(self.pop_size, self.elite_num)
         for _ in range(self.pop_size):
             tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
             while knob2point(tmp_gene, self.dims) in self.visited:
@@ -70,9 +71,9 @@ def update(self, inputs, results):
                 y = inp.task.flop / np.mean(res.costs)
                 self.scores.append(y)
             else:
-                self.scores.append(0)
+                self.scores.append(0.0)
 
-        if len(self.scores) >= len(self.genes):
+        if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space):
             genes = self.genes + self.elites
             scores = np.array(self.scores[:len(self.genes)] + self.elite_scores)
 
@@ -85,8 +86,13 @@ def update(self, inputs, results):
 
             # cross over
             indices = np.arange(len(genes))
-            scores /= np.max(scores)
-            probs = scores / np.sum(scores)
+            max_score = np.max(scores)
+            if max_score < 1e-8:
+                probs = np.empty_like(scores)
+                probs[:] = 1.0 / len(scores)
+            else:
+                scores /= max_score
+                probs = scores / np.sum(scores)
             tmp_genes = []
             for _ in range(self.pop_size):
                 p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)

From a1829b39655dafc814e67e83119677d464a3d279 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 15 Aug 2018 15:42:24 -0700
Subject: [PATCH 017/529] [NNVM] Add symbol for inception v3 (#1604)

---
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/inception_v3.py      | 255 ++++++++++++++++++
 nnvm/python/nnvm/testing/squeezenet.py        |   2 +-
 nnvm/src/compiler/graph_hash.cc               |   2 +-
 .../frontend/mxnet/model_zoo/__init__.py      |   9 +-
 .../frontend/mxnet/model_zoo/inception_v3.py  | 170 ++++++++++++
 .../tests/python/frontend/mxnet/test_graph.py |  11 +-
 7 files changed, 442 insertions(+), 8 deletions(-)
 create mode 100644 nnvm/python/nnvm/testing/inception_v3.py
 create mode 100644 nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py

diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index bff828d68280..4a879047ec7e 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -8,6 +8,7 @@
 from . import resnet
 from . import vgg
 from . import squeezenet
+from . import inception_v3
 from . import dcgan
 from . import dqn
 from . import yolo2_detection
diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py
new file mode 100644
index 000000000000..f14daa1ae656
--- /dev/null
+++ b/nnvm/python/nnvm/testing/inception_v3.py
@@ -0,0 +1,255 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
+arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+# pylint: disable=invalid-name,missing-docstring,unused-argument
+from .. import symbol as sym
+from .utils import create_workload
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel,
+                      strides=stride, padding=pad, use_bias=False,
+                      name='%s%s_conv2d' % (name, suffix))
+    bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5)
+    act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix))
+    return act
+
+def Pooling(data, kernel, stride, pad, pool_type, name):
+    if pool_type == 'max':
+        return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name)
+    elif pool_type == 'avg':
+        return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name,
+                              count_include_pad=True)
+    else:
+        raise ValueError("Invalid pooling type: " + pool_type)
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+
+    cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
+    concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
+                      name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                      name=('%s_tower' % name), suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
+                      name=('max_pool_%s_pool' % name))
+    concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
+                 name=('%s_tower_2' % name), suffix='_conv')
+    # concat
+    concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name),
+                     suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                        name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2),
+                        name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
+                      name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
+                      name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name),
+                 suffix='_conv')
+    # concat
+    concat = sym.concatenate(
+        *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj],
+        name='ch_concat_%s_chconcat' % name)
+    return concat
+
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                   name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                    name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0),
+                   name="global_pool")
+    flatten = sym.flatten(data=pool, name="flatten")
+    fc1 = sym.dense(data=flatten, units=num_classes, name='fc1')
+    softmax = sym.softmax(data=fc1, name='softmax')
+    return softmax
+
+def get_workload(batch_size=1, num_classes=1000,
+                 image_shape=(3, 299, 299), dtype="float32", **kwargs):
+    """Get benchmark workload for InceptionV3
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py
index a445e8cfb7da..eab2cf06fee6 100644
--- a/nnvm/python/nnvm/testing/squeezenet.py
+++ b/nnvm/python/nnvm/testing/squeezenet.py
@@ -98,7 +98,7 @@ def get_symbol(num_classes, version, **kwargs):
 
 def get_workload(batch_size=1, num_classes=1000, version='1.0',
                  image_shape=(3, 224, 224), dtype="float32", **kwargs):
-    """Get benchmark workload for resnet
+    """Get benchmark workload for SqueezeNet
 
     Parameters
     ----------
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index d881130f72cc..ccd2e3ce433f 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -125,7 +125,7 @@ std::string GraphDeepCompare(const Graph& a,
   const IndexedGraph& idxb = b.indexed_graph();
   std::ostringstream err;
   if (idxa.num_nodes() != idxb.num_nodes()) {
-    err << "Number of nodes mismatch";
+    err << "Number of nodes mismatch (" <<  idxa.num_nodes() << " v.s " << idxb.num_nodes() << ")";
     return err.str();
   }
   if (idxa.num_node_entries() != idxb.num_node_entries()) {
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
index e3c9acdf23ef..66e743ad9c33 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
@@ -1,11 +1,8 @@
 """MXNet and NNVM model zoo."""
 from __future__ import absolute_import
-from . import mlp, resnet, vgg, dqn, dcgan, squeezenet
+from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3
 import nnvm.testing
 
-__all__ = ['mx_mlp', 'nnvm_mlp', 'mx_resnet', 'nnvm_resnet', 'mx_vgg', 'nnvm_vgg',
-           'mx_squeezenet', 'nnvm_squeezenet']
-
 _num_class = 1000
 
 # mlp fc
@@ -35,6 +32,10 @@
     mx_squeezenet[version] = squeezenet.get_symbol(version=version)
     nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0]
 
+# inception
+mx_inception_v3 = inception_v3.get_symbol()
+nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0]
+
 # dqn
 mx_dqn = dqn.get_symbol()
 nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0]
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py
index 18e124ad6ffc..e89224cd969e 100644
--- a/nnvm/tests/python/frontend/mxnet/test_graph.py
+++ b/nnvm/tests/python/frontend/mxnet/test_graph.py
@@ -39,17 +39,23 @@ def test_squeezenet():
         nnvm_sym = model_zoo.nnvm_squeezenet[version]
         compare_graph(from_mx_sym, nnvm_sym)
 
+def test_inception_v3():
+    mx_sym = model_zoo.mx_inception_v3
+    from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
+    nnvm_sym = model_zoo.nnvm_inception_v3
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299))
+
 def test_dqn():
     mx_sym = model_zoo.mx_dqn
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dqn
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84))
 
 def test_dcgan():
     mx_sym = model_zoo.mx_dcgan
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dcgan
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100))
 
 def test_multi_outputs():
     def compose(F, **kwargs):
@@ -70,3 +76,4 @@ def compose(F, **kwargs):
     test_dqn()
     test_dcgan()
     test_squeezenet()
+    test_inception_v3()

From 834d6fe42d98332d7cc5ed1069c8cd674b286da9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 15 Aug 2018 23:21:05 -0700
Subject: [PATCH 018/529] [TEAM] New reviewer: kevinthesun (#1606)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 2d571ba668ea..9db50b02b11a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -26,6 +26,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
+- [Yao Wang](https://github.com/kevinthesun)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
 

From 2d7d220d06fd576802e5484264a1215d6f2f6af0 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 16 Aug 2018 14:05:08 -0700
Subject: [PATCH 019/529] [RUNTIME] Enable return NDArray in RPC (#1610)

---
 include/tvm/runtime/ndarray.h             |  1 +
 src/api/api_base.cc                       |  8 +++
 src/runtime/rpc/rpc_module.cc             | 61 +++++++++++++++++++---
 src/runtime/rpc/rpc_session.cc            | 63 +++++++++++++++++++----
 src/runtime/rpc/rpc_session.h             |  1 +
 tests/python/unittest/test_runtime_rpc.py | 44 +++++++++++++++-
 6 files changed, 160 insertions(+), 18 deletions(-)

diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index d3ecce8ba9d0..c288ce5f3adb 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -246,6 +246,7 @@ struct NDArray::Container {
 
  private:
   friend class NDArray;
+  friend class RPCWrappedFunc;
   /*!
    * \brief The shape container,
    *  can be used used for shape data.
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 37970e69e24f..70301993ad3a 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -37,6 +37,14 @@ TVM_REGISTER_API("_nop")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
   });
 
+// internal fucntion used for debug and testing purposes
+TVM_REGISTER_API("_ndarray_use_count")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    runtime::NDArray nd = args[0];
+    // substract the current one
+    *ret = (nd.use_count() - 1);
+  });
+
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 251871bf0cc1..d6c56e1b7cf4 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -12,13 +12,13 @@ namespace tvm {
 namespace runtime {
 
 // Wrapped remote function to packed func.
-struct RPCWrappedFunc {
+class RPCWrappedFunc {
  public:
   RPCWrappedFunc(void* handle,
                  std::shared_ptr<RPCSession> sess)
       : handle_(handle), sess_(sess) {
     fwrap_ = PackedFunc([sess](TVMArgs args, TVMRetValue* rv) {
-        WrapRemote(sess, args.values[0].v_handle, args.type_codes[0], rv);
+        WrapRemote(sess, args, rv);
       });
   }
 
@@ -34,10 +34,47 @@ struct RPCWrappedFunc {
   }
 
   static void WrapRemote(std::shared_ptr<RPCSession> sess,
-                         void* handle,
-                         int tcode,
+                         TVMArgs args,
                          TVMRetValue* rv);
 
+  // deleter of RPC remote array
+  static void RemoteNDArrayDeleter(NDArray::Container* ptr) {
+    RemoteSpace* space = static_cast<RemoteSpace*>(ptr->dl_tensor.data);
+    space->sess->CallRemote(RPCCode::kNDArrayFree, ptr->manager_ctx);
+    delete space;
+    delete ptr;
+  }
+  // wrap return value as remote NDArray.
+  static NDArray WrapRemoteNDArray(std::shared_ptr<RPCSession> sess,
+                                   DLTensor* tensor,
+                                   void* nd_handle) {
+    NDArray::Container* data = new NDArray::Container();
+    data->manager_ctx = nd_handle;
+    data->deleter = RemoteNDArrayDeleter;
+    RemoteSpace* space = new RemoteSpace();
+    space->sess = sess;
+    space->data = tensor->data;
+    data->dl_tensor.data = space;
+    NDArray ret(data);
+    // RAII now in effect
+    data->shape_ = std::vector<int64_t>(
+        tensor->shape, tensor->shape + tensor->ndim);
+    data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
+    data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
+    // setup dtype
+    data->dl_tensor.dtype = tensor->dtype;
+    // setup ctx, encode as remote session
+    data->dl_tensor.ctx.device_id = tensor->ctx.device_id;
+    data->dl_tensor.ctx.device_type = static_cast<DLDeviceType>(
+        static_cast<int>(tensor->ctx.device_type) +
+        kRPCSessMask * (sess->table_index() + 1));
+    // check strides.
+    CHECK(tensor->strides == nullptr);
+    // setup byteoffset
+    data->dl_tensor.byte_offset = tensor->byte_offset;
+    return ret;
+  }
+
  private:
   PackedFunc fwrap_;
   void* handle_{nullptr};
@@ -126,20 +163,28 @@ class RPCModuleNode final : public ModuleNode {
 };
 
 void RPCWrappedFunc::WrapRemote(std::shared_ptr<RPCSession> sess,
-                                void* handle,
-                                int tcode,
+                                TVMArgs args,
                                 TVMRetValue *rv) {
+  void* handle = args.values[0].v_handle;
+  int tcode = args.type_codes[0];
+
   if (handle == nullptr) return;
   if (tcode == kFuncHandle) {
     auto wf = std::make_shared<RPCWrappedFunc>(handle, sess);
     *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) {
         return wf->operator()(args, rv);
       });
-  } else {
-    CHECK_EQ(tcode, kModuleHandle);
+  } else if (tcode == kModuleHandle) {
     std::shared_ptr<RPCModuleNode> n =
         std::make_shared<RPCModuleNode>(handle, sess);
     *rv = Module(n);
+  } else if (tcode == kArrayHandle || tcode == kNDArrayContainer) {
+    CHECK_EQ(args.size(), 2);
+    DLTensor* tensor = args[0];
+    void* nd_handle = args[1];
+    *rv = WrapRemoteNDArray(sess, tensor, nd_handle);
+  } else {
+    LOG(FATAL) << "Cannot wrap tcode=" << tcode;
   }
 }
 
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 21fff7b29882..6bb01b9bd459 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -130,19 +130,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
           break;
         }
         case kReturnReceived: {
-          CHECK_EQ(arg_buf_->value.size(), 1U);
+          CHECK_GE(arg_buf_->value.size(), 1U);
+
           TVMArgValue argv = arg_buf_->AsTVMArgs()[0];
           if (argv.type_code() == kFuncHandle ||
-              argv.type_code() == kModuleHandle) {
+              argv.type_code() == kModuleHandle ||
+              argv.type_code() == kArrayHandle) {
             CHECK(fwrap != nullptr) << "function/module wrapper not available";
             fwrap->CallPacked(arg_buf_->AsTVMArgs(), rv);
           } else {
+            CHECK_EQ(arg_buf_->value.size(), 1U);
             *rv = argv;
           }
           arg_buf_.reset();
           this->SwitchToState(kRecvCode);
           std::swap(client_mode_, client_mode);
-          return  RPCCode::kReturn;
+          return RPCCode::kReturn;
         }
         case kCopyAckReceived: {
           std::swap(client_mode_, client_mode);
@@ -172,15 +175,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
     ctx.device_type = static_cast<DLDeviceType>(dev_type % kRPCSessMask);
     return ctx;
   }
-  // send Packed sequence to writer.
-  void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) {
+  // Send Packed sequence to writer.
+  // return_ndarray is a special flag to handle returning of ndarray
+  //    In this case, we return the shape, context and data of the array,
+  //    as well as a customized PackedFunc that handles deletion of
+  //    the array in the remote.
+  void SendPackedSeq(const TVMValue* arg_values,
+                     const int* type_codes,
+                     int n,
+                     bool return_ndarray = false) {
     this->Write(n);
-    // only handles .
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
       if (tcode == kNDArrayContainer) tcode = kArrayHandle;
       this->Write(tcode);
     }
+
     // Argument packing.
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
@@ -215,9 +225,23 @@ class RPCSession::EventHandler : public dmlc::Stream {
         case kNDArrayContainer:
         case kArrayHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
-          TVMContext ctx = StripSessMask(arr->ctx);
-          uint64_t data = reinterpret_cast<uint64_t>(
-              static_cast<RemoteSpace*>(arr->data)->data);
+          TVMContext ctx;
+          uint64_t data;
+          if (!return_ndarray) {
+            // in the client mode
+            // ctx contains the remote table index
+            // the space is wrapped by an RemoteSpace
+            // that holds reference to the session.
+            ctx = StripSessMask(arr->ctx);
+            data = reinterpret_cast<uint64_t>(
+                static_cast<RemoteSpace*>(arr->data)->data);
+          } else {
+            // When we return NDArray, we directly return
+            // the space and the context
+            // The client will be further wrapping
+            ctx = arr->ctx;
+            data = reinterpret_cast<uint64_t>(arr->data);
+          }
           this->Write(data);
           this->Write(ctx);
           this->Write(arr->ndim);
@@ -701,6 +725,21 @@ class RPCSession::EventHandler : public dmlc::Stream {
               << "Only server can send function and module handle back.";
         rv.MoveToCHost(&ret_value, &ret_tcode);
         SendPackedSeq(&ret_value, &ret_tcode, 1);
+      } else if (rv.type_code() == kNDArrayContainer) {
+        // always send handle in 64 bit.
+        CHECK(!client_mode_)
+            << "Only server can send NDArray back";
+        // We follow a special protocol to return NDArray to client side
+        // The first pack value is the NDArray handle as DLTensor
+        // The second pack value is a customized deleter that deletes the NDArray.
+        TVMValue ret_value_pack[2];
+        int ret_tcode_pack[2];
+        rv.MoveToCHost(&ret_value_pack[0], &ret_tcode_pack[0]);
+
+        NDArray::Container* nd = static_cast<NDArray::Container*>(ret_value_pack[0].v_handle);
+        ret_value_pack[1].v_handle = nd;
+        ret_tcode_pack[1] = kHandle;
+        SendPackedSeq(ret_value_pack, ret_tcode_pack, 2, true);
       } else {
         ret_value = rv.value();
         ret_tcode = rv.type_code();
@@ -1090,6 +1129,11 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) {
   *rv = (*static_cast<Module*>(mhandle))->GetSource(fmt);
 }
 
+void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
+  void* handle = args[0];
+  static_cast<NDArray::Container*>(handle)->DecRef();
+}
+
 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
   PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
   void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
@@ -1138,6 +1182,7 @@ void RPCSession::EventHandler::HandlePackedCall() {
     case RPCCode::kModuleFree: CallHandler(RPCModuleFree); break;
     case RPCCode::kModuleGetFunc: CallHandler(RPCModuleGetFunc); break;
     case RPCCode::kModuleGetSource: CallHandler(RPCModuleGetSource); break;
+    case RPCCode::kNDArrayFree: CallHandler(RPCNDArrayFree); break;
     default: LOG(FATAL) << "Unknown event " << static_cast<int>(code_);
   }
   CHECK_EQ(state_, kRecvCode);
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 68f6763ae6db..4b736de0e041 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -48,6 +48,7 @@ enum class RPCCode : int {
   kModuleFree,
   kModuleGetFunc,
   kModuleGetSource,
+  kNDArrayFree
 };
 
 /*!
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index e7c0cc1bbabd..0de788068b6b 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -175,6 +175,7 @@ def test_rpc_return_func():
     @tvm.register_func("rpc.test.remote_func")
     def addone(x):
         return lambda y: x+y
+
     server = rpc.Server("localhost", key="x1")
     client = rpc.connect(server.host, server.port, key="x1")
     f1 = client.get_function("rpc.test.remote_func")
@@ -182,6 +183,46 @@ def addone(x):
     assert fadd(12) == 22
 
 
+def test_rpc_return_ndarray():
+    # Use closure to check the ref counter correctness
+    nd = tvm.nd.array(np.zeros(10).astype("float32"))
+    @tvm.register_func("rpc.test.remote_return_nd")
+    def my_module(name):
+        if name == "get_arr":
+            return lambda : nd
+        elif name == "ref_count":
+            return lambda : tvm._api_internal._ndarray_use_count(nd)
+        elif name == "get_elem":
+            return lambda idx: nd.asnumpy()[idx]
+        elif name == "get_arr_elem":
+            return lambda arr, idx: arr.asnumpy()[idx]
+
+    # start server
+    server = rpc.Server("localhost", key="x1")
+    client = rpc.connect(server.host, server.port, key="x1")
+    m = client.get_function("rpc.test.remote_return_nd")
+    get_arr = m("get_arr")
+    ref_count = m("ref_count")
+    get_elem = m("get_elem")
+    get_arr_elem = m("get_arr_elem")
+    # array test
+    def run_arr_test():
+        arr = get_arr()
+        assert ref_count() == 2
+        arr2 = get_arr()
+        assert ref_count() == 3
+        assert arr.context == client.cpu(0)
+        arr.copyfrom(np.ones(10).astype(arr.dtype))
+        assert arr2.asnumpy()[0] == 1.0
+        assert get_elem(0) == 1.0
+        assert get_arr_elem(arr2, 0) == 1.0
+
+    assert ref_count() == 1
+    run_arr_test()
+    # check recycle correctness
+    assert ref_count() == 1
+
+
 def test_local_func():
     @tvm.register_func("rpc.test.remote_func2")
     def addone(x):
@@ -199,9 +240,10 @@ def addone(x):
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
+    test_rpc_return_ndarray()
+    test_rpc_return_func()
     test_bigendian_rpc()
     test_rpc_remote_module()
-    test_rpc_return_func()
     test_rpc_file_exchange()
     test_rpc_array()
     test_rpc_simple()

From 5435def2bfa2594d704a2f5ad005880fc5c72779 Mon Sep 17 00:00:00 2001
From: Keren Zhou <robinho364@gmail.com>
Date: Thu, 16 Aug 2018 16:35:23 -0700
Subject: [PATCH 020/529] [NNVM] Add ONNX upsample converter (#1591)

---
 nnvm/python/nnvm/frontend/onnx.py             | 19 ++++++++
 .../python/frontend/onnx/test_forward.py      | 47 +++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index fa26648b293a..f62202a37dff 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -406,6 +406,24 @@ def _impl(inputs, attr, params):
     return _impl
 
 
+class Upsample(OnnxOpConverter):
+    """ Operator converter for Upsample (nearest mode).
+    """
+
+    @classmethod
+    def _impl_v7(cls, inputs, attr, params):
+        scales = attr.get('scales')
+        assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3]
+        mode = attr.get('mode')
+        if mode == b'nearest':
+            method = "NEAREST_NEIGHBOR"
+        elif mode == b'linear':
+            method = "BILINEAR"
+        else:
+            raise ValueError("Invalid ONNX upsample mode: {}".format(mode))
+        return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW')
+
+
 class Shape(OnnxOpConverter):
     """ Operator converter for Shape.
     """
@@ -540,6 +558,7 @@ def _get_convert_map(opset):
         # 'Crop'
         # 'Embedding'
         # 'Upsample'
+        'Upsample' : Upsample.get_converter(opset),
         'SpatialBN': BatchNorm.get_converter(opset),
 
         # defs/generator
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 9fb3aed2da10..3f2fbb144289 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -1,6 +1,8 @@
 import numpy as np
 import math
 import nnvm
+import topi
+import topi.testing
 import tvm
 from tvm.contrib import graph_runtime
 from nnvm.testing.config import ctx_list
@@ -380,6 +382,50 @@ def test_lrn():
     verify_lrn((5, 5, 5, 5), 3, 'float32')
     verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0)
 
+def _test_upsample_nearest():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0])
+    
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.upsampling_python(in_array, scale, "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_nearest_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_nearest_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        np.testing.assert_allclose(out_array, tvm_out)
+
+def _test_upsample_bilinear():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0])
+    
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_bilinear_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_bilinear_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        np.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_upsample():
+    _test_upsample_nearest()
+    _test_upsample_bilinear()
+
 
 if __name__ == '__main__':
     # verify_super_resolution_example()
@@ -398,3 +444,4 @@ def test_lrn():
     test_matmul()
     test_gather()
     test_lrn()
+    test_upsample()

From 1d7ef11f577fa876d2823e52283f92e973ccbd4a Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 16 Aug 2018 16:37:03 -0700
Subject: [PATCH 021/529] add -mattr=+neon for all arm cpu target (#1612)

---
 apps/benchmark/README.md               | 11 +++++++----
 python/tvm/target.py                   | 16 ++++++++--------
 tutorials/nnvm/deploy_model_on_rasp.py |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index e83e47c46eb7..ee22f90dc435 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -63,8 +63,11 @@ python3 -m tvm.exec.rpc_tracker
   python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro  
   ```
 
-  If your device has a same SoC of the above device, you can reuse these parameters
-  (e.g. use `llvm -device=arm_cpu -mode=rk3399 -target=aarch64-linux-gnu` as target).
-  Otherwise, you need to tune for your own device, please follow this 
-  [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
+  If your device has a same or similar SoC of the above devices, you can reuse these parameters.
+  For example, if your SoC is similar to rasp3b, use
+  ```bash
+  python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key your_custom_key
+  ```
+  For other devices, to get the best performance, it is recommended that you tune your network by yourself. 
+  Please follow this [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
 
diff --git a/python/tvm/target.py b/python/tvm/target.py
index fed20c3914c6..e2d780f75264 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -427,14 +427,14 @@ def arm_cpu(model='unknown', options=None):
     from . import autotvm
 
     trans_table = {
-        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android"],
-        "mate10":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20":       ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"],
-        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu"],
-        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi"],
+        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20":       ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"],
+        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"],
+        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"],
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index c11f202c1251..40dbdaeb00ee 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -154,7 +154,7 @@ def transform_image(image):
 else:
     target = tvm.target.arm_cpu('rasp3b')
     # The above line is a simple form of
-    # target = tvm.target.create('llvm -devcie=arm_cpu -target=armv7l-linux-gnueabihf')
+    # target = tvm.target.create('llvm -devcie=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
 
 with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
     graph, lib, params = nnvm.compiler.build(

From 5c5ad860fdafb38a55555d8d0775b0d049adb0da Mon Sep 17 00:00:00 2001
From: Ashok Emani <ashok.emani@intel.com>
Date: Thu, 16 Aug 2018 19:19:43 -0700
Subject: [PATCH 022/529] fix output_shape in conv2d_nchw (#1613)

---
 topi/include/topi/nn.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index 53b899796e37..4a537a646425 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -265,7 +265,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I,
   auto pW = I->shape[3];
   tvm::Array<tvm::Expr> output_shape{
       I->shape[0],                                            // B
-      W->shape[1],                                            // O
+      W->shape[0],                                            // O
       (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
       (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
   };

From 6bc2b63e757b39d0910aabf960cb242b6252308f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 17 Aug 2018 11:37:56 +0900
Subject: [PATCH 023/529] [NNVM] Bug fix Prevent fusing convolution with
 injective op  (#1608)

---
 nnvm/src/compiler/graph_fuse.cc              | 31 +++++++++++++++++-
 nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 52a8ae44f8ee..f65312be1a29 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -63,12 +63,16 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       // Check if we can fuse to the master.
       int chosen_master = -1;
       bool ewise = inode.source->num_outputs() == 1;
+      bool mark_as_injective = false;
       for (const auto& e : inode.inputs) {
         if (fuse_vec[e.node_id] == FuseRule::kUknown) {
           TOpPattern ipt = pattern_vec[e.node_id];
           if (ipt != kElemWise) ewise = false;
-          if (ipt <= kInjective) {
+          if (ipt <= kBroadcast) {
+            fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+          } else if (ipt == kInjective) {
             fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+            mark_as_injective = true;
           } else if (ipt == kOutEWiseFusable &&
                      chosen_master == -1 &&
                      shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) {
@@ -87,6 +91,8 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       master_vec[nid] = chosen_master;
       if (chosen_master != -1) {
         pt = kOutEWiseFusable;
+      } else if (mark_as_injective) {
+        pt = kInjective;
       } else {
         pt = ewise ? kElemWise : kBroadcast;
       }
@@ -135,8 +141,31 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
     if (group_vec[nid] == -1) {
       group_vec[nid] = nid;
     }
+
+    // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group.
+    bool parent_out_ewise = false;
+    bool parent_injective = false;
+    for (const auto& e : inode.inputs) {
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (pt == kOutEWiseFusable) {
+        parent_out_ewise = true;
+      } else if (pt == kInjective) {
+        parent_injective = true;
+      }
+    }
+    // Change the master node from out_ewise_fusable op to itself
+    if (parent_injective && parent_out_ewise) master_vec[nid] = nid;
+
     // Propagate the group id.
     for (const auto& e : inode.inputs) {
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (parent_out_ewise && parent_injective) {
+        if (pt == kOutEWiseFusable) {
+          continue;  // Do not fuse out_ewise_fusable op
+        } else if (pt == kInjective) {
+          master_vec[e.node_id] = nid;
+        }
+      }
       if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
         CHECK(group_vec[e.node_id] == -1||
               group_vec[e.node_id] == group_vec[nid]);
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 8d05ae02c579..5f4da3865a45 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -77,6 +77,39 @@ def test_injective_reduce_injective():
         np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
+def test_injective_conv2d():
+    channels = 16
+    data = sym.Variable(name="data")
+    pool = sym.global_avg_pool2d(data=data)
+    weight = sym.reshape(pool, shape=[1, channels, 1, 1])
+    residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1),
+                          layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv")
+    net = weight * data + residual
+    size = 56
+    dtype="float32"
+    dshape = (1, channels, size, size)
+    kshape = (channels, channels, 3, 3)
+    oshape = dshape
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, global_avg_pool, conv weight, conv op, fused elemwise add
+        assert graph.index.num_nodes == 5
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+        residual = topi.testing.conv2d_nchw_python(
+            data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
+        weight = np.mean(data.asnumpy(), axis=(2, 3))
+        c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual
+        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+
+
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
     with nnvm.compiler.build_config(opt_level=opt_level):
         graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
@@ -123,3 +156,4 @@ def get_sym(out_channel):
     test_ewise_injective()
     test_conv_ewise_injective()
     test_fuse_conv2d_elu()
+    test_injective_conv2d()

From e3d094d92f1402359fe9e639c09c89781a5c7475 Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Sat, 18 Aug 2018 07:40:52 +0300
Subject: [PATCH 024/529] [NNVM] TF: Add Pack operation (#1570)

---
 nnvm/include/nnvm/top/tensor.h                |  2 +-
 nnvm/python/nnvm/frontend/tensorflow.py       |  9 ++++++
 nnvm/src/top/tensor/transform.cc              | 19 ++++++------
 .../frontend/tensorflow/test_forward.py       | 29 ++++++++++++++++++-
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h
index 22ee9d7118e6..53ed5b3b0a22 100644
--- a/nnvm/include/nnvm/top/tensor.h
+++ b/nnvm/include/nnvm/top/tensor.h
@@ -16,7 +16,7 @@ namespace top {
 struct ConcatenateParam : public dmlc::Parameter<ConcatenateParam> {
   int axis;
   DMLC_DECLARE_PARAMETER(ConcatenateParam) {
-    DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1)
+    DMLC_DECLARE_FIELD(axis).set_default(1)
     .describe("the axis to be concated.");
   }
 };
diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index d761e34c7c59..092b8fa20219 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -339,6 +339,14 @@ def _impl(inputs, attr, params):
             extras={'axis': axis.asnumpy()[0]})(inputs, attr)
     return _impl
 
+def _pack():
+    def _impl(inputs, attr, params):
+        axis = int(attr["axis"])
+        inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs]
+        return _sym.concatenate(*inputs_reshaped, axis=axis)
+
+    return _impl
+
 def _reshape():
     def _impl(inputs, attr, params):
         try:
@@ -673,6 +681,7 @@ def _impl(inputs, attr, params):
     'Minimum'                           : _elemwise('min'),
     'Sum'                               : _sum(),
     'Square'                            : _square(),
+    'Pack'                              : _pack(),
     'Relu'                              : AttrCvt('relu'),
     'Reshape'                           : _reshape(),
     'ResizeBilinear'                    : _resize_bilinear(),
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 78255d20f040..52dca5654838 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -93,23 +93,24 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
   TShape dshape;
   dim_t size = 0;
   bool has_zero = false;
+  int axis = param.axis >= 0 ? param.axis : in_shape->at(0).ndim() + param.axis;
   for (size_t i = 0; i < in_shape->size(); ++i) {
     TShape tmp = (*in_shape)[i];
     if (tmp.ndim()) {
-      CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-          << "concat dim " << param.axis << " out of range of input shape " << tmp;
-      has_zero = tmp[param.axis] == 0 || has_zero;
-      size += tmp[param.axis];
-      tmp[param.axis] = 0;
+      CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+          << "concat dim " << axis << " out of range of input shape " << tmp;
+      has_zero = tmp[axis] == 0 || has_zero;
+      size += tmp[axis];
+      tmp[axis] = 0;
       shape_assign(&dshape, tmp);
     }
   }
 
   TShape tmp = (*out_shape)[0];
   if (tmp.ndim()) {
-    CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-        << "concat dim " << param.axis << " out of range of input shape " << tmp;
-    tmp[param.axis] = 0;
+    CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+        << "concat dim " << axis << " out of range of input shape " << tmp;
+    tmp[axis] = 0;
     shape_assign(&dshape, tmp);
   }
 
@@ -119,7 +120,7 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
     NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, i, dshape);
   }
 
-  if (!has_zero) dshape[param.axis] = size;
+  if (!has_zero) dshape[axis] = size;
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, dshape);
   return dshape.Size() != 0;
 }
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 64c57c126f8d..6fa020a03444 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -342,7 +342,7 @@ def _test_argx(func, data, **kwargs):
 
         compare_tf_with_tvm(data, 'c0:0', 'argx0:0')
 
-def test_argmin_argmax():
+def test_forward_argminmax():
     for axis in [None,0,1,2]:
         data = np.random.uniform(size=(8,4,9)).astype('float32')
         _test_argx(tf.argmax, data=data, axis=axis)
@@ -555,6 +555,31 @@ def test_forward_lstm():
 
     _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
+
+
+#######################################################################
+# Pack
+# ---
+def _test_pack(axis, shape, **kwargs):
+
+    a = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+    b = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+
+    with tf.Graph().as_default():
+        tf_a = array_ops.placeholder(shape=shape, dtype='float32', name='pl_a')
+        tf_b = array_ops.placeholder(shape=shape, dtype='float32', name='pl_b')
+        tf_c = tf.stack([tf_a,tf_b], axis=axis, **kwargs)
+        assert tf_c.op.op_def.name == 'Pack', "tf.stack() is expected to produce 'Pack' operation"
+
+        compare_tf_with_tvm([a,b], ['pl_a:0','pl_b:0'], 'stack:0')
+
+def test_forward_pack():
+    for axis in range(-3,3):
+        _test_pack(axis, [3,2,1])
+    for axis in range(-1,1):
+        _test_pack(axis, [3])
+    _test_pack(0, [])
+
 #######################################################################
 # Pad
 # ---
@@ -818,9 +843,11 @@ def test_forward_l2_normalize():
     test_forward_reshape()
     test_forward_squeeze()
     test_forward_sigmoid()
+    test_forward_argminmax()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
     test_forward_multi_input()
+    test_forward_pack()
     test_forward_inception_v3()
     test_forward_inception_v1()
     test_forward_mobilenet()

From d7df07fffc2fbe5d8fc60d23d030749f3bce4094 Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Sun, 19 Aug 2018 02:18:29 +0800
Subject: [PATCH 025/529] #1592 [PASS] Fix missing mem CHECK in storage_rewrite
 (#1616)

---
 src/pass/storage_rewrite.cc                   |  6 ++
 .../unittest/test_pass_storage_rewrite.py     | 63 ++++++++++++-------
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 0170499e1491..877216ed7656 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -584,6 +584,12 @@ class StoragePlanRewriter : public IRMutator {
           e->new_alloc = Allocate::make(
               e->alloc_var, alloc_type, {combo_size}, const_true(),
               Evaluate::make(0));
+          if (e->scope.tag.length() != 0) {
+            MemoryInfo info = GetMemoryInfo(e->scope.to_string());
+            uint64_t total_elem = e->const_nbits / e->elem_type.bits();
+            CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
+                << "Allocation exceed bound of memory tag " << e->scope.to_string();
+          }
         }
       }
     }
diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py
index 2bb02998982f..3c07a1f26aff 100644
--- a/tests/python/unittest/test_pass_storage_rewrite.py
+++ b/tests/python/unittest/test_pass_storage_rewrite.py
@@ -28,15 +28,30 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 1
 
+def register_mem(scope_tb, max_bits):
+    #Register mem
+    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
+    def mem_info_inp_buffer():
+        return tvm.make.node("MemoryInfo",
+                        unit_bits= 16,
+                        max_simd_bits=32,
+                        max_num_bits=max_bits,
+                        head_address=None)
+
 def test_alloc_seq():
+    scope_tb = "local.L0A"
+    max_bits = 1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="B", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="B", scope=scope_tb)
             A[j] = 1.3
 
     body = ib.get()
@@ -233,16 +248,9 @@ def test_parallel_alloc():
 
     assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
 
-def test_inplace_rule2():
+def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024):
     #Test Buffer
-    scope_tb = "local_TB2"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    register_mem(scope_tb, max_bits)
     m = 10
     A = tvm.placeholder((m,), name='A')
     C = tvm.placeholder((m,), name='C')
@@ -275,16 +283,23 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 2
 
+def test_exceed_mem():
+    max_bits = 639
+    # The critical max_num_bits is between 639 and 640
+    loc = -1
+    try:
+        test_inplace_rule2("local_TEM", max_bits)
+    except Exception as e:
+        estr = str(e)
+        loc = estr.find('Allocation exceed bound of memory')
+        assert loc != -1
+
 def test_inplace_rule3():
     #Test Buffer
     scope_tb = "local_TB3"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
     m = 10
     B0 = tvm.placeholder((m,), name='B0')
     B1 = tvm.placeholder((m,), name='B1')
@@ -388,17 +403,22 @@ def verify(n):
     assert num_alloc[0] == 1
 
 def test_alloc_seq_type2():
+    scope_tb = "local.L0A2"
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 20, name="j") as j:
-            B = ib.allocate("int16", 400, name="B", scope="local.L0A")
+            B = ib.allocate("int16", 400, name="B", scope=scope_tb)
             B[j] = tvm.const(1, "int16")
         with ib.for_range(0, 10, name="j") as j:
-            C = ib.allocate("float32", 200, name="C", scope="local.L0A")
+            C = ib.allocate("float32", 200, name="C", scope=scope_tb)
             C[j] = 1.2
 
     body = ib.get()
@@ -465,6 +485,7 @@ def test_replace_dataflow():
     test_storage_combine()
     test_storage_share_gpu()
     test_inplace_rule2()
+    test_exceed_mem()
     test_inplace_rule3()
     test_alloc_seq_type()
     test_alloc_seq_type2()

From 566f18544875ace6e2597693af7f44d4857aa30e Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sat, 18 Aug 2018 23:50:58 +0530
Subject: [PATCH 026/529] =?UTF-8?q?[FRONTEND][COREML]MultiplyLayerParams?=
 =?UTF-8?q?=20L2NormalizeLayerParams=20and=20UpsampleLayerParams=20support?=
 =?UTF-8?q?=20=E2=80=A6=20(#1511)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 nnvm/python/nnvm/frontend/coreml.py           |  22 ++
 .../python/frontend/coreml/test_forward.py    | 190 +++++++++++++++++-
 2 files changed, 211 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index e80cfe23f220..3ca76bb0b20e 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -217,6 +217,16 @@ def AddLayerParams(op, insyms, symtab):
         ret = _sym.__add_scalar__(ret, scalar=op.alpha)
     return ret
 
+def MultiplyLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list):
+        insyms = [insyms]
+    ret = insyms[0]
+    for i in range(1, len(insyms)):
+        ret = _sym.elemwise_mul(ret, insyms[i])
+    if op.alpha != 1:
+        ret = _sym.__mul_scalar__(ret, scalar=op.alpha)
+    return ret
+
 def ConcatLayerParams(op, insyms, symtab):
     if not isinstance(insyms, list):
         insyms = [insyms]
@@ -249,6 +259,15 @@ def PermuteLayerParams(op, insym, symtab):
     axes = tuple(op.axis)
     return _sym.transpose(insym, axes=axes)
 
+def UpsampleLayerParams(op, insym, symtab):
+    if op.scalingFactor[0] != op.scalingFactor[1]:
+        raise NotImplementedError("Upsampling only supported with same \
+            height and width scaling factor.")
+    interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR'
+    return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode)
+
+def L2NormalizeLayerParams(op, insym, symtab):
+    return _sym.l2_normalize(insym, eps=op.epsilon, axis=1)
 
 _convert_map = {
     'NeuralNetworkMeanImage': NeuralNetworkMeanImage,
@@ -261,10 +280,13 @@ def PermuteLayerParams(op, insym, symtab):
     'SoftmaxLayerParams':SoftmaxLayerParams,
     'InnerProductLayerParams':InnerProductLayerParams,
     'AddLayerParams':AddLayerParams,
+    'MultiplyLayerParams':MultiplyLayerParams,
     'FlattenLayerParams':FlattenLayerParams,
     'ConcatLayerParams':ConcatLayerParams,
     'PaddingLayerParams':PaddingLayerParams,
     'PermuteLayerParams':PermuteLayerParams,
+    'UpsampleLayerParams':UpsampleLayerParams,
+    'L2NormalizeLayerParams':L2NormalizeLayerParams
 }
 
 def coreml_op_to_nnvm(op, inname, outname, symtab):
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index d5c460e56987..27ae28c20ab9 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -1,8 +1,12 @@
 import numpy as np
 
-import topi
+from coremltools.models.neural_network import NeuralNetworkBuilder
+from coremltools.models import datatypes
+
 import tvm
 from tvm.contrib import graph_runtime
+import topi
+import topi.testing
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
@@ -40,6 +44,190 @@ def test_resnet50_checkonly():
     model_file = model_zoo.get_resnet50()
     test_model_checkonly(model_file, 'resnet50')
 
+def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'):
+    """ Generic function to compile on nnvm and execute on tvm """
+
+    sym, params = nnvm.frontend.from_coreml(graph_def)
+    target = 'llvm'
+    if isinstance(input_data, list):
+        shape_dict = {}
+        dtype_dict = {}
+        for i, e in enumerate(input_name):
+            shape_dict[e] = input_data[i].shape
+            dtype_dict[e] = input_data[i].dtype
+    else:
+        shape_dict = {input_name: input_data.shape}
+        dtype_dict = {input_name: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+                                             dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)
+    from tvm.contrib import graph_runtime
+    m = graph_runtime.create(graph, lib, ctx)
+    # set inputs
+    if isinstance(input_data, list):
+        for i, e in enumerate(input_name):
+            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
+    else:
+        m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype)))
+
+    m.set_input(**params)
+    # execute
+    m.run()
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        return tvm_output.asnumpy()
+
+def verify_AddLayerParams(input_dim, alpha=2):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.add(a_np1, a_np2) + alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Add',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='ADD')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_AddLayerParams():
+    verify_AddLayerParams((1, 2, 2), 0)
+    verify_AddLayerParams((1, 2, 2), 1)
+    verify_AddLayerParams((1, 3, 3), 2)
+
+def verify_MultiplyLayerParams(input_dim, alpha):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.multiply(a_np1, a_np2) * alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Mul',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='MULTIPLY')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_MultiplyLayerParams():
+    verify_MultiplyLayerParams((1, 2, 2), 0)
+    verify_MultiplyLayerParams((1, 2, 2), 1)
+    verify_MultiplyLayerParams((1, 3, 3), 2)
+
+def verify_ConcatLayerParams(input1_dim, input2_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input1_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input2_dim).astype(dtype)
+
+    b_np = np.concatenate((a_np1, a_np2), axis=1)
+    inputs = [('input1', datatypes.Array(*input1_dim)),
+              ('input2', datatypes.Array(*input2_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Concate',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='CONCAT')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_ConcatLayerParams():
+    verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2))
+    verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4))
+
+def verify_UpsampleLayerParams(input_dim, scale, mode):
+    dtype = "float32"
+
+    a_np = np.full(input_dim, 1, dtype=dtype)
+    if mode == 'NN':
+        b_np = topi.testing.upsampling_python(a_np, scale)
+    else:
+        new_h = input_dim[2] * scale
+        new_w = input_dim[3] * scale
+        b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW')
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_upsample(name='Upsample',
+                         scaling_factor_h=scale,
+                         scaling_factor_w=scale,
+                         mode=mode,
+                         input_name='input',
+                         output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_UpsampleLayerParams():
+    verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN')
+    verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR')
+
+def verify_l2_normalize(input_dim, eps):
+    dtype = "float32"
+
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.l2_normalize_python(a_np, eps, 1)
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_l2_normalize():
+    verify_l2_normalize((1, 3, 20, 20), 0.001)
+
 if __name__ == '__main__':
     test_mobilenet_checkonly()
     test_resnet50_checkonly()
+    test_forward_AddLayerParams()
+    test_forward_ConcatLayerParams()
+    test_forward_MultiplyLayerParams()
+    test_forward_UpsampleLayerParams()
+    test_forward_l2_normalize()

From 026c2626ce7200a5097205810f2db681d4cc9939 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Sun, 19 Aug 2018 19:29:35 -0700
Subject: [PATCH 027/529] fix import (#1621)

---
 nnvm/python/nnvm/testing/yolo2_detection.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo2_detection.py
index b7744c45cff4..0b229149b8ea 100644
--- a/nnvm/python/nnvm/testing/yolo2_detection.py
+++ b/nnvm/python/nnvm/testing/yolo2_detection.py
@@ -10,9 +10,6 @@
 import math
 from collections import namedtuple
 import numpy as np
-from PIL import Image
-from PIL import ImageDraw
-from PIL import ImageFont
 
 def _entry_index(batch, w, h, outputs, classes, coords, location, entry):
     n = int(location/(w*h))
@@ -186,6 +183,10 @@ def _draw_label(im, r, c, label, rgb):
                         _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val)
 
 def _get_label(labelstr, rgb):
+    from PIL import Image
+    from PIL import ImageDraw
+    from PIL import ImageFont
+
     text = labelstr
     colorText = "black"
     testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1)))

From d6d97e8772e4e5b7982688eeffd8b930f5c4fcce Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 20 Aug 2018 11:30:01 +0900
Subject: [PATCH 028/529] Add missing check when deciding conv op and injective
 op are in the same group (#1622)

---
 nnvm/src/compiler/graph_fuse.cc              |  1 +
 nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index f65312be1a29..4999d93d1861 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -146,6 +146,7 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
     bool parent_out_ewise = false;
     bool parent_injective = false;
     for (const auto& e : inode.inputs) {
+      if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue;
       TOpPattern pt = pattern_vec[e.node_id];
       if (pt == kOutEWiseFusable) {
         parent_out_ewise = true;
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 5f4da3865a45..0c81ac890d55 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -110,6 +110,39 @@ def test_injective_conv2d():
         np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
+def test_concatenate_conv2d():
+    ch = 3
+    size = 8
+    data = sym.Variable(name="data")
+    concat = sym.concatenate(data, data, axis=1)
+    conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv")
+    net = sym.elemwise_add(concat, conv)
+
+    dtype="float32"
+    dshape = (1, ch, size, size)
+    kshape = (ch*2, ch*2, 1, 1)
+    oshape = (1, ch*2, size, size)
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, conv weight, conv op, concat
+        assert graph.index.num_nodes == 4
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+
+        concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1)
+        conv = topi.testing.conv2d_nchw_python(
+            concat, kernel.asnumpy(), (1,1), 'SAME')
+        ref = concat + conv
+        np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+
+
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
     with nnvm.compiler.build_config(opt_level=opt_level):
         graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
@@ -157,3 +190,4 @@ def get_sym(out_channel):
     test_conv_ewise_injective()
     test_fuse_conv2d_elu()
     test_injective_conv2d()
+    test_concatenate_conv2d()

From 826de7b833af5e455649dae1fd58015e89e0281d Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 20 Aug 2018 09:20:19 -0700
Subject: [PATCH 029/529] [NODEREF] Introduce named attribute system. (#1618)

---
 include/tvm/attrs.h                           | 593 ++++++++++++++++++
 include/tvm/runtime/packed_func.h             |   8 +
 python/tvm/make.py                            |  11 +
 src/api/api_base.cc                           |  12 -
 src/api/api_test.cc                           |  46 ++
 src/api/dsl_api.cc                            |  37 +-
 src/lang/attrs.cc                             |  45 ++
 src/lang/reflection.cc                        |  42 +-
 tests/cpp/attrs_test.cc                       |  76 +++
 tests/python/unittest/test_lang_reflection.py |  26 +
 10 files changed, 861 insertions(+), 35 deletions(-)
 create mode 100644 include/tvm/attrs.h
 create mode 100644 src/api/api_test.cc
 create mode 100644 src/lang/attrs.cc
 create mode 100644 tests/cpp/attrs_test.cc

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
new file mode 100644
index 000000000000..aed6b1ff722f
--- /dev/null
+++ b/include/tvm/attrs.h
@@ -0,0 +1,593 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/attrs.h
+ * \brief TVM attribute module
+ *
+ *  This module enables declaration of named attributes
+ *  which support default value setup and bound checking.
+ *
+ * \code
+ *   struct MyAttrs : public tvm::AttrsNode<MyAttrs> {
+ *     float learning_rate;
+ *     int num_hidden;
+ *     std::string name;
+ *     // declare attribute fields in header file
+ *     TVM_DECLARE_ATTRS(MyAttrs, "attrs.MyAttrs") {
+ *       TVM_ATTR_FIELD(num_hidden).set_lower_bound(1);
+ *       TVM_ATTR_FIELD(learning_rate).set_default(0.01f);
+ *       TVM_ATTR_FIELD(name).set_default("hello");
+ *     }
+ *   };
+ *   // register it in cc file
+ *   TVM_REGISTER_NODE_TYPE(MyAttrs);
+ * \endcode
+ *
+ * \sa AttrsNode, TVM_DECLARE_ATTRS, TVM_ATTR_FIELD
+ */
+#ifndef TVM_ATTRS_H_
+#define TVM_ATTRS_H_
+
+#include <unordered_map>
+#include <vector>
+#include <type_traits>
+#include <string>
+#include "./ir.h"
+#include "./base.h"
+#include "./packed_func_ext.h"
+
+namespace tvm {
+/*!
+ * \brief Declare an attribute function.
+ * \param ClassName The name of the class.
+ * \param TypeKey The type key to be used by the TVM node system.
+ */
+#define TVM_DECLARE_ATTRS(ClassName, TypeKey)                   \
+  static constexpr const char* _type_key = TypeKey;             \
+  TVM_DECLARE_NODE_TYPE_INFO(ClassName, ::tvm::BaseAttrsNode);  \
+  template<typename FVisit>                                     \
+  void __VisitAttrs__(FVisit& __fvisit__)  // NOLINT(*)
+
+
+/*!
+ * \brief Declare an attribute field.
+ * \param FieldName The field name.
+ */
+#define TVM_ATTR_FIELD(FieldName) \
+  __fvisit__(#FieldName, &FieldName)
+
+
+/*! \brief Error thrown during attribute checking. */
+struct AttrError : public dmlc::Error {
+  /*!
+   * \brief constructor
+   * \param msg error message
+   */
+  explicit AttrError(const std::string &msg)
+      : dmlc::Error(msg) {}
+};
+
+/*!
+ * \brief Information about attribute fields in string representations.
+ */
+struct AttrFieldInfo {
+  /*! \brief name of the field */
+  std::string name;
+  /*! \brief type docstring information in str. */
+  std::string type_info;
+  /*! \brief detailed description of the type */
+  std::string description;
+};
+
+/*!
+ * \brief Base class of all attribute class
+ * \note Do not subclass AttrBaseNode directly,
+ *       subclass AttrsNode instead.
+ * \sa AttrsNode
+ */
+class BaseAttrsNode : public Node {
+ public:
+  using TVMArgs = runtime::TVMArgs;
+  using TVMRetValue = runtime::TVMRetValue;
+  /*!
+   * \brief Initialize the attributes by sequence of arguments
+   * \param args The postional arguments in the form
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   */
+  template<typename... Args>
+  inline void InitBySeq(Args&& ...args);
+  /*!
+   * \brief Print readible docstring to ostream, add newline.
+   * \param os the stream to print the docstring to.
+   */
+  inline void PrintDocString(std::ostream &os) const;  // NOLINT(*)
+  /*!
+   * \brief Get the field information about the
+   * \note This function throws when the required a field is not present.
+   */
+  TVM_DLL virtual std::vector<AttrFieldInfo> ListFieldInfo() const = 0;
+  /*!
+   * \brief Initialize the attributes by arguments.
+   * \param kwargs The key value pairs for initialization.
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   * \param allow_unknown Whether allow additional unknown fields.
+   * \note This function throws when the required a field is not present.
+   */
+  TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0;
+
+  static constexpr const char* _type_key = "Attrs";
+  TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node);
+};
+
+/*! \brief Base attribute container for all attributes */
+class Attrs : public NodeRef {
+ public:
+  // normal constructor
+  Attrs() {}
+  // construct from shared ptr.
+  explicit Attrs(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+  /*! \return The attribute node */
+  const BaseAttrsNode* operator->() const {
+    return ptr();
+  }
+  /*! \brief specify container node */
+  using ContainerType = BaseAttrsNode;
+
+ private:
+  /*! \return the internal attribute node */
+  const BaseAttrsNode* ptr() const {
+    return static_cast<const BaseAttrsNode*>(node_.get());
+  }
+};
+
+/*!
+ * \brief Specialized attribute type that is backed by a map.
+ *  The DictAttrsNode implements the Attrs behavior,
+ *  its fields are directly accessible via object.field_name
+ *  like other normal nodes.
+ */
+class DictAttrsNode : public BaseAttrsNode {
+ public:
+  /*! \brief internal attrs map */
+  Map<std::string, NodeRef> dict;
+  /*!
+   * \brief Consruct a Attrs backed by DictAttrsNode.
+   * \param dict The attributes.
+   * \return The dict attributes.
+   */
+  TVM_DLL static Attrs make(Map<std::string, NodeRef> dict);
+  // implementations
+  void VisitAttrs(AttrVisitor* v) final;
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
+  std::vector<AttrFieldInfo> ListFieldInfo() const final;
+  // type info
+  static constexpr const char* _type_key = "DictAttrs";
+  TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
+};
+
+// Namespace containing detail implementations
+namespace detail {
+using runtime::TVMArgValue;
+
+// helper entry that does nothing in set_default/bound/describe calls.
+struct AttrNopEntry {
+  using TSelf = AttrNopEntry;
+
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+    return *this;
+  }
+};
+
+// Wrapper for normal visitor.
+class AttrNormalVisitor {
+ public:
+  explicit AttrNormalVisitor(AttrVisitor* visitor)
+      : visitor_(visitor) {
+  }
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* value) {
+    visitor_->Visit(key, value);
+    return AttrNopEntry();
+  }
+
+ private:
+  AttrVisitor* visitor_;
+};
+
+// helper entry that does initialization, set default.
+template<typename T>
+struct AttrInitEntry {
+  // The attributes
+  using TSelf = AttrInitEntry<T>;
+  // The type key
+  const char* type_key_;
+  // field name
+  const char* key_;
+  // internal value.
+  T* value_;
+  // whether the value is missing.
+  bool value_missing_{true};
+  // If the value is still missing in destruction time throw an error.
+  ~AttrInitEntry() DMLC_THROW_EXCEPTION {
+    if (value_missing_) {
+      std::ostringstream os;
+      os << type_key_ << ": Cannot find required field \'" << key_
+         << "\' during initialization";
+      throw AttrError(os.str());
+    }
+  }
+  // override fields.
+  // This function sets the lower bound of the attribute
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (begin > val) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is smaller than the lower bound " << begin;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // This function sets the upper bound of the attribute
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (val > end) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is bigger than the upper bound " << end;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // set default when
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+    if (!value_missing_) return *this;
+    *value_ = value;
+    value_missing_ = false;
+    return *this;
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+};
+
+// Template function to allow smart conversion
+// from Expr types into the constants.
+template<typename T>
+inline void SetValue(T* ptr, const TVMArgValue& val) {
+  *ptr = val.operator T();
+}
+template<typename T>
+inline void SetIntValue(T* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLInt) {
+    *ptr = static_cast<T>(val.value().v_int64);
+  } else {
+    Expr expr = val;
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else {
+      LOG(FATAL) << "Expect int value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<std::string>(std::string* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kStr) {
+    *ptr = val.operator std::string();
+  } else {
+    Expr expr = val;
+    const ir::StringImm* op = expr.as<ir::StringImm>();
+    CHECK(op != nullptr);
+    *ptr = op->value;
+  }
+}
+template<>
+inline void SetValue<double>(double* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLFloat || val.type_code() == kDLInt) {
+    *ptr = val.operator double();
+  } else {
+    Expr expr = val;
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else {
+      LOG(FATAL) << "Expect float value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<int>(int* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<int64_t>(int64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<uint64_t>(uint64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<bool>(bool* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+
+// Visitor for value initialization
+template<typename FFind>
+class AttrInitVisitor {
+ public:
+  // Counter of number of matched attributes during visit.
+  // This is used to decide if there is additional unmatched attributes.
+  size_t hit_count_{0};
+  // constructor
+  AttrInitVisitor(const char* type_key, FFind ffind)
+      : type_key_(type_key), ffind_(ffind) {
+  }
+
+  template<typename T>
+  AttrInitEntry<T> operator()(const char* key, T* value) {
+    TVMArgValue val;
+    AttrInitEntry<T> opt;
+    opt.type_key_ = type_key_;
+    opt.key_ = key;
+    opt.value_ = value;
+    if (ffind_(key, &val)) {
+      SetValue(value, val);
+      opt.value_missing_ = false;
+      ++hit_count_;
+    } else {
+      opt.value_missing_ = true;
+    }
+    return opt;
+  }
+
+ private:
+  // the type key
+  const char* type_key_;
+  FFind ffind_;
+};
+
+template<typename FFind>
+inline AttrInitVisitor<FFind> CreateInitVisitor(
+    const char* type_key,
+    FFind ffind) {
+  return AttrInitVisitor<FFind>(type_key, ffind);
+}
+
+/*!
+ * \brief Helper struct to get the type name known to tvm.
+ * \tparam T the type we are interested in.
+ */
+template<typename T>
+struct TypeName {
+  static constexpr const char* value = T::ContainerType::_type_key;
+};
+
+template<>
+struct TypeName<int> {
+  static constexpr const char* value = "int";
+};
+
+template<>
+struct TypeName<int64_t> {
+  static constexpr const char* value = "int64";
+};
+
+template<>
+struct TypeName<uint64_t> {
+  static constexpr const char* value = "uint64_t";
+};
+
+template<>
+struct TypeName<Type> {
+  static constexpr const char* value = "Type";
+};
+
+template<>
+struct TypeName<std::string> {
+  static constexpr const char* value = "str";
+};
+
+template<>
+struct TypeName<bool> {
+  static constexpr const char* value = "bool";
+};
+
+template<>
+struct TypeName<void*> {
+  static constexpr const char* value = "handle";
+};
+
+template<>
+struct TypeName<double> {
+  static constexpr const char* value = "double";
+};
+
+class AttrDocEntry {
+ public:
+  using TSelf = AttrDocEntry;
+
+  explicit AttrDocEntry(AttrFieldInfo* info)
+      : info_(info) {
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    info_->description = str;
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+    std::ostringstream os;
+    os << info_->type_info << ", default=" << value;
+    info_->type_info = os.str();
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+    return *this;
+  }
+
+ private:
+  AttrFieldInfo* info_;
+};
+
+class AttrDocVisitor {
+ public:
+  template<typename T>
+  AttrDocEntry operator()(const char* key, T* v) {
+    AttrFieldInfo info;
+    info.name = key;
+    info.type_info = TypeName<T>::value;
+    fields_.emplace_back(std::move(info));
+    return AttrDocEntry(&(fields_.back()));
+  }
+
+  std::vector<AttrFieldInfo> fields_;
+};
+
+class AttrExistVisitor {
+ public:
+  std::string key_;
+  bool exist_{false};
+
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* v) {
+    if (exist_) return AttrNopEntry();
+    if (key == key_) exist_ = true;
+    return AttrNopEntry();
+  }
+};
+}  // namespace detail
+
+/*!
+ * \brief The base class of the all the
+ *  Use "curiously recurring template pattern".
+ *
+ * \tparam DerivedType The final attribute type.
+ */
+template<typename DerivedType>
+class AttrsNode : public BaseAttrsNode {
+ public:
+  void VisitAttrs(AttrVisitor* v) final {
+    detail::AttrNormalVisitor vis(v);
+    self()->__VisitAttrs__(vis);
+  }
+
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
+    CHECK_EQ(args.size() % 2, 0);
+    const int kLinearSearchBound = 16;
+    int hit_count = 0;
+    // applies two stratgies to lookup
+    if (args.size() < kLinearSearchBound) {
+      // linear search.
+      auto ffind = [&args](const char* key, runtime::TVMArgValue* val) {
+        for (int i = 0; i < args.size(); i += 2) {
+          CHECK_EQ(args.type_codes[i], kStr);
+          if (!std::strcmp(key, args.values[i].v_str)) {
+            *val = args[i + 1];
+            return true;
+          }
+        }
+        return false;
+      };
+      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    } else {
+      // construct a map then do lookup.
+      std::unordered_map<std::string, runtime::TVMArgValue> kwargs;
+      for (int i = 0; i < args.size(); i += 2) {
+        CHECK_EQ(args.type_codes[i], kStr);
+        kwargs[args[i].operator std::string()] = args[i + 1];
+      }
+      auto ffind = [&kwargs](const char *key, runtime::TVMArgValue* val) {
+        auto it = kwargs.find(key);
+        if (it != kwargs.end()) {
+          *val = it->second;
+          return true;
+        }
+        return false;
+      };
+      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    }
+    // error handling, slow path
+    if (hit_count * 2 != args.size() && !allow_unknown) {
+      for (int i = 0; i < args.size(); i += 2) {
+        detail::AttrExistVisitor visitor;
+        visitor.key_ = args[i].operator std::string();
+        self()->__VisitAttrs__(visitor);
+        if (!visitor.exist_) {
+          std::ostringstream os;
+          os << DerivedType::_type_key
+             << ": does not have field \'" << visitor.key_
+             << "\', Possible fields:\n";
+          os << "----------------\n";
+          this->PrintDocString(os);
+          throw AttrError(os.str());
+        }
+      }
+    }
+  }
+
+  std::vector<AttrFieldInfo> ListFieldInfo() const final {
+    detail::AttrDocVisitor visitor;
+    self()->__VisitAttrs__(visitor);
+    return visitor.fields_;
+  }
+
+ private:
+  DerivedType* self() const {
+    return const_cast<DerivedType*>(
+        static_cast<const DerivedType*>(this));
+  }
+};
+
+
+template<typename... Args>
+inline void BaseAttrsNode::InitBySeq(Args&& ...args) {
+  runtime::PackedFunc pf([this](const TVMArgs& args, TVMRetValue *rv) {
+      this->InitByPackedArgs(args);
+    });
+  pf(std::forward<Args>(args)...);
+}
+
+inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*)
+  std::vector<AttrFieldInfo> entry = this->ListFieldInfo();
+  for (AttrFieldInfo info : entry) {
+    os << info.name << " : " << info.type_info << '\n';
+    if (info.description.length() != 0) {
+      os << "    " << info.description << '\n';
+    }
+  }
+}
+
+}  // namespace tvm
+#endif  // TVM_ATTRS_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 6d8df4a5e3d6..63e8ca7cd16b 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -223,6 +223,12 @@ class ExtTypeVTable {
 class TVMPODValue_ {
  public:
   operator double() const {
+    // Allow automatic conversion from int to float
+    // This avoids errors when user pass in int from
+    // the frontend while the API expects a float.
+    if (type_code_ == kDLInt) {
+      return static_cast<double>(value_.v_int64);
+    }
     TVM_CHECK_TYPE_CODE(type_code_, kDLFloat);
     return value_.v_float64;
   }
@@ -310,6 +316,8 @@ class TVMPODValue_ {
  */
 class TVMArgValue : public TVMPODValue_ {
  public:
+  /*! \brief default constructor */
+  TVMArgValue() {}
   /*!
    * \brief constructor
    * \param value of the function
diff --git a/python/tvm/make.py b/python/tvm/make.py
index 49f698f4f663..19949509778b 100644
--- a/python/tvm/make.py
+++ b/python/tvm/make.py
@@ -71,6 +71,17 @@ def node(type_key, **kwargs):
     **kwargs : dict
         The fields of the node.
 
+    Returns
+    -------
+    node : Node
+        The corresponding DSL Node
+
+    Note
+    ----
+    If the created node is instance of AttrsNode, then
+    the creator function will also run bound checks and
+    default value setup as supported by Attrs.
+
     Example
     -------
     The following code constructs a IntImm object
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 70301993ad3a..3583f42a00c9 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -33,18 +33,6 @@ TVM_REGISTER_API("_load_json")
     *ret = LoadJSON<NodeRef>(args[0]);
   });
 
-TVM_REGISTER_API("_nop")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-  });
-
-// internal fucntion used for debug and testing purposes
-TVM_REGISTER_API("_ndarray_use_count")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    runtime::NDArray nd = args[0];
-    // substract the current one
-    *ret = (nd.use_count() - 1);
-  });
-
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
new file mode 100644
index 000000000000..1744267fdcd7
--- /dev/null
+++ b/src/api/api_test.cc
@@ -0,0 +1,46 @@
+ /*!
+ *  Copyright (c) 2018 by Contributors
+ *  Code mainly used for test purposes.
+ * \file api_test.cc
+ */
+#include <tvm/expr.h>
+#include <tvm/tensor.h>
+#include <tvm/attrs.h>
+#include <tvm/api_registry.h>
+
+namespace tvm {
+// Attrs used to python API
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Array<Expr> padding;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name");
+    TVM_ATTR_FIELD(padding)
+        .describe("padding of input")
+        .set_default(Array<Expr>({0, 0}));
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(TestAttrs);
+
+TVM_REGISTER_API("_nop")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+  });
+
+// internal fucntion used for debug and testing purposes
+TVM_REGISTER_API("_ndarray_use_count")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    runtime::NDArray nd = args[0];
+    // substract the current one
+    *ret = (nd.use_count() - 1);
+  });
+
+}  // namespace tvm
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
index 80d7c3163e10..9157e62fda8a 100644
--- a/src/api/dsl_api.cc
+++ b/src/api/dsl_api.cc
@@ -7,6 +7,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/thread_local.h>
 #include <tvm/api_registry.h>
+#include <tvm/attrs.h>
 #include <vector>
 #include <string>
 #include <exception>
@@ -124,22 +125,35 @@ class DSLAPIImpl : public DSLAPI {
         (*static_cast<TVMAPINode*>(handle))->type_index());
   }
   void NodeGetAttr(NodeHandle handle,
-                  const char* key,
-                  TVMValue* ret_val,
-                  int* ret_type_code,
-                  int* ret_success) const final {
+                   const char* key,
+                   TVMValue* ret_val,
+                   int* ret_type_code,
+                   int* ret_success) const final {
     TVMRetValue rv;
     APIAttrGetter getter;
+    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     getter.skey = key;
     getter.ret = &rv;
-    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     if (getter.skey == "type_key") {
       ret_val->v_str = (*tnode)->type_key();
       *ret_type_code = kStr;
       *ret_success = 1;
-    } else {
+      return;
+    } else if (!(*tnode)->is_type<DictAttrsNode>()) {
       (*tnode)->VisitAttrs(&getter);
       *ret_success = getter.found_ref_object || rv.type_code() != kNull;
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      auto it = dnode->dict.find(key);
+      if (it != dnode->dict.end()) {
+        *ret_success = 1;
+        rv = (*it).second;
+      } else {
+        *ret_success = 0;
+      }
+    }
+    if (*ret_success) {
       if (rv.type_code() == kStr ||
           rv.type_code() == kTVMType) {
         TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get();
@@ -159,7 +173,16 @@ class DSLAPIImpl : public DSLAPI {
     TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     APIAttrDir dir;
     dir.names = &(ret->ret_vec_str);
-    (*tnode)->VisitAttrs(&dir);
+
+    if (!(*tnode)->is_type<DictAttrsNode>()) {
+      (*tnode)->VisitAttrs(&dir);
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      for (const auto& kv : dnode->dict) {
+        ret->ret_vec_str.push_back(kv.first);
+      }
+    }
     ret->ret_vec_charp.clear();
     for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
       ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
new file mode 100644
index 000000000000..49a91983e79d
--- /dev/null
+++ b/src/lang/attrs.cc
@@ -0,0 +1,45 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file attrs.cc
+ */
+#include <tvm/attrs.h>
+
+namespace tvm {
+
+void DictAttrsNode::VisitAttrs(AttrVisitor* v)  {
+  v->Visit("__dict__", &dict);
+}
+
+void DictAttrsNode::InitByPackedArgs(
+    const runtime::TVMArgs& args, bool allow_unknown) {
+  for (int i = 0; i < args.size(); i += 2) {
+    std::string key = args[i];
+    runtime::TVMArgValue val = args[i + 1];
+    if (val.type_code() == kNodeHandle) {
+      dict.Set(key, val.operator NodeRef());
+    } else if (val.type_code() == kStr) {
+      dict.Set(key, Expr(val.operator std::string()));
+    } else {
+      dict.Set(key, val.operator Expr());
+    }
+  }
+}
+
+std::vector<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
+  return {};
+}
+
+Attrs DictAttrsNode::make(Map<std::string, NodeRef> dict) {
+  std::shared_ptr<DictAttrsNode> n = std::make_shared<DictAttrsNode>();
+  n->dict = std::move(dict);
+  return Attrs(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<DictAttrsNode>([](const DictAttrsNode *op, IRPrinter *p) {
+    p->stream << op->dict;
+});
+
+TVM_REGISTER_NODE_TYPE(DictAttrsNode);
+
+}  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 7c4e862f0abb..9fb9143aa7f4 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -5,6 +5,7 @@
  */
 #include <tvm/base.h>
 #include <tvm/expr.h>
+#include <tvm/attrs.h>
 #include <tvm/container.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
@@ -467,22 +468,15 @@ class NodeAttrSetter : public AttrVisitor {
   }
 };
 
-// API function to make node.
-// args format:
-//    type_key, key1, value1, ..., key_n, value_n
-void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
+
+void InitNodeByPackedArgs(Node* n, const TVMArgs& args) {
   NodeAttrSetter setter;
-  setter.type_key = args[0].operator std::string();
-  CHECK_EQ(args.size() % 2, 1);
-  for (int i = 1; i < args.size(); i += 2) {
-    setter.attrs.emplace(
-        args[i].operator std::string(),
-        runtime::TVMArgValue(args.values[i + 1], args.type_codes[i + 1]));
-  }
-  auto* f = dmlc::Registry<NodeFactoryReg>::Find(setter.type_key);
-  CHECK(f != nullptr)
-      << "Node type \'" << setter.type_key << "\' is not registered in TVM";
-  std::shared_ptr<Node> n = f->body();
+  setter.type_key = n->type_key();
+  CHECK_EQ(args.size() % 2, 0);
+  for (int i = 0; i < args.size(); i += 2) {
+    setter.attrs.emplace(args[i].operator std::string(),
+                         args[i + 1]);
+  }
   n->VisitAttrs(&setter);
   if (setter.attrs.size() != 0) {
     std::ostringstream os;
@@ -492,10 +486,26 @@ void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
     }
     LOG(FATAL) << os.str();
   }
+}
+
+// API function to make node.
+// args format:
+//   key1, value1, ..., key_n, value_n
+void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
+  std::string type_key = args[0];
+  auto* f = dmlc::Registry<NodeFactoryReg>::Find(type_key);
+  CHECK(f != nullptr)
+      << "Node type \'" << type_key << "\' is not registered in TVM";
+  TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);
+  std::shared_ptr<Node> n = f->body();
+  if (n->derived_from<BaseAttrsNode>()) {
+    static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
+  } else {
+    InitNodeByPackedArgs(n.get(), kwargs);
+  }
   *rv = NodeRef(n);
 }
 
 TVM_REGISTER_GLOBAL("make._Node")
 .set_body(MakeNode);
-
 }  // namespace tvm
diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc
new file mode 100644
index 000000000000..138e0b242e02
--- /dev/null
+++ b/tests/cpp/attrs_test.cc
@@ -0,0 +1,76 @@
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/attrs.h>
+#include <tvm/ir.h>
+
+namespace tvm {
+namespace test {
+// test example usage docs
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Expr expr;
+  double learning_rate;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.cpptest.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name of the field");
+    TVM_ATTR_FIELD(expr)
+        .describe("expression field")
+        .set_default(make_const(Int(32), 1));
+    TVM_ATTR_FIELD(learning_rate)
+        .describe("learning_rate")
+        .set_default(0.1);
+  }
+};
+}
+}
+
+TEST(Attrs, Basic) {
+  using namespace tvm;
+  using namespace tvm::test;
+  std::shared_ptr<TestAttrs> n = std::make_shared<TestAttrs>();
+  try {
+    n->InitBySeq("axis", 10);
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+  try {
+    n->InitBySeq("axis", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+
+  try {
+    n->InitBySeq("axisx", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+    std::string what = e.what();
+    CHECK(what.find("expr : Expr, default=1") != std::string::npos);
+    CHECK(what.find("axisx") != std::string::npos);
+  }
+  n->InitBySeq("learning_rate", Expr(1), "expr", 128, "name", "xx");
+  CHECK_EQ(n->learning_rate, 1.0);
+
+  n->InitBySeq("name", "xxx", "expr", 128);
+  CHECK_EQ(n->name, "xxx");
+  CHECK_EQ(n->axis, 10);
+  CHECK_EQ(n->expr.as<tvm::ir::IntImm>()->value, 128);
+  // Check docstring
+  std::ostringstream os;
+  n->PrintDocString(os);
+  LOG(INFO) << "docstring\n"<< os.str();
+  CHECK(os.str().find("expr : Expr, default=1") != std::string::npos);
+}
+
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index fefb8771a812..2ba67b8d9c86 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -36,6 +36,31 @@ def test_make_node():
     assert AA.op == A.op
     assert AA.value_index == A.value_index
 
+
+def test_make_attrs():
+    try:
+        x = tvm.make.node("attrs.TestAttrs", unknown_key=1, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("unknown_key") != -1
+
+    try:
+        x = tvm.make.node("attrs.TestAttrs", axis=100, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("upper bound") != -1
+
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4))
+    assert x.name == "xx"
+    assert x.padding[0].value == 3
+    assert x.padding[1].value == 4
+    assert x.axis == 10
+
+    dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert dattr.x.value == 1
+
+
+
 def test_make_sum():
     A = tvm.placeholder((2, 10), name='A')
     k = tvm.reduce_axis((0,10), "k")
@@ -46,6 +71,7 @@ def test_make_sum():
     assert BB.op.body[0].combiner is not None
 
 if __name__ == "__main__":
+    test_make_attrs()
     test_make_node()
     test_make_smap()
     test_const_saveload_json()

From b58698d1fd392af8c2cd412ba9ec850f4a575c52 Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Mon, 20 Aug 2018 22:43:35 +0530
Subject: [PATCH 030/529] [NNVM][DARKNET]Yolo and Upsample frontend support
 (#1501)

* Yolo and Upsample frontend support

* Lint fix

* Mac support added

* Code clean and trigger CI
---
 nnvm/python/nnvm/frontend/darknet.py          | 32 +++++++-
 nnvm/python/nnvm/testing/darknet.py           | 25 +++++-
 nnvm/python/nnvm/top/vision.py                | 15 ++++
 nnvm/src/top/vision/yolo/yolo.cc              | 33 ++++++++
 .../python/frontend/darknet/test_forward.py   | 30 +++++++-
 tutorials/nnvm/from_darknet.py                | 77 +++++++++----------
 6 files changed, 165 insertions(+), 47 deletions(-)
 create mode 100644 nnvm/src/top/vision/yolo/yolo.cc

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 3aa36b7e7ef9..7fb3e34750c8 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -32,8 +32,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -257,6 +261,12 @@ def _darknet_reshape(inputs, attrs):
     new_attrs['shape'] = _darknet_required_attr(attrs, 'shape')
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_upsampling(inputs, attrs):
+    """Process the upsampling operation."""
+    op_name, new_attrs = 'upsampling', {}
+    new_attrs['scale'] = attrs.get('scale', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_softmax_output(inputs, attrs):
     """Process the softmax operation."""
     temperature = attrs.get('temperature', 1)
@@ -298,6 +308,15 @@ def _darknet_region(inputs, attrs):
         new_attrs['softmax'] = attrs.get('softmax', 0)
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_yolo(inputs, attrs):
+    """Process the yolo operation."""
+    op_name, new_attrs = 'yolov3_yolo', {}
+    if 'n' in attrs:
+        new_attrs['n'] = attrs.get('n', 1)
+    if 'classes' in attrs:
+        new_attrs['classes'] = attrs.get('classes', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_activations(inputs, attrs):
     """Process the activation function."""
     act = _darknet_required_attr(attrs, 'activation')
@@ -350,6 +369,8 @@ def _darknet_op_not_support(inputs, attrs):
     LAYERTYPE.REORG           : _darknet_reorg,
     LAYERTYPE.REGION          : _darknet_region,
     LAYERTYPE.SHORTCUT        : _darknet_shortcut,
+    LAYERTYPE.UPSAMPLE        : _darknet_upsampling,
+    LAYERTYPE.YOLO            : _darknet_yolo,
     LAYERTYPE.DETECTION       : _darknet_op_not_support,
     LAYERTYPE.CROP            : _darknet_op_not_support,
     LAYERTYPE.COST            : _darknet_op_not_support,
@@ -575,6 +596,13 @@ def _get_darknet_attrs(self, layer, layer_num):
             attr.update({'coords' : layer.coords})
             attr.update({'background' : layer.background})
             attr.update({'softmax' : layer.softmax})
+
+        elif LAYERTYPE.YOLO == layer.type:
+            attr.update({'n' : layer.n})
+            attr.update({'classes' : layer.classes})
+
+        elif LAYERTYPE.UPSAMPLE == layer.type:
+            attr.update({'scale' : layer.stride})
         else:
             err = "Darknet layer type {} is not supported in nnvm.".format(layer.type)
             raise NotImplementedError(err)
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index e3d110e9605e..9a346e01b50b 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -115,8 +115,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -182,12 +186,16 @@ class ACTIVATION(object):
     NETWORK,
     XNOR,
     REGION,
+    YOLO,
     REORG,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
     BLANK
 } LAYERTYPE;
 
 typedef enum{
-    SSE, MASKED, LONE, SEG, SMOOTH
+    SSE, MASKED, L1, SEG, SMOOTH, WGAN
 } COSTTYPE;
 
 
@@ -241,18 +249,20 @@ class ACTIVATION(object):
     float shift;
     float ratio;
     float learning_rate_scale;
+    float clip;
     int softmax;
     int classes;
     int coords;
     int background;
     int rescore;
     int objectness;
-    int does_cost;
     int joint;
     int noadjust;
     int reorg;
     int log;
     int tanh;
+    int *mask;
+    int total;
 
     float alpha;
     float beta;
@@ -265,13 +275,17 @@ class ACTIVATION(object):
     float class_scale;
     int bias_match;
     int random;
+    float ignore_thresh;
+    float truth_thresh;
     float thresh;
+    float focus;
     int classfix;
     int absolute;
 
     int onlyforward;
     int stopbackward;
     int dontload;
+    int dontsave;
     int dontloadscales;
 
     float temperature;
@@ -309,6 +323,7 @@ class ACTIVATION(object):
 
     float * delta;
     float * output;
+    float * loss;
     float * squared;
     float * norms;
 
@@ -462,6 +477,7 @@ class ACTIVATION(object):
     int train;
     int index;
     float *cost;
+    float clip;
 } network;
 
 
@@ -491,6 +507,7 @@ class ACTIVATION(object):
 layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
 layer make_softmax_layer(int batch, int inputs, int groups);
 layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
 layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py
index f2e12c0f367a..e59b2bdfe6d9 100644
--- a/nnvm/python/nnvm/top/vision.py
+++ b/nnvm/python/nnvm/top/vision.py
@@ -38,6 +38,21 @@ def schedule_region(attrs, outs, target):
 
 reg.register_pattern("yolo_region", OpPattern.OPAQUE)
 
+@reg.register_compute("yolov3_yolo")
+def compute_yolo(attrs, inputs, _):
+    """Compute definition of yolo"""
+    n = attrs.get_int("n")
+    classes = attrs.get_int("classes")
+    return topi.vision.yolo.yolo(inputs[0], n, classes)
+
+@reg.register_schedule("yolov3_yolo")
+def schedule_yolo(attrs, outs, target):
+    """Schedule definition of yolo"""
+    with tvm.target.create(target):
+        return topi.generic.schedule_injective(outs)
+
+reg.register_pattern("yolov3_yolo", OpPattern.OPAQUE)
+
 # multibox_prior
 @reg.register_schedule("multibox_prior")
 def schedule_multibox_prior(_, outs, target):
diff --git a/nnvm/src/top/vision/yolo/yolo.cc b/nnvm/src/top/vision/yolo/yolo.cc
new file mode 100644
index 000000000000..4800f4371f9d
--- /dev/null
+++ b/nnvm/src/top/vision/yolo/yolo.cc
@@ -0,0 +1,33 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file yolo.cc
+ * \brief Property def of yolo operators.
+ */
+#include <nnvm/op.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/top/nn.h>
+#include "../../elemwise_op_common.h"
+
+namespace nnvm {
+namespace top {
+
+NNVM_REGISTER_OP(yolov3_yolo)
+.describe(R"code(Yolo layer
+)code" NNVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_support_level(5)
+.add_argument("data", "Tensor", "Input data")
+.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<FInplaceOption>(
+    "FInplaceOption",
+    [](const NodeAttrs &attrs) {
+      return std::vector<std::pair<int, int>>{{0, 0}, {1, 0}};
+    })
+.set_attr<FGradient>("FGradient", [](const NodePtr &n,
+                                     const std::vector<NodeEntry> &ograds) {
+  return std::vector<NodeEntry>{ograds[0], ograds[0]};
+});
+}  // namespace top
+}  // namespace nnvm
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index 5fc71a86211e..3d7d06b48483 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -44,7 +44,7 @@ def _download(url, path, overwrite=False, sizecompare=False):
     except:
         urllib.urlretrieve(url, path)
 
-DARKNET_LIB = 'libdarknet.so'
+DARKNET_LIB = 'libdarknet2.0.so'
 DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \
                                     + DARKNET_LIB + '?raw=true'
 _download(DARKNETLIB_URL, DARKNET_LIB)
@@ -239,6 +239,8 @@ def test_forward_shortcut():
     layer_2 = LIB.make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0)
     layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32)
     layer_3.activation = 1
+    layer_3.alpha = 1
+    layer_3.beta = 1
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.layers[2] = layer_3
@@ -272,6 +274,30 @@ def test_forward_region():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_yolo_op():
+    '''test yolo layer'''
+    net = LIB.make_network(2)
+    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    a = []
+    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 0, a, 2)
+    net.layers[0] = layer_1
+    net.layers[1] = layer_2
+    net.w = net.h = 224
+    LIB.resize_network(net, 224, 224)
+    test_forward(net)
+    LIB.free_network(net)
+
+def test_forward_upsample():
+    '''test upsample layer'''
+    net = LIB.make_network(1)
+    layer = LIB.make_upsample_layer(1, 19, 19, 3, 3)
+    layer.scale = 1
+    net.layers[0] = layer
+    net.w = net.h = 19
+    LIB.resize_network(net, 19, 19)
+    test_forward(net)
+    LIB.free_network(net)
+
 def test_forward_elu():
     '''test elu activation layer'''
     net = LIB.make_network(1)
@@ -428,6 +454,8 @@ def test_forward_activation_logistic():
     test_forward_rnn()
     test_forward_reorg()
     test_forward_region()
+    test_forward_yolo_op()
+    test_forward_upsample()
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()
diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index 883026f2af98..c6b70cf59413 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -22,54 +22,48 @@
 import numpy as np
 import tvm
 import os
+import sys
 
 from ctypes import *
 from tvm.contrib.download import download
 from nnvm.testing.darknet import __darknetffi__
 
-######################################################################
-# Set the parameters here.
-# Supported models alexnet, resnet50, resnet152, extraction, yolo
-#
-model_name = 'yolo'
-test_image = 'dog.jpg'
-target = 'llvm'
-ctx = tvm.cpu(0)
+#Model name
+MODEL_NAME = 'yolo'
 
 ######################################################################
-# Prepare cfg and weights file
-# ----------------------------
-# Pretrained model available https://pjreddie.com/darknet/imagenet/
-# Download cfg and weights file first time.
+# Download required files
+# -----------------------
+# Download cfg and weights file if first time.
+CFG_NAME = MODEL_NAME + '.cfg'
+WEIGHTS_NAME = MODEL_NAME + '.weights'
+REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
+CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
+WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true'
+
+download(CFG_URL, CFG_NAME)
+download(WEIGHTS_URL, WEIGHTS_NAME)
 
-cfg_name = model_name + '.cfg'
-weights_name = model_name + '.weights'
-cfg_url = 'https://github.com/siju-samuel/darknet/blob/master/cfg/' + \
-            cfg_name + '?raw=true'
-weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true'
-
-download(cfg_url, cfg_name)
-download(weights_url, weights_name)
-
-######################################################################
 # Download and Load darknet library
-# ---------------------------------
-
-darknet_lib = 'libdarknet.so'
-darknetlib_url = 'https://github.com/siju-samuel/darknet/blob/master/lib/' + \
-                        darknet_lib + '?raw=true'
-download(darknetlib_url, darknet_lib)
-
-#if the file doesnt exist, then exit normally.
-if os.path.isfile('./' + darknet_lib) is False:
-    exit(0)
-
-darknet_lib = __darknetffi__.dlopen('./' + darknet_lib)
-cfg = "./" + str(cfg_name)
-weights = "./" + str(weights_name)
-net = darknet_lib.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
+if sys.platform in ['linux', 'linux2']:
+    DARKNET_LIB = 'libdarknet2.0.so'
+    DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true'
+elif sys.platform == 'darwin':
+    DARKNET_LIB = 'libdarknet_mac2.0.so'
+    DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true'
+else:
+    err = "Darknet lib is not supported on {} platform".format(sys.platform)
+    raise NotImplementedError(err)
+
+download(DARKNET_URL, DARKNET_LIB)
+
+DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
+cfg = "./" + str(CFG_NAME)
+weights = "./" + str(WEIGHTS_NAME)
+net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
 dtype = 'float32'
 batch_size = 1
+
 print("Converting darknet to nnvm symbols...")
 sym, params = nnvm.frontend.darknet.from_darknet(net, dtype)
 
@@ -77,7 +71,9 @@
 # Compile the model on NNVM
 # -------------------------
 # compile the model
-data = np.empty([batch_size, net.c ,net.h, net.w], dtype);
+target = 'llvm'
+ctx = tvm.cpu(0)
+data = np.empty([batch_size, net.c, net.h, net.w], dtype)
 shape = {'data': data.shape}
 print("Compiling the model...")
 with nnvm.compiler.build_config(opt_level=2):
@@ -103,6 +99,7 @@ def save_lib():
 ######################################################################
 # Load a test image
 # --------------------------------------------------------------------
+test_image = 'dog.jpg'
 print("Loading the test image...")
 img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
             test_image   +'?raw=true'
@@ -134,7 +131,7 @@ def save_lib():
 hier_thresh = 0.5
 img = nnvm.testing.darknet.load_image_color(test_image)
 _, im_h, im_w = img.shape
-probs= []
+probs = []
 boxes = []
 region_layer = net.layers[net.n - 1]
 boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h,
@@ -157,5 +154,5 @@ def save_lib():
 
 nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n,
                  thresh, boxes, probs, names, region_layer.classes)
-plt.imshow(img.transpose(1,2,0))
+plt.imshow(img.transpose(1, 2, 0))
 plt.show()

From 447d7c1ece6120f161dabf8ff0265ca539497ece Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Mon, 20 Aug 2018 13:27:31 -0700
Subject: [PATCH 031/529] Improve x86 Inception (#1506)

* Improve x86 pooling and concat

* Fix

* Fix test concatenate correct layout

* Add conditional vectorize

* Fix lint

* Modify schedule for global pooling

* Fix

* Fix warning

* Fix alter layout test

* Remove vectorization for pooling when using 4D layout

* Remove vectorization for 4D concat

* Fix concatenate layout

* Fix concatenate schedule

* Fix concat

* Fix lint

* Fix concat

* Simplify pooling logic

* Update docstring

* Fix test topi pooling

* Small changes
---
 nnvm/python/nnvm/top/nn.py                    | 10 ++--
 nnvm/python/nnvm/top/transform.py             |  8 +++-
 nnvm/src/top/tensor/transform.cc              | 24 ++++++++--
 .../python/unittest/test_correct_layout.py    | 21 +++++++--
 topi/include/topi/nn/pooling.h                | 12 ++---
 topi/python/topi/cuda/pooling.py              |  7 ++-
 topi/python/topi/generic/injective.py         | 17 +++++++
 topi/python/topi/generic/nn.py                |  5 +-
 topi/python/topi/opengl/pooling.py            |  7 ++-
 topi/python/topi/x86/injective.py             | 46 +++++++++++++++++++
 topi/python/topi/x86/pooling.py               | 44 +++++++++++++++---
 topi/tests/python/test_topi_pooling.py        |  6 ++-
 12 files changed, 174 insertions(+), 33 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index f9a2c2813a04..b452738123c3 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -280,20 +280,22 @@ def schedule_conv2d_transpose(attrs, outs, target):
 
 # max_pool2d
 @reg.register_schedule("max_pool2d")
-def schedule_max_pool2d(_, outs, target):
+def schedule_max_pool2d(attrs, outs, target):
     """Schedule definition of max_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
 # avg_pool2d
 @reg.register_schedule("avg_pool2d")
-def schedule_avg_pool2d(_, outs, target):
+def schedule_avg_pool2d(attrs, outs, target):
     """Schedule definition of avg_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py
index facb345c1abe..594007239d4a 100644
--- a/nnvm/python/nnvm/top/transform.py
+++ b/nnvm/python/nnvm/top/transform.py
@@ -2,6 +2,7 @@
 """Tensor transformation ops"""
 from __future__ import absolute_import
 
+import tvm
 import topi
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
@@ -58,8 +59,13 @@ def compute_reshape_like(attrs, inputs, out_info):
 reg.register_schedule("squeeze", _fschedule_injective)
 
 # concatenate
+@reg.register_schedule("concatenate")
+def schedule_concatenate(_, outs, target):
+    """Schedule definition of concatenate"""
+    with tvm.target.create(target):
+        return topi.generic.schedule_concatenate(outs)
+
 reg.register_pattern("concatenate", OpPattern.INJECTIVE)
-reg.register_schedule("concatenate", _fschedule_injective)
 
 # split
 reg.register_pattern("split", OpPattern.INJECTIVE)
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 52dca5654838..b1485438ca50 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -129,15 +129,31 @@ inline bool ConcatenateCorrectLayout(const NodeAttrs& attrs,
                                      std::vector<Layout> *ilayouts,
                                      const std::vector<Layout> *last_ilayouts,
                                      std::vector<Layout> *olayouts) {
+  const ConcatenateParam& param = nnvm::get<ConcatenateParam>(attrs.parsed);
   CHECK_EQ(ilayouts->size(), last_ilayouts->size());
   CHECK_EQ(olayouts->size(), 1U);
 
-  for (size_t i = 0; i < ilayouts->size(); ++i) {
-    const Layout& input = last_ilayouts->at(i).defined() ?
-                          last_ilayouts->at(i) : ilayouts->at(i);
-    NNVM_ASSIGN_LAYOUT(*ilayouts, i, input);
+  Layout layout;
+  if (!ilayouts->at(0).defined()) {
+    layout = last_ilayouts->at(0);
+  } else if (param.axis >= static_cast<int>(ilayouts->at(0).ndim())) {
+    CHECK(last_ilayouts->at(0).defined())
+      << "Current input layout " << ilayouts->at(0)
+      << " is invalid but last input layout is not "
+         "defined for the first input.";
+    layout = last_ilayouts->at(0);
+  } else if (last_ilayouts->at(0).defined()
+             && ilayouts->at(0)[param.axis]
+                != last_ilayouts->at(0)[param.axis]) {
+    layout = last_ilayouts->at(0);
+  } else {
+    layout = ilayouts->at(0);
   }
 
+  for (size_t i = 0; i < ilayouts->size(); ++i) {
+    NNVM_ASSIGN_LAYOUT(*ilayouts, i, layout);
+  }
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
   return true;
 }
 
diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py
index 6176586284a7..8961498a579e 100644
--- a/nnvm/tests/python/unittest/test_correct_layout.py
+++ b/nnvm/tests/python/unittest/test_correct_layout.py
@@ -77,14 +77,25 @@ def test_concatenate():
     g, ldict = correct_layout(z, {"x": "HW", "y": "HW"})
     assert(ldict["x"][0] == "HW")
     assert(ldict["y"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW")
     # second pass will insert layout transform
     _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"})
     assert(ldict["x"][0] == "HW16w")
     assert(ldict["y"][0] == "HW16w")
-    assert(ldict["x_HW"][0] == "HW")
-    assert(ldict["y_HW"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW16w")
+
+    x1 = sym.Variable("x", shape=(10, 20, 60))
+    x2 = sym.Variable("y", shape=(10, 20, 40))
+    z = sym.concatenate(x1, x2, axis=2, name="concat")
+    g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"})
+    assert(ldict["x"][0] == "H20wW")
+    assert(ldict["y"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, {"x": "HW", "y": "HW"})
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
 
 
 def test_expand_dims():
@@ -349,4 +360,4 @@ def test_reduce():
     test_transpose()
     test_broadcast_to()
     test_broadcast_binary()
-    test_reduce()
\ No newline at end of file
+    test_reduce()
diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h
index 26d61d42991d..ca318adfe6cb 100644
--- a/topi/include/topi/nn/pooling.h
+++ b/topi/include/topi/nn/pooling.h
@@ -112,18 +112,18 @@ inline Tensor pool_impl(const Tensor& x,
     }, "tensor", "pool_max");
   } else if (pool_type == kAvgPool) {
     auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x;
-    auto tsum = tvm::compute(out_shape, [&](const Array<Var>& output) {
+    auto tavg = [&](const Array<Var>& output, Expr divide_factor) {
       Array<Expr> indices;
       for (const Var& var : output) indices.push_back(var);
       indices.Set(height_axis, output[height_axis] * stride_height + dheight);
       indices.Set(width_axis, output[width_axis] * stride_width + dwidth);
-      return tvm::sum(temp(indices), { dheight, dwidth });
-    }, "tensor", "pool_avg");
+      return tvm::sum(temp(indices) / divide_factor, { dheight, dwidth });
+    };
 
     return tvm::compute(out_shape,
     [&](const Array<Var>& output) {
       if (count_include_pad) {
-        return tsum(output) / (kernel_height * kernel_width);
+        return tavg(output, kernel_height * kernel_width);
       } else {
         Expr h_start = output[height_axis] * stride_height - pad_top;
         Expr w_start = output[width_axis] * stride_width - pad_left;
@@ -133,9 +133,9 @@ inline Tensor pool_impl(const Tensor& x,
         w_start = ir::Max::make(w_start, make_const(Int(32), 0));
         Expr divide_factor = ir::Max::make((h_end - h_start) * (w_end - w_start),
                                            make_const(Int(32), 1));
-        return tsum(output) / divide_factor;
+        return tavg(output, divide_factor);
       }
-    }, "tensor", kElementWise);
+    }, "tensor", "pool_avg");
   } else {
     LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
     return x;
diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py
index 637f664fbd36..6b36e9a8743f 100644
--- a/topi/python/topi/cuda/pooling.py
+++ b/topi/python/topi/cuda/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -70,7 +70,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["cuda", "gpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -79,6 +79,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py
index 0a9e394661af..975e4c11ea41 100644
--- a/topi/python/topi/generic/injective.py
+++ b/topi/python/topi/generic/injective.py
@@ -29,5 +29,22 @@ def schedule_injective(outs):
     s[x].fuse(s[x].op.axis)
     return s
 
+@tvm.target.generic_func
+def schedule_concatenate(outs):
+    """Schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of reduce in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return schedule_injective(outs)
+
 schedule_elemwise = schedule_injective
 schedule_broadcast = schedule_injective
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 1e01adb899b7..874decc792ec 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -282,7 +282,7 @@ def schedule_dense(outs):
 
 
 @tvm.target.override_native_generic_func("schedule_pool")
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -291,6 +291,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py
index 8195ea91d8a6..d6dbf0eac5c2 100644
--- a/topi/python/topi/opengl/pooling.py
+++ b/topi/python/topi/opengl/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -54,7 +54,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["opengl"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -63,6 +63,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py
index b43ebb98b82f..ac552903ad7f 100644
--- a/topi/python/topi/x86/injective.py
+++ b/topi/python/topi/x86/injective.py
@@ -33,5 +33,51 @@ def schedule_injective(outs):
         s[x].parallel(s[x].op.axis[0])
     return s
 
+@generic.schedule_concatenate.register(["cpu"])
+def schedule_concatenate(outs):
+    """X86 schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of injective in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    def vectorize(sch, tensor, vectorize_limit):
+        """Internal vectorization function for concatenate."""
+        inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1]
+        inner_length = tensor.shape[len(tensor.shape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch[tensor].vectorize(inner_axis)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, inner_i = sch[tensor].split(inner_axis, split_factor)
+                sch[tensor].vectorize(inner_i)
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    x = outs[0]
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+    if len(s[x].op.axis) >= 5:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
+        vectorize(s, x, 64)
+        s[x].parallel(fused)
+    elif len(s[x].op.axis) >= 3:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
+        s[x].parallel(fused)
+    else:
+        s[x].parallel(s[x].op.axis[0])
+    return s
+
 schedule_elemwise = schedule_injective
 schedule_broadcast = schedule_injective
diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py
index 998edf7a0e16..5fce5f32afb6 100644
--- a/topi/python/topi/x86/pooling.py
+++ b/topi/python/topi/x86/pooling.py
@@ -4,19 +4,47 @@
 from .. import generic
 from .. import tag
 
-def _parallel_sch(sch):
+def _parallel_sch(sch, oshape, do_vectorize=False):
+    def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64):
+        """Internal vectorization utility function."""
+        reorder_axis = [fused_axis]
+        for i in range(num_parallel_axis, len(sch.op.axis) - 1):
+            reorder_axis.append(sch.op.axis[i])
+        kw, kh = sch.op.reduce_axis
+        fuse_k = sch.fuse(kw, kh)
+        c = sch.op.axis[len(sch.op.axis) - 1]
+        reorder_axis += [fuse_k, c]
+        sch.reorder(*reorder_axis)
+        inner_length = oshape[len(oshape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch.vectorize(c)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, c_i = sch.split(c, split_factor)
+                sch.vectorize(c_i)
+
     if len(sch.op.axis) >= 5:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 3)
+
     elif len(sch.op.axis) >= 3:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 2)
     else:
         sch.parallel(sch.op.axis[0])
+        return
+    sch.parallel(fused)
 
 
 @generic.schedule_pool.register(["cpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -25,6 +53,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
@@ -37,7 +68,8 @@ def schedule_pool(outs):
     def _schedule(PaddedInput, Pool):
         if isinstance(PaddedInput.op, tvm.tensor.ComputeOp):
             s[PaddedInput].compute_inline()
-        _parallel_sch(s[Pool])
+        do_vectorize = layout[-1] not in "HWhw"
+        _parallel_sch(s[Pool], outs[0].shape, do_vectorize)
 
     def traverse(OP):
         """Internal travserse function"""
@@ -93,7 +125,7 @@ def traverse(OP):
         # schedule pool
         elif OP.tag.startswith('global_pool'):
             Pool = OP.output(0)
-            _parallel_sch(s[Pool])
+            _parallel_sch(s[Pool], outs[0].shape)
         else:
             raise RuntimeError("Unsupported operator: %s" % OP.tag)
 
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index c9f790146b4a..b87795743c4c 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -10,9 +10,11 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_
     kw = kh
     sw = sh
     pt, pl, pb, pr = padding
+    layout = "NCHW"
     A = tvm.placeholder((n, ic, ih, iw), name='A')
     B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding,
-                     pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad)
+                     pool_type=pool_type, ceil_mode=ceil_mode,
+                     layout="NCHW", count_include_pad=count_include_pad)
     B = topi.nn.relu(B)
     dtype = A.dtype
 
@@ -54,7 +56,7 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            s = topi.generic.schedule_pool(B)
+            s = topi.generic.schedule_pool(B, layout)
 
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)

From 75bc72422b923c9f3da981c67b07b8d54eca5a93 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 20 Aug 2018 16:28:28 -0700
Subject: [PATCH 032/529] [VERSION] Update to 0.5.dev (#1623)

* [VERSION] Update to 0.5.dev

* Update the docs to include all intrins
---
 NEWS.md                             | 63 +++++++++++++++++++++++++++
 conda/nnvm/meta.yaml                |  2 +-
 conda/topi/meta.yaml                |  2 +-
 conda/tvm-libs/meta.yaml            |  2 +-
 conda/tvm/meta.yaml                 |  2 +-
 docs/api/python/intrin.rst          |  6 +++
 include/tvm/runtime/c_runtime_api.h |  2 +-
 python/tvm/_ffi/libinfo.py          |  6 ++-
 python/update_version.py            | 66 +++++++++++++++++++++++++++++
 web/tvm_runtime.js                  |  2 +-
 10 files changed, 145 insertions(+), 8 deletions(-)
 create mode 100644 python/update_version.py

diff --git a/NEWS.md b/NEWS.md
index 567aabf3fcbd..2c2f616cb2f0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,69 @@ Refer to the Roadmap issue for complete list on on-going version features.
 If you check in something that is not reflected in Roadmap issue, please reply
 to that issue so it can get added.
 
+## 0.4
+
+This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA.
+
+- Tensor operator primitives
+  - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling
+- Enable embedding of asm micro-kernels
+- Hybrid python programming model
+   - python AST based IR builder interface
+   - support GPU programs
+- AutoTVM, Automated tuning, and scheduling
+   - basic autotvm infra
+    - GPU IR verifier
+   - basic autotuning tutorial
+   - topi integration
+- ARM support
+    - winograd support
+   - initial support of ARM autotuning records
+- TOPI Vision
+   - Generic GPU sort support(useful for vision)
+   - SSD operator support
+- TOPI numpy consistency
+   - Rename all binary operators for numpy consistecy: broadcast_add-> add, broadcast_sub -> substract, broadcast_mul -> multiply, broadcast_div->divide
+   - New operators: slice, LRN, equal, not_equal, less, greater
+   - tutorials on topi
+- Initial low-bit operator support support
+    - Optimized popcount generation on ARM
+    - general bit-serial convolution and GEMM
+    - optimized low bit kernels
+    - parallel optimization
+- New topi backend optimization for intel graphics
+- Adapt AVX schedules for SSE target
+- VTA: customized accelerator backend
+  - custom hardware backend example
+  - tutorials on how to use customized accelerator
+- Initial experimental support for  HLS backend
+- Bugfix in SPIRV code generator for vulkan
+- libdevice support, enable NVPTX backend
+- Introduce NDArrayContainer for managed NDarray
+- RPC and Device API
+   - Support communication between big/small endian machines.
+   - RPC and device API protocol upgrade (this is a non-backward compatible change) to support big-small endian communication. This is a non-backward compatible change, need to use the latest version of TVM runtime with the RPC
+   - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc
+   -Support tracker in Android RPC, add fault tolerance for AutoTVM
+- BIG.LITTLE aware threadpool
+- tvm4j graph runtime that runs end to end workload in java
+- DLPack support
+   - Support from_dlpack and to_dlpack
+   - Enables bridges to pytorch
+- Enable link of stackvm in runtime
+- Tensorflow graphdef frontend
+- Keras frontend
+   - improved to support reuse layers, add activations
+- ONNX
+   - gather,  LRN
+- CoreML frontend
+   - Support C-RNN and activation functions
+- Fix grads for sum and expand_like
+- Enhanced operator fusion for multiple elemwise branches
+- Separate nnvm fusion and compilation pass
+- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda
+
+
 ## 0.3
 
 This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API.
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml
index a8b47d0de118..9c045c177ff6 100644
--- a/conda/nnvm/meta.yaml
+++ b/conda/nnvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: nnvm
diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml
index af2fb4fd4228..4002f577863b 100644
--- a/conda/topi/meta.yaml
+++ b/conda/topi/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: topi
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
index dbdfd4a7701f..d6902c45a693 100644
--- a/conda/tvm-libs/meta.yaml
+++ b/conda/tvm-libs/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm-libs
diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml
index 478e095322eb..fe53b7dd49d9 100644
--- a/conda/tvm/meta.yaml
+++ b/conda/tvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm
diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst
index 3942c57f1a04..59f695196ce8 100644
--- a/docs/api/python/intrin.rst
+++ b/docs/api/python/intrin.rst
@@ -6,7 +6,10 @@ tvm.intrin
 
    tvm.call_packed
    tvm.call_pure_intrin
+   tvm.call_intrin
    tvm.call_pure_extern
+   tvm.call_extern
+   tvm.call_llvm_intrin
    tvm.register_intrin_rule
    tvm.exp
    tvm.log
@@ -18,7 +21,10 @@ tvm.intrin
 
 .. autofunction:: tvm.call_packed
 .. autofunction:: tvm.call_pure_intrin
+.. autofunction:: tvm.call_intrin
 .. autofunction:: tvm.call_pure_extern
+.. autofunction:: tvm.call_extern
+.. autofunction:: tvm.call_llvm_intrin
 .. autofunction:: tvm.register_intrin_rule
 .. autofunction:: tvm.exp
 .. autofunction:: tvm.log
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 32d574340052..52499fb9186f 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -43,7 +43,7 @@
 #endif
 
 // TVM version
-#define TVM_VERSION "0.4.0"
+#define TVM_VERSION "0.5.dev"
 
 
 // TVM Runtime is DLPack compatible.
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 390849f8536d..f911829d38b1 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -100,5 +100,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
 
 
 # current version
-# We use the version of the incoming release for code that is under development
-__version__ = "0.4.0"
+# We use the version of the incoming release for code
+# that is under development.
+# The following line is set by tvm/python/update_version.py
+__version__ = "0.5.dev"
diff --git a/python/update_version.py b/python/update_version.py
new file mode 100644
index 000000000000..9e958f109479
--- /dev/null
+++ b/python/update_version.py
@@ -0,0 +1,66 @@
+"""
+This is the global script that set the version information of TVM.
+This script runs and update all the locations that related to versions
+
+List of affected files:
+- tvm-root/python/tvm/_ffi/libinfo.py
+- tvm-root/include/tvm/runtime/c_runtime_api.h
+- tvm-root/web/tvm_runtime.js
+- tvm-root/conda/tvm/meta.yaml
+- tvm-root/conda/topi/meta.yaml
+- tvm-root/conda/nnvm/meta.yaml
+- tvm-root/conda/tvm-libs/meta.yaml
+"""
+import os
+import re
+# current version
+# We use the version of the incoming release for code
+# that is under development
+__version__ = "0.5.dev"
+
+# Implementations
+def update(file_name, pattern, repl):
+    update = []
+    hit_counter = 0
+    need_update = False
+    for l in open(file_name):
+        result = re.findall(pattern, l)
+        if result:
+            assert len(result) == 1
+            hit_counter += 1
+            if result[0] != repl:
+                l = re.sub(pattern, repl, l)
+                need_update = True
+                print("%s: %s->%s" % (file_name, result[0], repl))
+            else:
+                print("%s: version is already %s" % (file_name, repl))
+
+        update.append(l)
+    if hit_counter != 1:
+        raise RuntimeError("Cannot find version in %s" % file_name)
+
+    if need_update:
+        with open(file_name, "w") as output_file:
+            for l in update:
+                output_file.write(l)
+
+
+def main():
+    curr_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    proj_root = os.path.abspath(os.path.join(curr_dir, ".."))
+    # python path
+    update(os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"),
+           r"(?<=__version__ = \")[.0-9a-z]+", __version__)
+    # C++ header
+    update(os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"),
+           "(?<=TVM_VERSION \")[.0-9a-z]+", __version__)
+    # conda
+    for path in ["tvm", "topi", "nnvm", "tvm-libs"]:
+        update(os.path.join(proj_root, "conda", path, "meta.yaml"),
+               "(?<=version = \")[.0-9a-z]+", __version__)
+    # web
+    update(os.path.join(proj_root, "web", "tvm_runtime.js"),
+           "(?<=@version )[.0-9a-z]+", __version__)
+
+if __name__ == "__main__":
+    main()
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index 786745d3ce88..2eab15093b72 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -2,7 +2,7 @@
  * TVM Javascript web runtime library.
  *
  * @projectname tvm
- * @version 0.1
+ * @version 0.5.dev
  */
 /* eslint no-unused-vars: "off" */
 /* eslint no-unexpected-multiline: "off" */

From c9c7d186d9f51966857ac307a97df6a3cd1b7976 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 21 Aug 2018 12:40:23 -0500
Subject: [PATCH 033/529] Add int8 gemm recipe (#1614)

---
 topi/recipe/gemm/gemm_int8.py | 185 ++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 topi/recipe/gemm/gemm_int8.py

diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
new file mode 100644
index 000000000000..61ef97d0a2bf
--- /dev/null
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -0,0 +1,185 @@
+"Example code to perform int8 GEMM"
+import logging
+import sys
+import numpy as np
+import tvm
+from tvm import autotvm
+
+DO_TUNING = True
+PRETUNED_INDEX = 75333
+
+def intrin_dot():
+    n = 4  # dp4a requires operands packed by 4
+    x = tvm.placeholder((n,), name='x', dtype='int8')
+    y = tvm.placeholder((n,), name='y', dtype='int8')
+    k = tvm.reduce_axis((0, n), name='k')
+
+    z = tvm.compute(
+        (1,), lambda _: tvm.sum(
+            x[k].astype('int32') * y[k].astype('int32'), axis=k))
+
+    def intrin_func(ins, outs):
+        xx, yy = ins
+        zz = outs[0]
+        ib = tvm.ir_builder.create()
+
+        dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
+                                                 xx.vload(0, dtype='int8x4'),
+                                                 yy.vload(0, dtype='int8x4'),
+                                                 zz.vload(0)))
+        ib.emit(dp4a)
+
+        body = ib.get()
+        return body, zz.vstore(0, 0), body
+
+    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
+        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
+                                    data_alignment=cfg.data_alignment,
+                                    offset_factor=cfg.offset_factor,
+                                    scope='local') for t in [x, y, z]}
+        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
+
+
+dot = intrin_dot()
+
+
+@autotvm.template
+def gemm_int8(n, m, l):
+    A = tvm.placeholder((n, l), name='A', dtype='int8')
+    B = tvm.placeholder((m, l), name='B', dtype='int8')
+
+    k = tvm.reduce_axis((0, l), name='k')
+    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype(
+        'int32'), axis=k), name='C')
+
+    cfg = autotvm.get_config()
+    s = tvm.create_schedule(C.op)
+    y, x = C.op.axis
+
+    AA = s.cache_read(A, 'shared', [C])
+    BB = s.cache_read(B, 'shared', [C])
+    AL = s.cache_read(AA, 'local', [C])
+    BL = s.cache_read(BB, 'local', [C])
+    CC = s.cache_write(C, 'local')
+
+    k = CC.op.reduce_axis[0]
+
+    cfg.define_split('tile_k', cfg.axis(k), num_outputs=3,
+                     filter=lambda entity: entity.size[2] == 4 and \
+                     entity.size[0] * 2 >= entity.size[1])
+
+    ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
+
+    s[CC].tensorize(ki, dot)
+
+    block_x = tvm.thread_axis('blockIdx.x')
+    block_y = tvm.thread_axis('blockIdx.y')
+    thread_x = tvm.thread_axis('threadIdx.x')
+    thread_y = tvm.thread_axis('threadIdx.y')
+
+    def block_size_filter(entity):
+        return entity.size[0] * 2 >= entity.size[1] * 2 and \
+                entity.size[1] <= 16 and entity.size[3] <= 4
+    cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter)
+    cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter)
+    by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y)
+    bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x)
+
+    s[C].bind(by, block_y)
+    s[C].bind(bx, block_x)
+    s[C].bind(tyz, tvm.thread_axis('vthread'))
+    s[C].bind(txz, tvm.thread_axis('vthread'))
+    s[C].bind(ty, thread_y)
+    s[C].bind(tx, thread_x)
+    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
+
+    s[CC].compute_at(s[C], tx)
+
+    yo, xo = CC.op.axis
+    s[CC].reorder(ko, kt, yo, xo, ki)
+    s[CC].unroll(kt)
+
+    for stage in [AL, BL]:
+        s[stage].compute_at(s[CC], kt)
+        _, xi = s[stage].split(stage.op.axis[1], factor=4)
+        s[stage].vectorize(xi)
+        s[stage].double_buffer()
+
+    cfg.define_knob('storage_align', [16, 48])
+    for stage in [AA, BB]:
+        s[stage].storage_align(s[stage].op.axis[0],
+                               cfg['storage_align'].val, 0)
+        s[stage].compute_at(s[CC], ko)
+
+        fused = s[stage].fuse(*s[stage].op.axis)
+        ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2])
+        tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2])
+        _, xi = s[stage].split(xi, factor=16)
+
+        s[stage].bind(ty, thread_y)
+        s[stage].bind(tx, thread_x)
+        s[stage].vectorize(xi)
+
+    cfg.define_knob('auto_unroll_max_step', [512, 1500])
+    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[C].pragma(by, 'unroll_explicit', False)
+
+    cfg.add_flop(n*m*l*2)
+    return s, [A, B, C]
+
+
+if __name__ == '__main__':
+    N = 2048
+    n = m = l = N
+
+    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+    task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
+    print(task.config_space)
+
+    measure_option = autotvm.measure_option(
+        measure_func='local', number=10, n_parallel=8, timeout=20)
+    log_name = 'gemm_int8.log'
+    if DO_TUNING:
+        tuner = autotvm.tuner.XGBTuner(task)
+        tuner.tune(n_trial=1000, measure_option=measure_option,
+               callbacks=[autotvm.callback.log_to_file(log_name)])
+
+        dispatch_context = autotvm.apply_history_best(log_name)
+        best_config = dispatch_context.query(task.target, task.workload)
+        print('\nBest config:')
+        print(best_config)
+    else:
+        config = task.config_space.get(PRETUNED_INDEX)
+        dispatch_context = autotvm.task.ApplyConfig(config)
+        print("Using pretuned config:")
+        print(config)
+
+    with dispatch_context:
+        with tvm.target.create('cuda'):
+            s, arg_bufs = gemm_int8(n, m, l)
+            f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8')
+
+    ctx = tvm.context('cuda', 0)
+
+    a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype='int8')
+    b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype='int8')
+
+    a = tvm.nd.array(a_np, ctx)
+    b = tvm.nd.array(b_np, ctx)
+    c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx)
+    f(a, b, c)
+
+    np.testing.assert_allclose(
+        c.asnumpy(),
+        np.dot(
+            a_np.astype('int32'),
+            b_np.T.astype('int32')),
+        rtol=1e-5)
+
+    num_ops = 2 * l * m * n
+    num_runs = 1000
+    timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
+    t = timer_f(a, b, c).mean
+    GOPS = num_ops / (t * 1e3) / 1e6
+    print("average time cost of %d runs = %g ms, %g GOPS." %
+          (num_runs, t * 1e3, GOPS))

From d879066af12d0ac09a875913675060413483b792 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 21 Aug 2018 16:35:59 -0700
Subject: [PATCH 034/529] [RUNTIME] Add TypedPackedFunc (#1626)

---
 include/tvm/runtime/packed_func.h | 251 ++++++++++++++++++++++++++++++
 tests/cpp/packed_func_test.cc     |  23 +++
 2 files changed, 274 insertions(+)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 63e8ca7cd16b..758d03b5b18b 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -118,6 +118,163 @@ class PackedFunc {
   FType body_;
 };
 
+/*!
+ * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc<R(Args..)>"
+ */
+template<typename FType>
+class TypedPackedFunc;
+
+/*!
+ * \anchor TypedPackedFuncAnchor
+ * \brief A PackedFunc wrapper to provide typed function signature.
+ * It is backed by a PackedFunc internally.
+ *
+ * TypedPackedFunc enables compile time type checking.
+ * TypedPackedFunc works with the runtime system:
+ * - It can be passed as an argument of PackedFunc.
+ * - It can be assigned to TVMRetValue.
+ * - It can be directly converted to a type-erased PackedFunc.
+ *
+ * Developers should prefer TypedPackedFunc over PackedFunc in C++ code
+ * as it enables compile time checking.
+ * We can construct a TypedPackedFunc from a lambda function
+ * with the same signature.
+ *
+ * \code
+ *  // user defined lambda function.
+ *  auto addone = [](int x)->int {
+ *    return x + 1;
+ *  };
+ *  // We can directly convert
+ *  // lambda function to TypedPackedFunc
+ *  TypedPackedFunc<int(int)> ftyped(addone);
+ *  // invoke the function.
+ *  int y = ftyped(1);
+ *  // Can be directly converted to PackedFunc
+ *  PackedFunc packed = ftype;
+ * \endcode
+ * \tparam R The return value of the function.
+ * \tparam Args The argument signature of the function.
+ */
+template<typename R, typename ...Args>
+class TypedPackedFunc<R(Args...)> {
+ public:
+  /*! \brief short hand for this function type */
+  using TSelf = TypedPackedFunc<R(Args...)>;
+  /*! \brief default constructor */
+  TypedPackedFunc() {}
+  /*!
+   * \brief construct by wrap a PackedFunc
+   *
+   * Example usage:
+   * \code
+   * PackedFunc packed([](TVMArgs args, TVMRetValue *rv) {
+   *   int x = args[0];
+   *   *rv = x + 1;
+   *  });
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(packed);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param packed The packed function
+   */
+  explicit TypedPackedFunc(PackedFunc packed)
+      : packed_(packed) {
+  }
+  /*!
+   * \brief construct from a lambda function with the same signature.
+   *
+   * Example usage:
+   * \code
+   * auto typed_lambda = [](int x)->int { return x + 1; }
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(typed_lambda);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  explicit TypedPackedFunc(const FLambda& typed_lambda) {
+    this->AssignTypedLambda(typed_lambda);
+  }
+  /*!
+   * \brief copy assignment operator from typed lambda
+   *
+   * Example usage:
+   * \code
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped;
+   * ftyped = [](int x) { return x + 1; }
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   * \returns reference to self.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  TSelf& operator=(FLambda typed_lambda) {  // NOLINT(*)
+    this->AssignTypedLambda(typed_lambda);
+    return *this;
+  }
+  /*!
+   * \brief copy assignment operator from PackedFunc.
+   * \param packed The packed function.
+   * \returns reference to self.
+   */
+  TSelf& operator=(PackedFunc packed) {
+    packed_ = packed;
+    return *this;
+  }
+  /*!
+   * \brief Invoke the operator.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  inline R operator()(Args ...args) const;
+  /*!
+   * \brief convert to PackedFunc
+   * \return the internal PackedFunc
+   */
+  operator PackedFunc() const {
+    return packed();
+  }
+  /*!
+   * \return reference the internal PackedFunc
+   */
+  const PackedFunc& packed() const {
+    return packed_;
+  }
+
+ private:
+  friend class TVMRetValue;
+  /*! \brief The internal packed function */
+  PackedFunc packed_;
+  /*!
+   * \brief Assign the packed field using a typed lambda function.
+   *
+   * \param flambda The lambda function.
+   * \tparam FLambda The lambda function type.
+   * \note We capture the lambda when possible for maximum efficiency.
+   */
+  template<typename FLambda>
+  inline void AssignTypedLambda(FLambda flambda);
+};
+
 /*! \brief Arguments into TVM functions. */
 class TVMArgs {
  public:
@@ -361,6 +518,10 @@ class TVMArgValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -446,6 +607,10 @@ class TVMRetValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -512,6 +677,10 @@ class TVMRetValue : public TVMPODValue_ {
     this->SwitchToClass(kFuncHandle, f);
     return *this;
   }
+  template<typename FType>
+  TVMRetValue& operator=(const TypedPackedFunc<FType>& f) {
+    return operator=(f.packed());
+  }
   TVMRetValue& operator=(Module m) {
     this->SwitchToClass(kModuleHandle, m);
     return *this;
@@ -847,6 +1016,10 @@ class TVMArgsSetter {
     values_[i].v_handle = const_cast<PackedFunc*>(&value);
     type_codes_[i] = kFuncHandle;
   }
+  template<typename FType>
+  void operator()(size_t i, const TypedPackedFunc<FType>& value) const {  // NOLINT(*)
+    operator()(i, value.packed());
+  }
   void operator()(size_t i, const Module& value) const {  // NOLINT(*)
     values_[i].v_handle = const_cast<Module*>(&value);
     type_codes_[i] = kModuleHandle;
@@ -894,6 +1067,84 @@ inline TVMRetValue PackedFunc::operator()(Args&& ...args) const {
   return rv;
 }
 
+namespace detail {
+template<typename R, int nleft, int index, typename F>
+struct unpack_call_dispatcher {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    unpack_call_dispatcher<R, nleft - 1, index + 1, F>
+        ::run(f, args_pack, rv,
+              std::forward<Args>(unpacked_args)...,
+              args_pack[index]);
+  }
+};
+
+template<typename R, int index, typename F>
+struct unpack_call_dispatcher<R, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    *rv = R(f(std::forward<Args>(unpacked_args)...));
+  }
+};
+
+template<int index, typename F>
+struct unpack_call_dispatcher<void, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    f(std::forward<Args>(unpacked_args)...);
+  }
+};
+
+template<typename R, int nargs, typename F>
+inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) {
+  unpack_call_dispatcher<R, nargs, 0, F>::run(f, args, rv);
+}
+
+template<typename R, typename ...Args>
+inline R call_packed(const PackedFunc& pf, Args&& ...args) {
+  return R(pf(std::forward<Args>(args)...));
+}
+
+template<typename R>
+struct typed_packed_call_dispatcher {
+  template<typename ...Args>
+  static inline R run(const PackedFunc& pf, Args&& ...args) {
+    return pf(std::forward<Args>(args)...);
+  }
+};
+
+template<>
+struct typed_packed_call_dispatcher<void> {
+  template<typename ...Args>
+  static inline void run(const PackedFunc& pf, Args&& ...args) {
+    pf(std::forward<Args>(args)...);
+  }
+};
+}  // namespace detail
+
+template<typename R, typename ...Args>
+template<typename FType>
+inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
+  packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) {
+      detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv);
+    });
+}
+
+template<typename R, typename ...Args>
+inline R TypedPackedFunc<R(Args...)>::operator()(Args... args) const {
+  return detail::typed_packed_call_dispatcher<R>
+      ::run(packed_, std::forward<Args>(args)...);
+}
+
 // extension and node type handling
 namespace detail {
 template<typename T, typename TSrc, bool is_ext>
diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc
index 9b2f1df73731..abe26fabe9ea 100644
--- a/tests/cpp/packed_func_test.cc
+++ b/tests/cpp/packed_func_test.cc
@@ -135,6 +135,29 @@ TEST(PackedFunc, Type) {
   CHECK(get_type2("float32x2").operator Type() == Float(32, 2));
 }
 
+TEST(TypedPackedFunc, HighOrder) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  using Int1Func = TypedPackedFunc<int(int)>;
+  using Int2Func = TypedPackedFunc<int(int, int)>;
+  using BindFunc = TypedPackedFunc<Int1Func(Int2Func, int value)>;
+  BindFunc ftyped;
+  ftyped = [](Int2Func f1, int value) -> Int1Func {
+    auto binded = [f1, value](int x) {
+      return f1(value, x);
+    };
+    Int1Func x(binded);
+    return x;
+  };
+  auto add = [](int x, int y) { return x + y; };
+  CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3);
+  PackedFunc f = ftyped(Int2Func(add), 1);
+  CHECK_EQ(f(3).operator int(), 4);
+  // call the type erased version.
+  Int1Func f1 = ftyped.packed()(Int2Func(add), 1);
+  CHECK_EQ(f1(3), 4);
+}
+
 // new namespoace
 namespace test {
 // register int vector as extension type

From 7e520f2983ad5e9b1480c1a6dc905d713e79170d Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Tue, 21 Aug 2018 18:35:03 -0700
Subject: [PATCH 035/529] check in (#1629)

---
 .../java/ml/dmlc/tvm/tvmrpc/MainActivity.java | 24 ++++---------------
 .../app/src/main/res/layout/content_main.xml  | 11 ---------
 .../app/src/main/res/values/strings.xml       |  3 +--
 3 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
index d80008bbe258..2ea4e4cb7528 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
@@ -39,11 +39,9 @@
 
 
 public class MainActivity extends AppCompatActivity {
-  private boolean skipRelaunch = true;
   // wait time before automatic restart of RPC Activity
   public static final int HANDLER_RESTART_DELAY = 5000;
 
-
   private void showDialog(String title, String msg) {
     AlertDialog.Builder builder = new AlertDialog.Builder(this);
     builder.setTitle(title);
@@ -91,7 +89,7 @@ private void setupRelaunch() {
     final Runnable rPCStarter = new Runnable() {
         public void run() {
             if (switchPersistent.isChecked()) {
-              System.err.println("relaunching RPC activity in 5s...");
+              System.err.println("relaunching RPC activity...");
               Intent intent = ((MainActivity) context).updateRPCPrefs();
               startActivity(intent);
             }
@@ -116,6 +114,7 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
         if (isChecked) {
           System.err.println("automatic RPC restart enabled...");
           updateRPCPrefs();
+          setupRelaunch();
         } else {
           System.err.println("automatic RPC restart disabled...");
           updateRPCPrefs();
@@ -123,29 +122,14 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
       }
     });
 
-    Button startRPC = findViewById(R.id.button_start_rpc);
-    startRPC.setOnClickListener(new View.OnClickListener() {
-        public void onClick(View v) {
-            Intent intent = ((MainActivity) context).updateRPCPrefs();
-            startActivity(intent);
-        }
-    });
-
     enableInputView(true);
   }
 
   @Override
   protected void onResume() {
     System.err.println("MainActivity onResume...");
-    System.err.println("skipRelaunch: " + skipRelaunch);
-    // if this is the first time onResume is called, do nothing, otherwise we
-    // may double launch
-    if (!skipRelaunch) {
-        enableInputView(true);
-        setupRelaunch();
-    } else {
-        skipRelaunch = false;
-    }
+    enableInputView(true);
+    setupRelaunch();
     super.onResume();
   }
 
diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 82be44d98451..69c1f76030df 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -78,15 +78,4 @@
             android:textOn="@string/switch_on" />
     </LinearLayout>
 
-    <LinearLayout
-        android:orientation="horizontal"
-        android:layout_width="fill_parent"
-        android:layout_height="wrap_content">
-        <Button
-            android:id="@+id/button_start_rpc"
-            android:layout_height="wrap_content"
-            android:layout_width="wrap_content"
-            android:text="@string/start_rpc" />
-    </LinearLayout>
-
 </LinearLayout>
diff --git a/apps/android_rpc/app/src/main/res/values/strings.xml b/apps/android_rpc/app/src/main/res/values/strings.xml
index 33caa374b496..f1ca2b90a001 100644
--- a/apps/android_rpc/app/src/main/res/values/strings.xml
+++ b/apps/android_rpc/app/src/main/res/values/strings.xml
@@ -9,11 +9,10 @@
     <string name="label_address">Address</string>
     <string name="label_port">Port</string>
     <string name="label_key">Key</string>
-    <string name="label_persistent">Keep RPC Alive</string>
+    <string name="label_persistent">Enable RPC</string>
 
     <string name="switch_on">Enabled</string>
     <string name="switch_off">Disabled</string>
 
-    <string name="start_rpc">Start RPC</string>
     <string name="stop_rpc">Stop RPC</string>
 </resources>

From 252bc9861c0bcc88b7865829b7ae655bd99f3f1b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 21 Aug 2018 18:35:32 -0700
Subject: [PATCH 036/529] [AUTOTVM] Allow fallback for template & Fix bugs in
 tuners (#1615)

* support fallback & fix bugs in tuners & clean topi test

* update task extraction

* update task extraction

* fix arm tutorial

* Update tune_nnvm_arm.py
---
 nnvm/python/nnvm/compiler/build_module.py     |   5 +-
 .../compiler/test_autotvm_task_extraction.py  |  63 +++++++
 python/tvm/autotvm/__init__.py                |   3 +-
 python/tvm/autotvm/measure/measure.py         |   5 +-
 python/tvm/autotvm/measure/measure_methods.py |  44 ++---
 python/tvm/autotvm/task/__init__.py           |   2 +-
 python/tvm/autotvm/task/dispatcher.py         | 117 +++++++++----
 python/tvm/autotvm/task/nnvm_integration.py   | 117 +++++++++----
 python/tvm/autotvm/task/space.py              |  56 ++++++-
 python/tvm/autotvm/task/task.py               |   2 +-
 python/tvm/autotvm/tophub.py                  |   7 +-
 python/tvm/autotvm/tuner/ga_tuner.py          |  10 +-
 python/tvm/autotvm/tuner/model_based_tuner.py |  33 ++--
 .../tvm/autotvm/tuner/sa_model_optimizer.py   |   2 +-
 python/tvm/autotvm/tuner/tuner.py             |  13 +-
 .../tvm/autotvm/tuner/xgboost_cost_model.py   | 119 ++++++++-----
 python/tvm/autotvm/tuner/xgboost_tuner.py     |  17 +-
 python/tvm/exec/tophub.py                     |   9 +-
 python/tvm/target.py                          |   1 +
 .../unittest/test_autotvm_dispatch_context.py |  44 +++--
 tests/python/unittest/test_autotvm_space.py   |  15 +-
 .../unittest/test_autotvm_xgboost_model.py    |   6 +-
 topi/python/topi/arm_cpu/conv2d.py            |  86 +++++++---
 topi/python/topi/arm_cpu/depthwise_conv2d.py  |  14 +-
 topi/python/topi/x86/injective.py             |   2 +-
 topi/tests/python/common.py                   |  12 ++
 .../python/test_topi_bitserial_conv2d.py      |  25 ++-
 .../python/test_topi_bitserial_conv2d_rasp.py |  16 +-
 topi/tests/python/test_topi_bnn.py            |   2 +-
 topi/tests/python/test_topi_broadcast.py      |  25 +--
 topi/tests/python/test_topi_clip.py           |   3 +-
 topi/tests/python/test_topi_conv2d.py         |  47 ------
 topi/tests/python/test_topi_conv2d_hwcn.py    |  14 +-
 topi/tests/python/test_topi_conv2d_nchw.py    | 157 ++++++++++++------
 .../python/test_topi_conv2d_transpose_nchw.py |  22 +--
 topi/tests/python/test_topi_dense.py          |   9 +-
 .../python/test_topi_depthwise_conv2d.py      |  38 ++---
 tutorials/autotvm/tune_nnvm_arm.py            |   5 +-
 38 files changed, 756 insertions(+), 411 deletions(-)
 create mode 100644 nnvm/tests/python/compiler/test_autotvm_task_extraction.py
 create mode 100644 topi/tests/python/common.py
 delete mode 100644 topi/tests/python/test_topi_conv2d.py

diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index 217598c9d79a..6fab4460b427 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -239,8 +239,9 @@ def build(graph, target=None, shape=None, dtype="float32",
         raise ValueError("Target is not set in env or passed as argument.")
     target = tvm.target.create(target)
 
-    # if not inside an autotvm config dispatch context, load pre-tuned parameters from TopHub
-    if autotvm.task.DispatchContext.current is None:
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
+    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
         tophub_context = autotvm.tophub.context(target)
     else:
         tophub_context = autotvm.util.EmptyContext()
diff --git a/nnvm/tests/python/compiler/test_autotvm_task_extraction.py b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
new file mode 100644
index 000000000000..fd14934f8ade
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
@@ -0,0 +1,63 @@
+"""Test task extraction for autotvm"""
+
+import nnvm.testing
+import nnvm.compiler
+from tvm import autotvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if name == 'resnet-18':
+        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'vgg-16':
+        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
+    elif name == 'dcgan':
+        net, params = nnvm.testing.dcgan.get_workload(batch_size=batch_size)
+        input_shape = (batch_size, 100)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def test_task_extraction():
+    target = 'llvm'
+    dtype = 'float32'
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+    assert len(tasks) == 12
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.dense,))
+    assert len(tasks) == 1
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 13
+
+    net, params, input_shape, out_shape = get_network('mobilenet', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 20
+
+    net, params, input_shape, out_shape = get_network('dcgan', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d_transpose,))
+    assert len(tasks) == 4
+
+if __name__ == '__main__':
+    test_task_extraction()
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 5b312d93d288..625b50c10853 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -25,5 +25,6 @@
 from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
-    ApplyHistoryBest as apply_history_best
+    register_topi_compute, register_topi_schedule, \
+    DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best
 from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 2325a970bc45..2d780eeaf004 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -89,8 +89,9 @@ def measure_option(measure_func,
 
         callable: customized build function for other backends (e.g. VTA).
                   See measure/measure_methods.py::default_build_func for example.
-    check_correctness: bool
-        Whether check correctness after measurement. This will use llvm cpu as reference.
+    check_correctness: bool, optional
+        Whether check correctness after measurement. This will use llvm cpu target to generate
+        reference output.
     replay_db : Database, optional
         The database that we retrieve saved MeasureResult from.
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index d845cc1f88fd..2d740b9493b2 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -83,7 +83,7 @@ def check_remote(target, device_key, tracker_addr=None, priority=2, timeout=10):
         The priority of this request, larger is more prior
     timeout: float, optional
         The timeout of this check (units: seconds).
-        If time is out, a RuntimerError will be raised.
+        If time is out, a RuntimeError will be raised.
     """
     def _check():
         remote = request_remote(device_key, tracker_addr, priority)
@@ -281,11 +281,11 @@ def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, re
         results: List of MeasureResult
             The results for input_pack
         """
-        remote = request_remote(key, (host, port), priority, session_timeout)
+        remote_args = (key, (host, port), priority, session_timeout)
 
         res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
                               ref_input, ref_output,
-                              remote)
+                              remote_args)
         return res
 
     fmeasure.pack_size = pack_size
@@ -294,7 +294,7 @@ def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, re
 
 
 def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                    ref_input=None, ref_output=None, remote=None):
+                    ref_input=None, ref_output=None, remote_args=None):
     """Measure the time cost for a pack of inputs.
 
     (Note: A pack is a list of inputs which will be measured inside a same RPC session)
@@ -318,8 +318,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
         Reference input for checking correctness
     ref_output: Array of np.ndarray, optional
         Reference output for checking correctness
-    remote: RPCSession, optional
-        The remote RPC session
+    remote_args: Tuple, optional
+        The arguments to request_remote. If is not None, will use remote rpc devices.
 
     Returns
     -------
@@ -327,7 +327,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
         The list of results of measurement.
     """
     res_pack = []
-    tmp_dir = util.tempdir() if remote else None
+    tmp_dir = util.tempdir() if remote_args else None
+    assert len(input_pack) == 1, "Only supports input_pack == 1 for now"
 
     for inp in input_pack:
         tic = time.time()
@@ -360,31 +361,36 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
                                           tstamp - tic, tstamp))
             continue
 
-        # upload built module
-        if remote:
-            remote.upload(tmp_dir.relpath(filename))
-            func = remote.load_module(filename)
-            ctx = remote.context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
-        else:
-            ctx = context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
-
         # measure time
         errno = MeasureErrorNo.NO_ERROR
         try:
+            # upload built module
+            if remote_args:
+                remote = request_remote(*remote_args)
+                remote.upload(tmp_dir.relpath(filename))
+                func = remote.load_module(filename)
+                ctx = remote.context(str(inp.target), 0)
+                time_f = func.time_evaluator(
+                    func.entry_name, ctx, number=number, repeat=repeat)
+            else:
+                ctx = context(str(inp.target), 0)
+                time_f = func.time_evaluator(
+                    func.entry_name, ctx, number=number, repeat=repeat)
+
+            # set input
             if ref_input:
                 args = [nd.array(x, ctx=ctx) for x in ref_input]
             else:
                 args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
                         for x in arg_bufs]
+
             costs = time_f(*args).results
             if len(costs) > 2:  # remove largest and smallest value to reduce variance
                 costs = list(costs)
                 costs.sort()
                 costs = tuple(costs[1:-1])
+
+            # check correctness of output
             if ref_output:
                 for expected, real in zip(ref_output, args):
                     if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 0d43f92656cd..7592fc5af7df 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -9,7 +9,7 @@
 from .task import Task, create, register, template, get_config, args_to_workload
 from .space import ConfigSpace, ConfigEntity
 from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, dispatcher
+from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, FallbackContext, dispatcher
 
 from .topi_integration import register_topi_compute, register_topi_schedule
 from .nnvm_integration import extract_from_graph
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 93f6d584abfa..ec1dcc44f141 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -21,7 +21,7 @@
 
 from tvm import target as _target
 
-from .space import ConfigSpace
+from .space import FallbackConfigEntity
 
 logger = logging.getLogger('autotvm')
 
@@ -34,9 +34,36 @@ class DispatchContext(object):
     """
     current = None
 
+    def __init__(self):
+        self._old_ctx = DispatchContext.current
+
     def query(self, target, workload):
         """
-        Query the context to get the specific implementation.
+        Query the context to get the specific config for a template.
+        If cannot find the result inside this context, this function will query it
+        from the upper contexts.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        ret = self._query_inside(target, workload)
+        if ret is None:
+            ret = self._old_ctx.query(target, workload)
+        return ret
+
+    def _query_inside(self, target, workload):
+        """
+        Query the context to get the specific config for a template.
+        This function only query config inside this context.
 
         Parameters
         ----------
@@ -117,17 +144,17 @@ def _do_reg(myf):
     def dispatch_func(func, *args, **kwargs):
         """The wrapped dispatch function"""
         tgt = _target.current_target()
-        context = DispatchContext.current
-        if context is None:
-            raise RuntimeError("DispatchContext is not initialized")
         workload = func(*args, **kwargs)
-        cfg = context.query(tgt, workload)
-        if cfg.template_key:
-            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
-        else:
-            assert dispatch_dict, "No func registered for this dispatcher"
+        cfg = DispatchContext.current.query(tgt, workload)
+        if cfg.is_fallback and not cfg.template_key:
+            # first try 'direct' template
+            if 'direct' in dispatch_dict:
+                return dispatch_dict['direct'](cfg, *args, **kwargs)
+            # otherwise pick a random template
             for v in dispatch_dict.values():
                 return v(cfg, *args, **kwargs)
+        else:
+            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
 
     fdecorate = decorate(fworkload, dispatch_func)
     fdecorate.register = register
@@ -135,7 +162,7 @@ def dispatch_func(func, *args, **kwargs):
 
 
 class ApplyConfig(DispatchContext):
-    """Apply a specific config entity during query.
+    """Apply a deterministic config entity for all queries.
 
     Parameters
     ----------
@@ -147,7 +174,7 @@ def __init__(self, config):
         self._config = config
         self.workload = None
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         """Override query"""
         self.workload = workload
         return self._config
@@ -164,20 +191,12 @@ class ApplyHistoryBest(DispatchContext):
         If is str, then it should be the filename of a records log file.
                    Each row of this file is an encoded record pair.
         Otherwise, it is an iterator.
-    default: ConfigEntity, optional
-        The default config to return when no history records
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
-    def __init__(self, records, default=None, allow_fallback=False):
+    def __init__(self, records):
         super(ApplyHistoryBest, self).__init__()
 
         self.best_by_targetkey = {}
         self.best_by_model = {}
-        self._default = default
-        self._allow_fallback = allow_fallback
-        self.fallback = {}
 
         if records:
             self.load(records)
@@ -234,7 +253,7 @@ def load(self, records):
 
         logger.debug("Finish loading %d records", counter)
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         if target is None:
             raise RuntimeError("Need a target context to find the history best. "
                                "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
@@ -254,20 +273,50 @@ def query(self, target, workload):
             if key in self.best_by_targetkey:
                 return self.best_by_targetkey[key][0].config
 
-        if self._default:
-            return self._default
+        return None
+
+
+class FallbackContext(DispatchContext):
+    """
+    A fallback dispatch context.
+
+    Any tunable template can be called under this context.
+    This is the root context.
+    """
+
+    def __init__(self):
+        super(FallbackContext, self).__init__()
+        self.memory = {}
+        self.silent = False
+
+    def _query_inside(self, target, workload):
+        key = (str(target), workload)
+        if key in self.memory:
+            return self.memory[key]
 
-        if self._allow_fallback:
-            key = (target, workload)
-            if key in self.fallback:
-                return self.fallback[key]
+        if not self.silent:
             logger.warning(
                 "Cannot find config for target=%s, workload=%s. A fallback configuration "
                 "is used, which may bring great performance regression.", target, workload)
-            cfg = ConfigSpace()
-            self.fallback[key] = cfg
-            return cfg
+        cfg = FallbackConfigEntity()
+
+        # cache this config
+        self.memory[key] = cfg
+        return cfg
+
+    def clear_cache(self, target, workload):
+        """Clear fallback cache. Pass the same argument as _query_inside to this function
+        to clean the cache.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+        """
+        key = (str(target), workload)
+        if key in self.memory:
+            del self.memory[key]
 
-        raise RuntimeError(
-            "Cannot find config for target=%s, workload=%s. You need to do tuning "
-            "for this workload to get the config." % (target, workload))
+DispatchContext.current = FallbackContext()
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 1b50869fc378..9138cc288372 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -7,11 +7,10 @@
 import logging
 
 
-from ... import tensor, placeholder, target as _target
+from ... import tensor, placeholder, create_schedule, target as _target
 
 from ..util import get_const_tuple
 from .task import create, register
-from .dispatcher import ApplyHistoryBest
 
 logger = logging.getLogger('autotvm')
 
@@ -56,40 +55,68 @@ def __init__(self):
         import topi
         import nnvm
 
+        # NOTE: To add more symbols, you only need to change the following lists
+        # nnvm symbol -> topi compute
         self.symbol2topi = {
             nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
-            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
+            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+            nnvm.sym.dense: [topi.nn.dense],
         }
 
+        # topi compute -> autotvm task name
         self.topi_to_task = {
             topi.nn.conv2d: "topi_nn_conv2d",
             topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
             topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
+            topi.nn.dense: "topi_nn_dense",
         }
 
-        self._register_dummy()
+        self.topi_to_schedule = {
+            topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw,
+                             topi.generic.schedule_conv2d_nhwc],
+            topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
+                                            topi.generic.schedule_depthwise_conv2d_nhwc],
+            topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
+            topi.nn.dense: [topi.generic.schedule_dense],
+        }
+
+        self._register_tracing()
         self._register_topi_task()
         self.task_collection = []
+        self.wanted_topi_funcs = list(self.topi_to_task.keys())
+
+    def _register_tracing(self):
+        """Register tracing function to track the topi function call"""
+        # register topi compute for "tracing" target
+        for topi_compute in self.topi_to_task:
+            def _local_scope(compute_func):
+                """start a scope to hold the local function in for loop"""
 
-    def _register_dummy(self):
-        """Register dummy function to track the topi function call"""
-        for func in self.topi_to_task:
-            def _local_scope(local_func):
-                """build a scope to holds the function"""
-                @local_func.register("dummy", )
-                def _dummy_func(*args, **kwargs):
+                @compute_func.register("tracing", )
+                def _tracing_topi_compute(*args, **kwargs):
                     assert not kwargs, "Do not support extracting tuning tasks when" \
                                        "kwargs is used in TOPI function call." \
                                        "Please modify it to use only positional args."
 
-                    if (self.topi_to_task[local_func], serialize_args(args)) \
-                            not in self.task_collection:
-                        self.task_collection.append((self.topi_to_task[local_func],
-                                                     serialize_args(args)))
-                    with _target.create("opencl"):
-                        return local_func(*args)
+                    if compute_func in self.wanted_topi_funcs:  # record this call
+                        key = (self.topi_to_task[compute_func], serialize_args(args))
+                        if key not in self.task_collection:
+                            self.task_collection.append(key)
+
+                    return compute_func.fdefault(*args)
+            _local_scope(topi_compute)
+
+        # register topi schedule for "tracing" target
+        for topi_compute in self.topi_to_task:
+            for topi_schedule in self.topi_to_schedule[topi_compute]:
+                def _local_scope_(schedule_func):
+                    """start a scope to hold the local function in for loop"""
 
-            _local_scope(func)
+                    @schedule_func.register("tracing", )
+                    def _tracing_topi_compute(outs):
+                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
+                        return create_schedule([x.op for x in outs])
+                _local_scope_(topi_schedule)
 
     def _register_topi_task(self):
         """register tuning wrapper for topi function"""
@@ -125,17 +152,47 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
             s = topi.generic.schedule_conv2d_transpose_nchw([C])
             return s, [A, W, C]
 
-    def reset(self):
-        """Reset task collections"""
+        @register("topi_nn_dense")
+        def _topi_nn_dense(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            data, weight, bias = args
+            C = topi.nn.dense(*args, **kwargs)
+            s = topi.generic.schedule_dense([C])
+            if bias is not None:
+                return s, [data, weight, bias, C]
+            return s, [data, weight, C]
+
+    def reset(self, wanted_topi_funcs):
+        """Reset task collections
+
+        Parameters
+        ----------
+        wanted_topi_funcs: List of function
+            The topi function to be extracted
+        """
         self.task_collection = []
+        self.wanted_topi_funcs = wanted_topi_funcs
 
     def get_tasks(self):
-        """Get collected tasks"""
+        """Get collected tasks
+
+        Returns
+        -------
+        tasks: List of tuple(name, args)
+            A list of tasks extracted from the nnvm graph
+        """
         return self.task_collection
 
     @staticmethod
     def get():
-        """Get the single instance of TaskExtractEnv"""
+        """Get the single instance of TaskExtractEnv
+
+        Returns
+        -------
+        env: TaskExtractEnv
+            The single instance of TaskExtractEnv
+        """
         if not TaskExtractEnv.current:
             TaskExtractEnv.current = TaskExtractEnv()
         return TaskExtractEnv.current
@@ -144,8 +201,8 @@ def get():
 def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     """ Extract tuning tasks from a nnvm graph.
 
-    This function collects tunning tasks by building the graph
-    with a "dummy" target and tracing all the calls to topi.
+    This function collects tuning tasks by building the graph
+    with a "tracing" target and tracing all the calls to topi.
 
     Parameters
     ----------
@@ -158,7 +215,7 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     target: tvm.target.Target
         The compilation target
     symbols : Array of nnvm.symbol
-        Array of nnvm symbols
+        Array of nnvm symbols want to be tuned
     target_host: tvm.target.Target
         The host compilation target
 
@@ -179,16 +236,16 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
     # run compiler to collect all TOPI calls during compilation
-    env.reset()
+    env.reset(topi_funcs)
 
     # disable logger temporarily
     old_state = logger.disabled
     logger.disabled = True
 
-    # use a dummy target to do a fake compile for collecting topi calls
-    dummy_target = _target.create("opencl -device=dummy")
-    with ApplyHistoryBest([], allow_fallback=True):
-        nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+    nnvm.compiler.engine.clear_cache()
+    nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
 
     logger.disabled = old_state
 
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index ea823c6f2760..5a34353acfe9 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -567,15 +567,16 @@ class ConfigSpace(object):
     """
     def __init__(self):
         # private dict to provide sugar
-        self.space_map = OrderedDict()  # name -> space
+        self.space_map = OrderedDict()    # name -> space
         self._collect = True
         self._length = None
-        self._entity_map = OrderedDict()
+        self._entity_map = OrderedDict()  # name -> entity
         self._constraints = []
         self.errors = []
         self.template_key = None
         self.code_hash = None
         self.flop = 0
+        self.is_fallback = False
 
     @staticmethod
     def axis(var):
@@ -607,6 +608,15 @@ def define_split(self, name, axis, policy='all', **kwargs):
             If is 'candidate', try listed candidate.
         kwargs: dict
             extra arguments for policy
+            see examples below for how to use filter
+
+        Examples
+        --------
+        >>> # use custom candidates
+        >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])
+
+        >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
+        >>> cfg.define_split('tile_y', y, policy='all', filter=lambda x: x.size[-1] <= 4)
         """
         axes = [axis]
         return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
@@ -889,3 +899,45 @@ def from_json_dict(json_dict):
     def __repr__(self):
         return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key,
                                 self.code_hash, self.index)
+
+class FallbackConfigEntity(ConfigSpace):
+    """The config entity created to support fallback"""
+
+    def __init__(self):
+        super(FallbackConfigEntity, self).__init__()
+        self.is_fallback = True
+
+    def fallback_split(self, name, constraints):
+        """Fallback a split knob
+
+        Parameters
+        ----------
+        name: str
+            name of the knob
+        constraints: List of int
+            The maximum tile size for every dimension. Value `-1` means no constraint.
+
+        Examples
+        --------
+        If you use cfg.define_split('tile_0', 128, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [4, 8, 4]
+
+        If you use cfg.define_split('tile_0', 49, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [7, 7, 1]
+        """
+        space = self.space_map[name]
+        assert len(constraints) == space.num_outputs
+        indices = np.arange(space.num_outputs)
+
+        # '-1' means no constraint
+        constraints = [x if x != -1 else 1e10 for x in constraints]
+
+        for entity in reversed(space.entities):
+            if all([entity.size[i] <= constraints[i] for i in indices]):
+                self._entity_map[name] = entity
+                return
+
+        raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
+
+    def __repr__(self):
+        return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash)
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index f8923fca56e3..ab52788c8d91 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -206,7 +206,7 @@ def args_to_workload(x):
     elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)):
         return x.value
     elif x is None:
-        return None
+        return 0
     else:
         raise RuntimeError('Do not support type "%s" in argument. Consider to use'
                            'primitive types only' % type(x))
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index e11bb7a4fc92..3d7b249df905 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -28,7 +28,7 @@ def _alias(name):
     return table.get(name, name)
 
 
-def context(target, extra_files=None, allow_fallback=False):
+def context(target, extra_files=None):
     """Return the dispatch context with pre-tuned parameters.
     The corresponding downloaded *.log files under tophub root path will be loaded.
     Users can also add their own files in argument `extra_files`.
@@ -39,12 +39,9 @@ def context(target, extra_files=None, allow_fallback=False):
         The compilation target
     extra_files: list of str, optional
         Extra log files to load
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
     rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-    best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
+    best_context = ApplyHistoryBest([])
 
     if isinstance(target, str):
         target = _target.create(target)
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index b92737ed5317..b9d900e49577 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -86,13 +86,9 @@ def update(self, inputs, results):
 
             # cross over
             indices = np.arange(len(genes))
-            max_score = np.max(scores)
-            if max_score < 1e-8:
-                probs = np.empty_like(scores)
-                probs[:] = 1.0 / len(scores)
-            else:
-                scores /= max_score
-                probs = scores / np.sum(scores)
+            scores += 1e-8
+            scores /= np.max(scores)
+            probs = scores / np.sum(scores)
             tmp_genes = []
             for _ in range(self.pop_size):
                 p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index d1c1b16d3181..62fc57f2e869 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from .tuner import Tuner
-
+from ..env import GLOBAL_SCOPE
 
 class FeatureCache(object):
     """Feature cache manager for cache sharing between different cost models"""
@@ -119,11 +119,9 @@ def load_basemodel(self, base_model):
         """
         raise NotImplementedError()
 
-    def clone_new(self):
-        """Clone a new model with the same parameters.
-        This function will only copy hyperparameters of the tuner, not all the trained model
-
-        This is used for deriving a base model conveniently
+    def spawn_base_model(self):
+        """Clone a base model with the same parameters.
+        The base model is used to fit history data in transfer learning.
 
         Returns
         -------
@@ -221,7 +219,9 @@ def next_batch(self, batch_size):
                     break
                 self.trial_pt += 1
 
-            if self.trial_pt >= len(self.trials):  # trial list is empty, choose randomly
+            if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size):
+                # if the trial list is empty or
+                # the tuner is doing the last 5% trials (e-greedy), choose randomly
                 index = np.random.randint(len(self.space))
                 while index in self.visited:
                     index = np.random.randint(len(self.space))
@@ -264,18 +264,16 @@ def update(self, inputs, results):
             self.train_ct += 1
 
     def load_history(self, data_set):
-        # filter data, only pick the data with a same task
-        data = []
-        for inp, res in data_set:
-            if inp.task.name == self.task.name and \
-                            inp.config.template_key == self.task.config_space.template_key:
-                data.append((inp, res))
-        if not data:
-            return
+        # set in_tuning as True to make the feature extraction consistent
+        GLOBAL_SCOPE.in_tuning = True
 
         # fit base model
-        base_model = self.cost_model.clone_new()
-        base_model.fit_log(data, self.plan_size)
+        base_model = self.cost_model.spawn_base_model()
+        success = base_model.fit_log(data_set, self.plan_size)
+
+        if not success:
+            GLOBAL_SCOPE.in_tuning = False
+            return
 
         # use base model to select initial points
         if not self.trials:
@@ -285,6 +283,7 @@ def load_history(self, data_set):
             self.trial_pt = 0
 
         self.cost_model.load_basemodel(base_model)
+        GLOBAL_SCOPE.in_tuning = False
 
     def has_next(self):
         return len(self.visited) < len(self.space)
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 6e1c373c113f..1947c6dde4e0 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -87,7 +87,7 @@ def find_maximums(self, model, num, exclusive):
 
             new_scores = model.predict(new_points)
 
-            ac_prob = np.exp((new_scores - scores) / t)
+            ac_prob = np.exp((new_scores - scores) / (t + 1e-2))
             ac_index = np.random.random(len(ac_prob)) < ac_prob
 
             points[ac_index] = new_points[ac_index]
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 91004cba4603..cffbb9798392 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -31,6 +31,10 @@ def __init__(self, task, **kwargs):
         self.best_measure_pair = None
         self.best_iter = 0
 
+        # time to leave
+        self.ttl = None
+        self.n_trial = None
+
     def has_next(self):
         """Whether has next untried config in the space
 
@@ -76,7 +80,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         measure_option: dict
             The options for how to measure generated code.
             You should use the return value ot autotvm.measure_option for this argument.
-        early_stopping: int
+        early_stopping: int, optional
             Early stop the tuning when not finding better configs in this number of trials
         callbacks: List of callable
             A list of callback functions. The signature of callback function is
@@ -87,6 +91,8 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         measure_batch = create_measure_batch(self.task, measure_option)
         n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
+        self.n_trial = n_trial
+
         old_level = logger.level
 
         GLOBAL_SCOPE.in_tuning = True
@@ -127,11 +133,12 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             for callback in callbacks:
                 callback(self, inputs, results)
 
-            if i > self.best_iter + early_stopping:
+            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
+            if i >= self.best_iter + early_stopping:
                 logger.debug("Early stopped. Best iter: %d.", self.best_iter)
                 break
 
-            if error_ct > 50:
+            if error_ct > 150:
                 logger.warning("Too many errors happen in the tuning. Now is in debug mode")
                 logger.setLevel(logging.DEBUG)
             else:
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
index 178e92476752..bda3ee26e062 100644
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py
@@ -31,8 +31,12 @@ class XGBoostCostModel(CostModel):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can swith to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -46,8 +50,11 @@ class XGBoostCostModel(CostModel):
         The number of threads.
     log_interval: int, optional
         If is not none, the cost model will print training log every `log_interval` iterations.
+    upper_model: XGBoostCostModel, optional
+        The upper model used in transfer learning
     """
-    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25):
+    def __init__(self, task, feature_type, loss_type, num_threads=4, log_interval=25,
+                 upper_model=None):
         super(XGBoostCostModel, self).__init__()
 
         if xgb is None:
@@ -109,35 +116,51 @@ def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval
         else:
             raise RuntimeError("Invalid feature type " + feature_type)
 
-        self.feature_cache = FeatureCache()
+        if upper_model:  # share a same feature cache with upper model
+            self.feature_cache = upper_model.feature_cache
+        else:
+            self.feature_cache = FeatureCache()
+        self.upper_model = upper_model
         self.feature_extra_ct = 0
         self.pool = None
         self.base_model = None
-        self.upper_model = None
 
         self._sample_size = 0
+        self._reset_pool(self.space, self.target, self.task)
 
-        self._reset_pool()
+    def _reset_pool(self, space, target, task):
+        """reset processing pool for feature extraction"""
+
+        if self.upper_model:  # base model will reuse upper model's pool,
+            self.upper_model._reset_pool(space, target, task)
+            return
+
+        self._close_pool()
 
-    def _reset_pool(self):
-        # reset processing pool for feature extraction
-        if self.pool:
-            self.pool.terminate()
-            self.pool.join()
-            del self.pool
         # use global variable to pass common arguments
         global _extract_space, _extract_target, _extract_task
-        _extract_space = self.space
-        _extract_target = self.target
-        _extract_task = self.task
+        _extract_space = space
+        _extract_target = target
+        _extract_task = task
         self.pool = multiprocessing.Pool(self.num_threads)
 
+    def _close_pool(self):
+        if self.pool:
+            self.pool.terminate()
+            self.pool.join()
+            self.pool = None
+
+    def _get_pool(self):
+        if self.upper_model:
+            return self.upper_model._get_pool()
+        return self.pool
+
     def _base_model_discount(self):
-        return 1.0 / (2 ** (self._sample_size / 50.0))
+        return 1.0 / (2 ** (self._sample_size / 64.0))
 
     def fit(self, xs, ys, plan_size):
         tic = time.time()
-        self._reset_pool()
+        self._reset_pool(self.space, self.target, self.task)
 
         x_train = self._get_feature(xs)
         y_train = np.array(ys)
@@ -150,8 +173,12 @@ def fit(self, xs, ys, plan_size):
         self._sample_size = len(x_train)
 
         if self.base_model:
-            dtrain.set_base_margin(self._base_model_discount() *
-                                   self.base_model.predict(xs, output_margin=True))
+            discount = self._base_model_discount()
+            if discount < 0.05:  # discard base model
+                self.base_model.upper_model = None
+                self.base_model = None
+            else:
+                dtrain.set_base_margin(discount * self.base_model.predict(xs, output_margin=True))
 
         self.bst = xgb.train(self.xgb_params, dtrain,
                              num_boost_round=8000,
@@ -172,11 +199,19 @@ def fit(self, xs, ys, plan_size):
 
     def fit_log(self, records, plan_size):
         tic = time.time()
-        self._reset_pool()
 
-        args = list(records)
-        logger.debug("XGB load %d entries from history log file", len(args))
+        # filter data, only pick the data with a same task
+        data = []
+        for inp, res in records:
+            if inp.task.name == self.task.name and \
+                            inp.config.template_key == self.task.config_space.template_key:
+                data.append((inp, res))
+
+        logger.debug("XGB load %d entries from history log file", len(data))
 
+        # extract feature
+        self._reset_pool(self.space, self.target, self.task)
+        pool = self._get_pool()
         if self.fea_type == 'itervar':
             feature_extract_func = _extract_itervar_feature_log
         elif self.fea_type == 'knob':
@@ -185,10 +220,21 @@ def fit_log(self, records, plan_size):
             feature_extract_func = _extract_curve_feature_log
         else:
             raise RuntimeError("Invalid feature type: " + self.fea_type)
-        res = self.pool.map(feature_extract_func, args)
-        xs, ys = zip(*res)
-        xs, ys = np.array(xs), np.array(ys)
+        res = pool.map(feature_extract_func, data)
+
+        # filter out feature with different shapes
+        fea_len = len(self._get_feature([0])[0])
+
+        xs, ys = [], []
+        for x, y in res:
+            if len(x) == fea_len:
+                xs.append(x)
+                ys.append(y)
 
+        if len(xs) < 500:  # no enough samples
+            return False
+
+        xs, ys = np.array(xs), np.array(ys)
         x_train = xs
         y_train = ys
         y_max = np.max(y_train)
@@ -212,6 +258,8 @@ def fit_log(self, records, plan_size):
 
         logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
 
+        return True
+
     def predict(self, xs, output_margin=False):
         feas = self._get_feature(xs)
         dtest = xgb.DMatrix(feas)
@@ -224,20 +272,12 @@ def predict(self, xs, output_margin=False):
 
     def load_basemodel(self, base_model):
         self.base_model = base_model
-        if isinstance(base_model, XGBoostCostModel):
-            # share feature cache
-            base_model.feature_cache = self.feature_cache
-
-            # close thread pool
-            if base_model.pool:
-                base_model.pool.terminate()
-                base_model.pool.join()
-                del base_model.pool
-            self.base_model.upper_model = self
-
-    def clone_new(self):
+        self.base_model._close_pool()
+        self.base_model.upper_model = self
+
+    def spawn_base_model(self):
         return XGBoostCostModel(self.task, self.fea_type, self.loss_type,
-                                self.num_threads, self.log_interval)
+                                self.num_threads, self.log_interval, self)
 
     def _get_feature(self, indexes):
         """get features for indexes, run extraction if we do not have cache for them"""
@@ -251,7 +291,7 @@ def _get_feature(self, indexes):
         need_extract = [x for x in indexes if x not in fea_cache]
 
         if need_extract:
-            pool = self.pool if self.upper_model is None else self.upper_model.pool
+            pool = self._get_pool()
             feas = pool.map(self.feature_extract_func, need_extract)
             for i, fea in zip(need_extract, feas):
                 fea_cache[i] = fea
@@ -261,6 +301,9 @@ def _get_feature(self, indexes):
             ret[i, :] = fea_cache[ii]
         return ret
 
+    def __del__(self):
+        self._close_pool()
+
 
 _extract_space = None
 _extract_target = None
diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py
index 237ac4e19ab1..886c82a4d749 100644
--- a/python/tvm/autotvm/tuner/xgboost_tuner.py
+++ b/python/tvm/autotvm/tuner/xgboost_tuner.py
@@ -20,8 +20,12 @@ class XGBTuner(ModelBasedTuner):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can swith to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -32,8 +36,7 @@ class XGBTuner(ModelBasedTuner):
         If is 'rank', use pairwise rank loss to train cost model.
                      The cost model predicts relative rank score.
     num_threads: int, optional
-        The number of threads.
-    optimizer: str or ModelOptimizer, optional
+        The number of threads.  optimizer: str or ModelOptimizer, optional
         If is 'sa', use a default simulated annealing optimizer.
         Otherwise it should be a ModelOptimizer object.
     diversity_filter_ratio: int or float, optional
@@ -45,7 +48,7 @@ class XGBTuner(ModelBasedTuner):
         If is 0, output nothing.
         Otherwise, output debug information every `verbose` iterations.
     """
-    def __init__(self, task, plan_size=32,
+    def __init__(self, task, plan_size=64,
                  feature_type='itervar', loss_type='rank', num_threads=None,
                  optimizer='sa', diversity_filter_ratio=None, log_interval=50):
         cost_model = XGBoostCostModel(task,
@@ -62,3 +65,9 @@ def __init__(self, task, plan_size=32,
 
         super(XGBTuner, self).__init__(task, cost_model, optimizer,
                                        plan_size, diversity_filter_ratio)
+
+    def tune(self, *args, **kwargs):  # pylint: disable=arguments-differ
+        super(XGBTuner, self).tune(*args, **kwargs)
+
+        # manually close pool to avoid multiprocessing issues
+        self.cost_model._close_pool()
diff --git a/python/tvm/exec/tophub.py b/python/tvm/exec/tophub.py
index 9dd951a52701..9bfd6866506d 100644
--- a/python/tvm/exec/tophub.py
+++ b/python/tvm/exec/tophub.py
@@ -8,8 +8,8 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--download", type=str, nargs='+',
-                        help="Target to download. Use 'all' to download for all targets")
+    parser.add_argument("-d", "--download", type=str, nargs='+',
+                        help="The targets to download. Use 'all' to download for all targets")
     parser.add_argument("-l", "--list", action='store_true', help="List available packages")
     args = parser.parse_args()
 
@@ -21,8 +21,7 @@
         print("-" * 41)
         for target, info in info:
             print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
-
-    if args.download:
+    elif args.download:
         info = list_packages()
         all_targets = [x[0] for x in info]
         if 'all' in args.download:
@@ -34,3 +33,5 @@
             if t not in all_targets:
                 print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
             download_package(t)
+    else:
+        parser.print_help()
diff --git a/python/tvm/target.py b/python/tvm/target.py
index e2d780f75264..9d5200661c6c 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -263,6 +263,7 @@ def dispatch_func(func, *args, **kwargs):
                     "Keyword arguments cannot be used when invoking generic_func %s" % func_name)
             return generic_func_node(*args)
         fresult = decorate(fdefault, dispatch_func)
+        fresult.fdefault = fdefault
         fresult.register = register
         return fresult
     return fdecorate
diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py
index 6c718e5bd041..1f2a7e276a32 100644
--- a/tests/python/unittest/test_autotvm_dispatch_context.py
+++ b/tests/python/unittest/test_autotvm_dispatch_context.py
@@ -3,34 +3,48 @@
 to the parameters of workload"""
 
 from collections import namedtuple
+from tvm import autotvm
 from tvm.autotvm.task import dispatcher, DispatchContext
 
-SimpleWorkload = namedtuple("SimpleWorkload", ["key"])
-SimpleConfig = namedtuple("SimpleConfig", ["template_key"])
+SimpleConfig = namedtuple('SimpleConfig', ('template_key', 'is_fallback'))
 
 def test_dispatch():
     @dispatcher
     def my_dispatcher(a, b):
-        return SimpleWorkload(key=a + b)
-
-    @my_dispatcher.register("spatial_pack")
-    def _sp_pack_add(cfg, a, b):
-        return b + 100
+        return (a, b)
 
     @my_dispatcher.register("im2col")
-    def _im2col_add(cfg, a, b):
-        return a + 1
+    def _im2col(cfg, a, b):
+        return a
+
+    @my_dispatcher.register("spatial_pack")
+    def _spatial_pack(cfg, a, b):
+        return b
 
     class SimpleDispatcher(DispatchContext):
         def query(self, target, workload):
-            tkey = "spatial_pack" if workload.key > 2 else "im2col"
-            return SimpleConfig(tkey)
+            a, b = workload
+            tkey = "spatial_pack" if a + b > 2 else "im2col"
+            cfg = SimpleConfig(tkey, False)
+            return cfg
 
     with SimpleDispatcher():
-        # im2col
-        assert my_dispatcher(1, 0) == 2
-        # spack
-        assert my_dispatcher(1, 100) == 200
+        # this will call im2col
+        assert my_dispatcher(1, 0) == 1
+
+        # this will call spatial pack
+        assert my_dispatcher(1, 100) == 100
+
+def test_fallback():
+
+    @autotvm.template
+    def simple_template(a, b):
+        cfg = autotvm.get_config()
+        assert cfg.is_fallback
+
+    simple_template(2, 3)
+
 
 if __name__ == "__main__":
     test_dispatch()
+    test_fallback()
diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py
index 0320ef1c6f3c..e51e34e95a3b 100644
--- a/tests/python/unittest/test_autotvm_space.py
+++ b/tests/python/unittest/test_autotvm_space.py
@@ -1,7 +1,7 @@
 """Test space definition primitives"""
 
 import tvm
-from tvm.autotvm.task.space import ConfigSpace
+from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity
 
 def gemm_func(cfg, N):
     A = tvm.placeholder((N, N), name='A')
@@ -26,5 +26,18 @@ def test_split():
     assert len(cfg) == 64
     assert len(cfg.space_map['tile_y']) == 8
 
+    # test fallback
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(128), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+
+    assert cfg['tile_n'].size == [4, 8, 4]
+
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+
+    assert cfg['tile_n'].size == [7, 7, 1]
+
 if __name__ == '__main__':
     test_split()
diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py
index 3488d0f599a5..58da219f2e48 100644
--- a/tests/python/unittest/test_autotvm_xgboost_model.py
+++ b/tests/python/unittest/test_autotvm_xgboost_model.py
@@ -12,7 +12,7 @@
 
 def test_fit():
     task, target = get_sample_task()
-    records = get_sample_records(n=100)
+    records = get_sample_records(n=500)
 
     base_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     base_model.fit_log(records, plan_size=32)
@@ -20,8 +20,8 @@ def test_fit():
     upper_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     upper_model.load_basemodel(base_model)
 
-    xs = np.arange(100)
-    ys = np.arange(100)
+    xs = np.arange(10)
+    ys = np.arange(10)
 
     upper_model.fit(xs, ys, plan_size=32)
 
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 48bb4fb022c7..a3945a4c9d76 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -27,7 +27,14 @@ def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
 @autotvm.task.dispatcher
 def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
     """TOPI compute callback. Mark this function as a dispatcher, so
-    this template can assign config according to workload"""
+    this template can assign config according to workload
+
+    Returns
+    -------
+    workload: Tuple
+        Dispatcher will use this workload to query corresponding config.
+        Then use cfg.template_key to call a registered template.
+    """
     return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
 @conv2d_arm_cpu.register(['direct'])
@@ -70,8 +77,10 @@ def _callback(op):
 
 def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
     assert layout == "NCHW", "Only support NCHW"
-    out_dtype = out_dtype or data.dtype
+    # create workload according to raw arguments
+    wkl = _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
+    out_dtype = out_dtype or data.dtype
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_packed = False
@@ -113,6 +122,18 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
     # ====================================================================
 
+    if cfg.is_fallback:
+        if num_tile == 2:
+            cfg.fallback_split('tile_co', [-1, 8])
+            cfg.fallback_split('tile_oh', [-1, 2])
+            cfg.fallback_split('tile_ow', [-1, 8])
+        else:
+            cfg.fallback_split('tile_co', [-1, 16, 4])
+            cfg.fallback_split('tile_oh', [-1, 1, 1])
+            cfg.fallback_split('tile_ow', [-1, 1, 4])
+        cfg['ann_reduce'].anns = ['unroll', 'unroll']
+        cfg['ann_spatial'].anns = ['none', 'unroll', 'vec']
+
     VC = cfg["tile_co"].size[-1]
     VH = cfg["tile_oh"].size[-1]
     VW = cfg["tile_ow"].size[-1]
@@ -145,8 +166,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                          name='output_unpack', tag='spatial_conv2d_output',
-                         attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
-                                                                  layout, out_dtype)})
+                         attrs={'workload': wkl})
     return output
 
 def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
@@ -212,6 +232,10 @@ def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
     return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
 def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    # create workload according to raw arguments
+    wkl = _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout,
+                                         out_dtype, tile_size)
+
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_computed = False
@@ -333,10 +357,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
     output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                          Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
                          name='output', tag='winograd_conv2d_output',
-                         attrs={'workload': _winograd_conv_arg_to_workload(
-                             data, kernel, strides, padding, layout, out_dtype, tile_size)})
+                         attrs={'workload': wkl})
 
-    # we have to manually assign effective GFLOP for winogard
+    # we have to manually assign effective GFLOP for winograd
     cfg.add_flop(2 * N * K * H * W * KH * KW * C)
     return output
 
@@ -358,30 +381,29 @@ def _schedule_winograd(cfg, s, output, last):
         kernel, G = U.op.input_tensors
         s[G].compute_inline()
         eps, nu, k, c, kk, = s[U].op.axis
-        r_kh, r_kw = s[U].op.reduce_axis
-        s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
-        s[U].unroll(eps)
-        s[U].unroll(nu)
-        s[U].unroll(r_kh)
-        s[U].unroll(r_kw)
-        s[U].vectorize(kk)
         if autotvm.GLOBAL_SCOPE.in_tuning:
             # kernel transformation will be pre-computed during compilation, so we skip
             # this part to make tuning records correct
-            s[U].pragma(k, 'debug_skip_region')
+            s[U].pragma(eps, 'debug_skip_region')
         else:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
+            for axis in [eps, nu, r_kh, r_kw]:
+                s[U].unroll(axis)
+            s[U].vectorize(kk)
             s[U].parallel(k)
 
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
+
     # transform image
     DD = s.cache_read(d, 'global', [V])
     s[B].compute_inline()
     eps, nu, b, c, bb = s[V].op.axis
     r_eps, r_nu = s[V].op.reduce_axis
     s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
-    s[V].unroll(eps)
-    s[V].unroll(nu)
-    s[V].unroll(r_eps)
-    s[V].unroll(r_nu)
+    for axis in [eps, nu, r_eps, r_nu]:
+        s[V].unroll(axis)
     s[DD].compute_at(s[V], c)
     s[V].vectorize(bb)
     s[V].parallel(b)
@@ -405,10 +427,8 @@ def _schedule_winograd(cfg, s, output, last):
     s[A].compute_inline()
     k, b, vh, vw = s[Y].op.axis
     r_eps, r_nu = s[Y].op.reduce_axis
-    s[Y].unroll(vh)
-    s[Y].unroll(vw)
-    s[Y].unroll(r_eps)
-    s[Y].unroll(r_nu)
+    for axis in [vh, vw, r_eps, r_nu]:
+        s[Y].unroll(axis)
 
     # output
     n, co, h, w = s[last].op.axis
@@ -444,6 +464,7 @@ def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_d
         [data, raw_kernel, strides, padding, layout, out_dtype])
 
 
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @conv2d_winograd_without_weight_transform.register(['arm_cpu'])
 @autotvm.task.dispatcher
 def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
@@ -472,6 +493,7 @@ def _callback(op):
     return s
 
 
+##### REGISTER ALTER OP LAYOUT #####
 @conv2d_alter_layout.register(["arm_cpu", "mali"])
 def _alter_conv2d_layout(attrs, inputs, tinfos):
     """Alter op layout for pre-computing kernel transformation"""
@@ -493,18 +515,30 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         # query config of this workload
         workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                          layout, out_dtype)
-        cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)
+        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
+
+        if cfg.is_fallback: # if is fallback, clear query cache and return None
+            context = autotvm.DispatchContext.current
+            while not isinstance(context, autotvm.FallbackContext):
+                context = context._old_ctx
+            context.clear_cache(tvm.target.current_target(), workload)
+            return None
 
         if cfg.template_key == 'direct':  # packing weight tensor
             new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
             return sym.conv2d(*copy_inputs, **new_attrs)
         else:  # pre-compute weight transformation in winograd
-            tile_size = 4
+            if "-device=arm_cpu" in tvm.target.current_target().options:
+                tile_size = 4
+                VC = cfg['tile_k'].size[-1]
+            else:
+                from ..mali.conv2d import _pick_tile_size
+                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
+                VC = cfg['tile_bna'].val
 
             weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                   tile_size=tile_size)
             CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
-            VC = cfg['tile_k'].size[-1]
             weight = sym.reshape(weight,
                                  shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
             weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index 8aafc436319f..e066a1e29435 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -14,16 +14,21 @@
 
 # register customized schedule for arm cpu.
 @autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
-def schedule_depthwise_conv2d_nchw_(cfg, outs):
+def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
     Parameters
     ----------
     cfg: ConfigEntity
-        The configuration of this tempalte
+        The configuration of this template
     outs: Array of Tensor
         The computation graph description of depthwise convolution2d
         in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for depthwise_conv2d nchw.
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
@@ -38,6 +43,11 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         cfg.define_split('tile_h', h, num_outputs=2)
         cfg.define_split('tile_w', w, num_outputs=2)
 
+        if cfg.is_fallback:
+            cfg.fallback_split('tile_c', [-1, 8])
+            cfg.fallback_split('tile_h', [-1, 2])
+            cfg.fallback_split('tile_w', [-1, 8])
+
         # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
         A0 = s.cache_read(data_pad, "global", C)
         _, c, h, w = s[A0].op.axis
diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py
index ac552903ad7f..06847bf9f427 100644
--- a/topi/python/topi/x86/injective.py
+++ b/topi/python/topi/x86/injective.py
@@ -29,7 +29,7 @@ def schedule_injective(outs):
     elif len(s[x].op.axis) >= 3:
         fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
         s[x].parallel(fused)
-    else:
+    elif len(s[x].op.axis) >= 1:
         s[x].parallel(s[x].op.axis[0])
     return s
 
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
new file mode 100644
index 000000000000..d992be9292fc
--- /dev/null
+++ b/topi/tests/python/common.py
@@ -0,0 +1,12 @@
+"""Common utility for topi test"""
+
+def get_all_backend():
+    """return all supported target
+
+    Returns
+    -------
+    targets: list
+        A list of all supported targets
+    """
+    return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
+            'llvm -device=arm_cpu']
diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py
index 6df18483a45f..82af0006c2ef 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d.py
@@ -1,11 +1,8 @@
-import os
 import numpy as np
 import tvm
 import topi
 import topi.testing
-from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
-from tvm.contrib import util
 from tvm.contrib.pickle_memoize import memoize
 
 def generate_quantized_np(shape, bits, out_dtype):
@@ -16,23 +13,23 @@ def generate_quantized_np(shape, bits, out_dtype):
 def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
     activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.create('llvm'):
         A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A')
         W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, 
-            out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
+                                     out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nchw([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nchw")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -61,16 +58,16 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel,
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nhwc")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -109,4 +106,4 @@ def test_bitserial_conv2d():
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False)
 
 if __name__ == "__main__":
-    test_bitserial_conv2d()
\ No newline at end of file
+    test_bitserial_conv2d()
diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
index 3de954abc291..de467818d37f 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
@@ -4,10 +4,6 @@
 import tvm
 import topi
 import topi.testing
-from topi.util import get_const_tuple
-from tvm.contrib import util
-
-target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'
 
 def generate_quantized_np(shape, bits, out_dtype):
     np.random.seed(0)
@@ -17,20 +13,19 @@ def generate_quantized_np(shape, bits, out_dtype):
 
 # Verify that certain special instructions from the tensorize pass exist
 def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
-                        activation_bits, weight_bits, dorefa):
+                                 activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.arm_cpu('rasp3b'):
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
-    
-    func = tvm.build(s, [A, W, B], target)
+    func = tvm.build(s, [A, W, B], tvm.target.arm_cpu('rasp3b'))
    
     assembly = func.get_source('asm')
     matches = re.findall("vpadal", assembly)
@@ -47,7 +42,6 @@ def test_bitserial_conv2d():
     stride = 1
     pad = 1
 
-
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
 
diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py
index 90abc68e6b68..cf9f377e9e1c 100644
--- a/topi/tests/python/test_topi_bnn.py
+++ b/topi/tests/python/test_topi_bnn.py
@@ -28,7 +28,7 @@ def get_ref_data():
         a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
         b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
         c_np = np.dot(a_np, b_np.T)
-        return (a_np, b_np, c_np)
+        return a_np, b_np, c_np
 
     a_np, b_np, c_np = get_ref_data()
 
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py
index f888033b3914..4ed5b31708e4 100644
--- a/topi/tests/python/test_topi_broadcast.py
+++ b/topi/tests/python/test_topi_broadcast.py
@@ -1,5 +1,5 @@
 """Test code for broadcasting operators."""
-import os
+from common import get_all_backend
 import numpy as np
 import tvm
 import topi
@@ -8,6 +8,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
     # Build the logic and compile the function
     A = tvm.placeholder(shape=in_shape, name="A")
     B = fbcast(A, out_shape)
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -21,16 +22,11 @@ def check_device(device):
         out_npy = np.broadcast_to(data_npy, out_shape)
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
-        for _ in range(1):
-            foo(data_nd, out_nd)
+        foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 
@@ -45,9 +41,10 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
     B = (tvm.var("B", dtype=dtype) if rhs_shape is None
          else tvm.placeholder(shape=rhs_shape, name="B", dtype=dtype))
     C = ftopi(A, B)
-    if (isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr)):
+    if isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr):
         assert(isinstance(C, tvm.expr.Expr))
         return
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -82,12 +79,8 @@ def check_device(device):
         foo(lhs_nd, rhs_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
-    check_device("opencl")
-    check_device("vulkan")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 def test_broadcast_to():
diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py
index ffc89aeb9bc3..f1367463eb4f 100644
--- a/topi/tests/python/test_topi_clip.py
+++ b/topi/tests/python/test_topi_clip.py
@@ -5,6 +5,7 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_clip(N, a_min, a_max, dtype):
     A = tvm.placeholder((N, N), dtype=dtype, name='A')
@@ -34,7 +35,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['llvm', 'opencl', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_clip():
diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py
deleted file mode 100644
index 365fdf551c4f..000000000000
--- a/topi/tests/python/test_topi_conv2d.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Example code to do conv2d."""
-import os
-import numpy as np
-import tvm
-from tvm import autotvm
-import topi
-import topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from topi.util import get_const_tuple
-
-
-def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
-    in_height = in_width = in_size
-
-    with tvm.target.arm_cpu():
-        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
-        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-        B = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), 'NCHW', 'float32')
-        s = topi.generic.schedule_conv2d_nchw([B])
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(a_np, ctx)
-    w = tvm.nd.array(w_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-    func = tvm.build(s, [A, W, B], "llvm")
-    func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-
-def test_conv2d():
-    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
-        verify_conv2d(1, 56, 64, 64, 3, 1, 1)
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py
index 1ff4b02470c4..af1afcb9ea9a 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -43,14 +43,12 @@ def check_device(device):
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index c663384b8187..6f367d10c048 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -1,31 +1,41 @@
 """Example code to do convolution."""
-import os
+
 import numpy as np
 import tvm
+from tvm import autotvm
 import topi
 import topi.testing
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
-def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1):
+from common import get_all_backend
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
     print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
 
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
     dtype = A.dtype
 
     @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
     def get_ref_data():
         a_np = np.random.uniform(size=a_shape).astype(dtype)
         w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
         dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        b_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
-        c_np = np.maximum(b_np, 0)
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
         return a_np, w_np, b_np, c_np
 
     a_np, w_np, b_np, c_np = get_ref_data()
@@ -38,66 +48,103 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            B = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW')
-            C = topi.nn.relu(B)
-            s1 = topi.generic.schedule_conv2d_nchw([B])
-            s2 = topi.generic.schedule_conv2d_nchw([C])
+            C = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        b = tvm.nd.array(b_np, ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]
-        with tvm.build_config(auto_unroll_max_step=1400,
-                              unroll_explicit=not no_unroll_explicit):
-            func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
-
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
         check_device(device)
 
 
 def test_conv2d_nchw():
+    autotvm.DispatchContext.current.silent = True
+
     # ResNet18 workloads
-    verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)
-    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1)
-    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
-    # ResNet50 workloads
-    verify_conv2d_nchw(1, 64, 56, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 512, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 28, 128, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 1024, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 1024, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 1024, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 2048, 7, 512, 1, 1, 0)
-    # Vgg16 workloads
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1)
-    # Super resolution workloads
-    verify_conv2d_nchw(1, 1, 224, 64, 5, 1, 2)
-    verify_conv2d_nchw(1, 64, 224, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 224, 32, 3, 1, 1)
-    verify_conv2d_nchw(1, 32, 224, 9, 3, 1, 1)
+    verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
+    verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,  64,  56,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  64,  56, 128, 3, 2, 1)
+    verify_conv2d_nchw(1,  64,  56, 128, 1, 2, 0)
+    verify_conv2d_nchw(1, 128,  28, 128, 3, 1, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 3, 2, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 1, 2, 0)
+    verify_conv2d_nchw(1, 256,  14, 256, 3, 1, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 3, 2, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 1, 2, 0)
+    verify_conv2d_nchw(1, 512,   7, 512, 3, 1, 1)
+
+    # bias, relu
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
     # dilation = 2
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1, dilation=2)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+    # weird workloads
+    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
+
+    # inception v3 workloads
+    verify_conv2d_nchw(1,    3, 299,  32, 3, 2, 0)
+    verify_conv2d_nchw(1,   32, 149,  32, 3, 1, 0)
+    verify_conv2d_nchw(1,   32, 147,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,   64,  73,  80, 1, 1, 0)
+    verify_conv2d_nchw(1,   80,  73, 192, 3, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,   48,  35,  64, 5, 1, 2)
+    verify_conv2d_nchw(1,   64,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,   96,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,  192,  35,  32, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35, 384, 3, 2, 0)
+    # verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
+    # verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
+    # verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
+    # verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
+    # verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
+    # verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
+    # verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
+    # verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_nchw(1, 1280,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 1280,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1,  448,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 192, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 192, 1, 1, 0)
+
 
 if __name__ == "__main__":
     test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
index 0c985400031a..5f65c038be60 100644
--- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
@@ -6,14 +6,13 @@
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
+from common import get_all_backend
 
 def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
-    B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
-    C = topi.nn.relu(B)
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
@@ -36,22 +35,23 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], [padding, padding], A.dtype)
+            C = topi.nn.relu(B)
             s1 = topi.generic.schedule_conv2d_transpose_nchw([B])
             s2 = topi.generic.schedule_conv2d_transpose_nchw([C])
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py
index 2df43eb30887..92f95f3e0497 100644
--- a/topi/tests/python/test_topi_dense.py
+++ b/topi/tests/python/test_topi_dense.py
@@ -6,13 +6,12 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_dense(batch, in_dim, out_dim, use_bias=True):
     A = tvm.placeholder((batch, in_dim), name='A')
     B = tvm.placeholder((out_dim, in_dim), name='B')
     C = tvm.placeholder((out_dim,), name='C')
-    D = topi.nn.dense(A, B, C if use_bias else None)
-    D = topi.nn.relu(D)
     dtype = A.dtype
 
     # use memoize to pickle the test data for next time use
@@ -36,6 +35,8 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            D = topi.nn.dense(A, B, C if use_bias else None)
+            D = topi.nn.relu(D)
             s = topi.generic.schedule_dense(D)
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(b_np, ctx)
@@ -45,13 +46,15 @@ def check_device(device):
         f(a, b, c, d)
         np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_dense():
     verify_dense(1, 1024, 1000, use_bias=True)
     verify_dense(1, 1024, 1000, use_bias=False)
 
+    verify_dense(2, 1024, 1000, use_bias=True)
+
 
 if __name__ == "__main__":
     test_dense()
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 3086054ba487..8c27af8390fe 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -2,11 +2,10 @@
 import topi
 import topi.testing
 import numpy as np
-from scipy import signal
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
-from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc
 
+from common import get_all_backend
 
 def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
     in_width = in_height
@@ -18,10 +17,6 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
-    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -30,6 +25,10 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
+            ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
             # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift)
@@ -88,12 +87,8 @@ def get_ref_data():
         np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
+    for device in get_all_backend():
+        check_device(device)
 
 
 def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
@@ -107,11 +102,6 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
-    ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
-    # schedule
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -121,6 +111,11 @@ def check_device(device):
         print("Running on target: %s" % device)
 
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
+            ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
+            # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift)
             s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu)
@@ -180,12 +175,9 @@ def get_ref_data():
         np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
+    for device in get_all_backend():
+        check_device(device)
+
 
 def test_depthwise_conv2d():
     print("testing nchw")
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index f3d1c62bdaf2..e85786037477 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -312,7 +312,9 @@ def tune_and_evaluate():
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, timeout=10000)
+        remote = autotvm.measure.request_remote(device_key,
+                                                tracker_addr=('localhost', 9190),
+                                                timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
@@ -333,7 +335,6 @@ def tune_and_evaluate():
 
 # We do not run the tuning in our webpage server since it takes too long.
 # Uncomment the following line to run by yourself.
-
 # tune_and_evaluate()
 
 ######################################################################

From c9feaf97d2548d54814487a546fff132ec25578d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 09:23:07 -0700
Subject: [PATCH 037/529] [TOPI][ARM CPU] fuse bias to depthwise conv2d (#1631)

---
 topi/python/topi/arm_cpu/depthwise_conv2d.py | 25 ++++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index e066a1e29435..c341d1a5b325 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -9,11 +9,11 @@
 from ..util import traverse_inline
 
 # register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
+autotvm.task.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct',
                                    depthwise_conv2d_nchw.fdefault)
 
 # register customized schedule for arm cpu.
-@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
+@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
 def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
@@ -44,15 +44,15 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         cfg.define_split('tile_w', w, num_outputs=2)
 
         if cfg.is_fallback:
-            cfg.fallback_split('tile_c', [-1, 8])
+            cfg.fallback_split('tile_c', [-1, 4])
             cfg.fallback_split('tile_h', [-1, 2])
-            cfg.fallback_split('tile_w', [-1, 8])
+            cfg.fallback_split('tile_w', [-1, 4])
 
         # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
         A0 = s.cache_read(data_pad, "global", C)
-        _, c, h, w = s[A0].op.axis
+        n, c, h, w = s[A0].op.axis
         c, vc = cfg['tile_c'].apply(s, A0, c)
-        s[A0].reorder(c, h, w, vc)
+        s[A0].reorder(n, c, h, w, vc)
         A1 = s.cache_write(A0, 'global')
         s[A0].compute_inline()
 
@@ -64,9 +64,9 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         B1 = s.cache_write(B0, 'global')
         s[B0].compute_inline()
 
-        _, c, h, w = s[C].op.axis
+        n, c, h, w = s[C].op.axis
         c, vc, = cfg['tile_c'].apply(s, C, c)
-        s[C].reorder(c, h, w, vc)
+        s[C].reorder(n, c, h, w, vc)
 
         # depthwise conv
         C0 = s.cache_write(C, 'global')
@@ -86,9 +86,14 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
                          max_unroll=16,
                          cfg=cfg)
 
+        # fusion
+        if C.op not in s.outputs:
+            s[C].compute_inline()
+
         # mark parallel
-        n, c, h, w = s[C].op.axis
-        s[C].parallel(c)
+        last = outs[0]
+        n, c, h, w = s[last].op.axis
+        s[last].parallel(c)
 
         n, c, h, w, vc = s[C0].op.axis
         s[C0].parallel(c)

From 22ffa7a35d0d862a42d49ed74d2e8a363236a69c Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 22 Aug 2018 21:53:55 +0530
Subject: [PATCH 038/529] [FRONTEND]minor bug fixes (#1632)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 092b8fa20219..65dd3619b5b2 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -430,7 +430,6 @@ def _impl(inputs, attr, params):
 
 def _lrn():
     def _impl(inputs, attr, params):
-        new_inputs = []
         attr_new = {}
         depth_radius = attr.get('depth_radius', 5)
         size = (depth_radius * 2) + 1
@@ -439,7 +438,7 @@ def _impl(inputs, attr, params):
         attr_new['bias'] = attr.get('bias', 1)
         attr_new['alpha'] = attr.get('alpha', 1) * size
         attr_new['beta'] = attr.get('beta', 0.5)
-        return AttrCvt(op_name='lrn')(new_inputs, attr_new)
+        return AttrCvt(op_name='lrn')(inputs, attr_new)
     return _impl
 
 def _sum():
@@ -613,7 +612,7 @@ def _impl(inputs, in_state_c, in_state_h, attr, params):
         ixh = _sym.concatenate(*[in_data, in_state_h], axis=1)
         in_weight = _sym.transpose(in_weight)
         gates = _sym.dense(ixh, in_weight, in_bias, use_bias=True,
-                           units=num_hidden_layers, name="dense")
+                           units=num_hidden_layers)
         gate_list = _sym.split(gates, indices_or_sections=4, axis=1)
         in_gate = _sym.sigmoid(gate_list[0])
         in_transform = _sym.tanh(gate_list[1])

From 49690ceefd6183e708a2be7d2350c5fbf4bffdb6 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Aug 2018 11:18:05 -0700
Subject: [PATCH 039/529] [ATTRS] change AttrFiledInfo->Node (#1634)

---
 include/tvm/attrs.h | 45 +++++++++++++++++++++++++++++----------------
 src/lang/attrs.cc   |  2 +-
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index aed6b1ff722f..3e5169ba02b8 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -69,15 +69,27 @@ struct AttrError : public dmlc::Error {
 /*!
  * \brief Information about attribute fields in string representations.
  */
-struct AttrFieldInfo {
+class AttrFieldInfoNode : public Node {
+ public:
   /*! \brief name of the field */
   std::string name;
   /*! \brief type docstring information in str. */
   std::string type_info;
   /*! \brief detailed description of the type */
   std::string description;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("type_info", &type_info);
+    v->Visit("description", &description);
+  }
+  static constexpr const char* _type_key = "AttrFieldInfo";
+  TVM_DECLARE_NODE_TYPE_INFO(AttrFieldInfoNode, Node);
 };
 
+/*! \brief AttrFieldInfo */
+TVM_DEFINE_NODE_REF(AttrFieldInfo, AttrFieldInfoNode);
+
 /*!
  * \brief Base class of all attribute class
  * \note Do not subclass AttrBaseNode directly,
@@ -104,7 +116,7 @@ class BaseAttrsNode : public Node {
    * \brief Get the field information about the
    * \note This function throws when the required a field is not present.
    */
-  TVM_DLL virtual std::vector<AttrFieldInfo> ListFieldInfo() const = 0;
+  TVM_DLL virtual Array<AttrFieldInfo> ListFieldInfo() const = 0;
   /*!
    * \brief Initialize the attributes by arguments.
    * \param kwargs The key value pairs for initialization.
@@ -159,7 +171,7 @@ class DictAttrsNode : public BaseAttrsNode {
   // implementations
   void VisitAttrs(AttrVisitor* v) final;
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
-  std::vector<AttrFieldInfo> ListFieldInfo() const final;
+  Array<AttrFieldInfo> ListFieldInfo() const final;
   // type info
   static constexpr const char* _type_key = "DictAttrs";
   TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
@@ -430,7 +442,7 @@ class AttrDocEntry {
  public:
   using TSelf = AttrDocEntry;
 
-  explicit AttrDocEntry(AttrFieldInfo* info)
+  explicit AttrDocEntry(std::shared_ptr<AttrFieldInfoNode> info)
       : info_(info) {
   }
   TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
@@ -454,21 +466,22 @@ class AttrDocEntry {
   }
 
  private:
-  AttrFieldInfo* info_;
+  std::shared_ptr<AttrFieldInfoNode> info_;
 };
 
 class AttrDocVisitor {
  public:
   template<typename T>
   AttrDocEntry operator()(const char* key, T* v) {
-    AttrFieldInfo info;
-    info.name = key;
-    info.type_info = TypeName<T>::value;
-    fields_.emplace_back(std::move(info));
-    return AttrDocEntry(&(fields_.back()));
+    std::shared_ptr<AttrFieldInfoNode> info
+        = std::make_shared<AttrFieldInfoNode>();
+    info->name = key;
+    info->type_info = TypeName<T>::value;
+    fields_.push_back(AttrFieldInfo(info));
+    return AttrDocEntry(info);
   }
 
-  std::vector<AttrFieldInfo> fields_;
+  Array<AttrFieldInfo> fields_;
 };
 
 class AttrExistVisitor {
@@ -557,7 +570,7 @@ class AttrsNode : public BaseAttrsNode {
     }
   }
 
-  std::vector<AttrFieldInfo> ListFieldInfo() const final {
+  Array<AttrFieldInfo> ListFieldInfo() const final {
     detail::AttrDocVisitor visitor;
     self()->__VisitAttrs__(visitor);
     return visitor.fields_;
@@ -580,11 +593,11 @@ inline void BaseAttrsNode::InitBySeq(Args&& ...args) {
 }
 
 inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*)
-  std::vector<AttrFieldInfo> entry = this->ListFieldInfo();
+  Array<AttrFieldInfo> entry = this->ListFieldInfo();
   for (AttrFieldInfo info : entry) {
-    os << info.name << " : " << info.type_info << '\n';
-    if (info.description.length() != 0) {
-      os << "    " << info.description << '\n';
+    os << info->name << " : " << info->type_info << '\n';
+    if (info->description.length() != 0) {
+      os << "    " << info->description << '\n';
     }
   }
 }
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 49a91983e79d..0d8d1f3c9ece 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -25,7 +25,7 @@ void DictAttrsNode::InitByPackedArgs(
   }
 }
 
-std::vector<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
+Array<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
   return {};
 }
 

From f88a7ed593740c1f27e31af43a3cfe1d2291aa21 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 07:36:09 +0900
Subject: [PATCH 040/529] [NNVM][KERAS] Add cropping support (#1636)

---
 nnvm/python/nnvm/frontend/keras.py              | 17 ++++++++++++++++-
 .../tests/python/frontend/keras/test_forward.py | 15 +++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 15493d18e7bb..3cfa481762e2 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -311,6 +311,21 @@ def _convert_upsample(insym, keras_layer, _):
     return _sym.upsampling(insym, **params)
 
 
+def _convert_cropping(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    crop_type = type(keras_layer).__name__
+    if crop_type == "Cropping1D":
+        raise NotImplementedError("Cropping1D not implemented")
+    elif crop_type == "Cropping2D":
+        (_, in_h, in_w, _) = keras_layer.input_shape
+        ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping
+    else:
+        raise TypeError("Unrecognized cropping type : {}".format(crop_type))
+    int32_max = np.iinfo(np.int32).max
+    return _sym.strided_slice(insym, begin=[0, 0, crop_t, crop_l],
+                              end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r])
+
+
 def _convert_batchnorm(insym, keras_layer, symtab):
     params = {'scale': False,
               'center': False,
@@ -409,6 +424,7 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     'Multiply'                 : _convert_merge,
     'ZeroPadding2D'            : _convert_padding,
     'UpSampling2D'             : _convert_upsample,
+    'Cropping2D'               : _convert_cropping,
 
     # 'ZeroPadding1D'          : _convert_padding,
     # 'AveragePooling1D'       : _convert_pooling,
@@ -416,7 +432,6 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     # 'GlobalAveragePooling1D' : _convert_pooling,
     # 'GlobalMaxPooling1D'     : _convert_pooling,
     # 'Cropping1D'             : _convert_cropping,
-    # 'Cropping2D'             : _convert_cropping,
     # 'UpSampling1D'           : _convert_upsample,
     # 'UpSampling3D'           : _convert_upsample,
     # 'Conv1D'                 : _convert_convolution1d,
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 17c9fc1329d7..5c27a3e38099 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -110,6 +110,20 @@ def test_forward_reshape():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_crop():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data)
+    x = keras.layers.Cropping2D(cropping=(1, 1))(x)
+    x = keras.layers.Cropping2D(cropping=1)(x)
+    x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x)
+    x = keras.layers.Cropping2D(cropping=(1, 0))(x)
+    x = keras.layers.Cropping2D(cropping=0)(x)
+    x = keras.layers.Add()([x, x])
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_vgg16():
     keras_model = keras.applications.vgg16.VGG16(include_top=True, weights=None,
         input_shape=(224,224,3), classes=1000)
@@ -196,6 +210,7 @@ def test_forward_reuse_layers():
     test_forward_separable_conv()
     test_forward_upsample()
     test_forward_reshape()
+    test_forward_crop()
     test_forward_vgg16()
     test_forward_xception()
     test_forward_resnet50()

From 9f75cddf8f0e50f7eac13a7bd2fda5981085ab12 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 07:48:01 +0900
Subject: [PATCH 041/529] [NNVM][KERAS] Fixed padding in pooling (#1635)

---
 nnvm/python/nnvm/frontend/keras.py               |  4 +---
 nnvm/tests/python/frontend/keras/test_forward.py | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 3cfa481762e2..3a0a25aa4979 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -269,14 +269,12 @@ def _convert_pooling(insym, keras_layer, symtab):
                   'padding': [0, 0]}
         if keras_layer.padding == 'valid':
             pass
-        # we insert a separate pad operator
         elif keras_layer.padding == 'same':
             in_h = keras_layer.input_shape[1]
             in_w = keras_layer.input_shape[2]
             pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h)
             pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w)
-            insym = _sym.pad(data=insym, pad_width=(
-                (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
+            params['padding'] = [pad_t, pad_l, pad_b, pad_r]
         else:
             raise TypeError("Unsupported padding type : {}".format(keras_layer.padding))
         if pool_type == 'MaxPooling2D':
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 5c27a3e38099..3e80c74399cc 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -38,7 +38,7 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
         out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
         return out.asnumpy()
 
-    xs = [np.random.uniform(size=shape) for shape in in_shapes]
+    xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs], target, ctx)
@@ -74,6 +74,18 @@ def test_forward_dense():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_pool():
+    data = keras.layers.Input(shape=(2,2,1))
+    # maxpool
+    x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+    # avgpool
+    y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_transpose_conv():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.Conv2D(filters=10, kernel_size=(3,3), strides=(2,2), padding='same')(data)
@@ -206,6 +218,7 @@ def test_forward_reuse_layers():
     test_forward_elemwise_add()
     test_forward_activations()
     test_forward_dense()
+    test_forward_pool()
     test_forward_transpose_conv()
     test_forward_separable_conv()
     test_forward_upsample()

From 7cc795c94e1cc1b3b09e59de0ba7bc1319a64e5b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Aug 2018 20:17:02 -0700
Subject: [PATCH 042/529] [TEAM] New reviewer: nishi-t (#1637)

---
 CONTRIBUTORS.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9db50b02b11a..faac3386f245 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -23,6 +23,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 ## Reviewers
 - [Masahiro Masuda](https://github.com/masahi)
 - [Kazutaka Morita](https://github.com/kazum)
+- [Tatsuya Nishiyama](https://github.com/nishi-t)
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
@@ -35,9 +36,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
   - To contributors: please add your name to the list.
 - [Qiao Zhang](https://github.com/zhangqiaorjc)
 - [Jian Weng](https://github.com/were)
-- [Masahiro Masuda](https://github.com/masahi)
 - [Haolong Zhang](https://github.com/haolongzhangm)
 - [Cody Hao Yu](https://github.com/comaniac)
 - [Chris Nuernberger](https://github.com/cnuernber)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Kazutaka Morita](https://github.com/kazum)

From 15935f9ad9a499d79eb6b1a628c43e8cca2eb0fb Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 23 Aug 2018 08:47:36 +0530
Subject: [PATCH 043/529] [FRONTEND][COREML]More ops are added (#1619)

---
 nnvm/python/nnvm/frontend/coreml.py           |  40 +++++-
 .../python/frontend/coreml/test_forward.py    | 121 ++++++++++++++++++
 2 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index 3ca76bb0b20e..7dfd54317b55 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -269,6 +269,40 @@ def UpsampleLayerParams(op, insym, symtab):
 def L2NormalizeLayerParams(op, insym, symtab):
     return _sym.l2_normalize(insym, eps=op.epsilon, axis=1)
 
+def LRNLayerParams(op, insym, symtab):
+    par = {}
+    par['size'] = op.localSize
+    par['bias'] = op.k
+    par['alpha'] = op.alpha
+    par['beta'] = op.beta
+    par['axis'] = 1 #default layout is nchw
+    return _sym.lrn(data=insym, **par)
+
+def AverageLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    count = len(insyms)
+    _sum = insyms[0]
+    for i in range(1, count):
+        _sum = _sym.broadcast_add(_sum, insyms[i])
+    return _sum / count
+
+def MaxLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    _max = insyms[0]
+    for i in range(1, len(insyms)):
+        _max = _sym.broadcast_max(_max, insyms[i])
+    return _max
+
+def MinLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    _min = insyms[0]
+    for i in range(1, len(insyms)):
+        _min = _sym.broadcast_min(_min, insyms[i])
+    return _min
+
 _convert_map = {
     'NeuralNetworkMeanImage': NeuralNetworkMeanImage,
     'NeuralNetworkImageScaler': NeuralNetworkImageScaler,
@@ -286,7 +320,11 @@ def L2NormalizeLayerParams(op, insym, symtab):
     'PaddingLayerParams':PaddingLayerParams,
     'PermuteLayerParams':PermuteLayerParams,
     'UpsampleLayerParams':UpsampleLayerParams,
-    'L2NormalizeLayerParams':L2NormalizeLayerParams
+    'L2NormalizeLayerParams':L2NormalizeLayerParams,
+    'LRNLayerParams':LRNLayerParams,
+    'AverageLayerParams':AverageLayerParams,
+    'MaxLayerParams':MaxLayerParams,
+    'MinLayerParams':MinLayerParams,
 }
 
 def coreml_op_to_nnvm(op, inname, outname, symtab):
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index 27ae28c20ab9..a33a7c5a5ed7 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -223,6 +223,123 @@ def verify_l2_normalize(input_dim, eps):
 def test_forward_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001)
 
+def verify_lrn(input_dim, size, bias, alpha, beta):
+    dtype = "float32"
+    axis=1
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta)
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_lrn(name='LRN',
+                    input_name='input',
+                    output_name='output',
+                    alpha=alpha,
+                    beta=beta,
+                    k=bias,
+                    local_size=size)
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_lrn():
+    verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5)
+
+def verify_average(input_dim1, input_dim2, axis=0):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim1).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim2).astype(dtype)
+
+    b_np = np.mean((a_np1, a_np2), axis=axis)
+
+    inputs = [('input1', datatypes.Array(*input_dim1)),
+              ('input2', datatypes.Array(*input_dim2))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='MEAN',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='AVE')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_average():
+    verify_average((1, 3, 20, 20), (1, 3, 20, 20))
+    verify_average((3, 20, 20), (1, 3, 20, 20))
+    verify_average((20, 20), (1, 3, 20, 20))
+
+def verify_max(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Max',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MAX')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))
+    verify_max((20, 20))
+
+def verify_min(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Min',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MIN')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))
+    verify_min((20, 20))
+
 if __name__ == '__main__':
     test_mobilenet_checkonly()
     test_resnet50_checkonly()
@@ -231,3 +348,7 @@ def test_forward_l2_normalize():
     test_forward_MultiplyLayerParams()
     test_forward_UpsampleLayerParams()
     test_forward_l2_normalize()
+    test_forward_lrn()
+    test_forward_average()
+    test_forward_max()
+    test_forward_min()

From 7a930c68c330f44a016739176f7498aa93799f3c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 20:19:43 -0700
Subject: [PATCH 044/529] trigger ci (#1620)

---
 topi/python/topi/x86/nn.py | 43 +++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 03e07222c420..6802d4c01e60 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -2,8 +2,9 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
+
 from .. import generic
-from .. import tag
+from ..util import traverse_inline
 
 @generic.schedule_softmax.register(["cpu"])
 def schedule_softmax(outs):
@@ -53,44 +54,38 @@ def schedule_dense(outs):
 
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
 
+    def _callback(op):
         if 'dense' in op.tag:
-            C = op.output(0)
-            x, y = C.op.axis
+            output = outs[0]
+            dense = op.output(0)
 
             # Write cache for blocks
-            CC = s.cache_write(C, 'global')
+            if dense.op in s.outputs:
+                CC = s.cache_write(dense, 'local')
+            else:
+                CC = dense
 
             # Tile
             bnx = 1
             bny = 4
-            _, yo, _, yi = s[C].tile(x, y, bnx, bny)
-            s[CC].compute_at(s[C], yo)
+            x, y = output.op.axis
+            xo, yo, xi, yi = s[output].tile(x, y, bnx, bny)
+
             xc, yc = s[CC].op.axis
             k, = s[CC].op.reduce_axis
             ko, ki = s[CC].split(k, factor=4)
             s[CC].reorder(ko, xc, ki, yc)
+
             s[CC].unroll(ki)
             s[CC].vectorize(yc)
 
-            # Vectorization
-            s[C].vectorize(yi)
-
-            # Parallelization
-            s[C].parallel(yo)
+            s[output].unroll(xi)
+            s[output].vectorize(yi)
 
-        scheduled_ops.append(op)
+            fused = s[output].fuse(xo, yo)
+            s[output].parallel(fused)
+            s[CC].compute_at(s[output], fused)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s

From 00266bb9f79d717a8505a002ed1a6c8c2c5d53ba Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 20:21:15 -0700
Subject: [PATCH 045/529] [AUTOTVM] Simplify TopHub (#1630)

---
 apps/benchmark/arm_cpu_imagenet_bench.py      |  44 +++----
 apps/benchmark/util.py                        |  72 ++++++++++++
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/mobilenet_v2.py      |  51 ++++++++
 python/tvm/autotvm/tophub.py                  | 109 ++++++++----------
 python/tvm/exec/tophub.py                     |  37 ------
 python/tvm/rpc/server.py                      |   2 +-
 python/tvm/target.py                          |   5 -
 .../integration/test_benchmark_topi_conv2d.py |   2 +-
 vta/tutorials/resnet.py                       |   2 +-
 10 files changed, 190 insertions(+), 135 deletions(-)
 create mode 100644 apps/benchmark/util.py
 create mode 100644 nnvm/python/nnvm/testing/mobilenet_v2.py
 delete mode 100644 python/tvm/exec/tophub.py

diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 7baf244e0dae..f5057299920c 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -1,45 +1,26 @@
-"""Benchmark script for performance on ARM CPU.
+"""Benchmark script for ARM CPU.
 see README.md for the usage and results of this script.
 """
-
 import argparse
-import time
 
 import numpy as np
 
-import nnvm.testing
-import nnvm.compiler
 import tvm
-from tvm import autotvm
 from tvm.contrib.util import tempdir
 import tvm.contrib.graph_runtime as runtime
+import nnvm.compiler
+import nnvm.testing
 
-def get_network(name, batch_size):
-    """Get the symbol definition and random weight of a network"""
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == 'resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
-                                                       batch_size=batch_size, image_shape=(3, 224, 224))
-    elif name == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name == 'squeezenet v1.1':
-        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
-                                                           version='1.1')
-    elif name == 'vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
-    else:
-        raise RuntimeError("Unsupported network: " + name)
-
-    return net, params, input_shape, output_shape
+from util import get_network, print_progress
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
-    parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro', 
-                                                                      'pixel2', 'rasp3b', 'pynq'])
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'vgg-16', 'mobilenet', 'squeezenet v1.1', ])
+    parser.add_argument("--device", type=str, required=True, choices=
+                        ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
+                         'pixel2', 'rasp3b', 'pynq'])
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
@@ -49,7 +30,7 @@ def get_network(name, batch_size):
     dtype = 'float32'
 
     if args.network is None:
-        networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+        networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
     else:
         networks = [args.network]
 
@@ -63,8 +44,10 @@ def get_network(name, batch_size):
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
+        print_progress(network)
         net, params, input_shape, output_shape = get_network(network, batch_size=1)
 
+        print_progress("%-20s building..." % network)
         with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
             graph, lib, params = nnvm.compiler.build(
                 net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
@@ -79,6 +62,7 @@ def get_network(name, batch_size):
             lib.export_library(tmp.relpath(filename))
 
         # upload library and params
+        print_progress("%-20s uploading..." % network)
         ctx = remote.context(str(target), 0)
         remote.upload(tmp.relpath(filename))
         rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
@@ -90,7 +74,7 @@ def get_network(name, batch_size):
         module.set_input(**rparams)
 
         # evaluate
+        print_progress("%-20s evaluating..." % network)
         ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
         prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
         print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
-
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
new file mode 100644
index 000000000000..bd4a3d04a1d1
--- /dev/null
+++ b/apps/benchmark/util.py
@@ -0,0 +1,72 @@
+"""Utility for benchmark"""
+
+import sys
+import nnvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network
+    
+    Parameters
+    ----------
+    name: str
+        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
+    batch_size:
+        batch size
+
+    Returns
+    -------
+    net: nnvm.symbol
+        The NNVM symbol of network definition
+    params: dict
+        The random parameters for benchmark
+    input_shape: tuple
+        The shape of input tensor
+    output_shape: tuple
+        The shape of output tensor
+    """
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif "squeezenet" in name:
+        version = name.split("_v")[1]
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version)
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def print_progress(msg):
+    """print progress message
+    
+    Parameters
+    ----------
+    msg: str
+        The message to print
+    """
+    sys.stdout.write(msg + "\r")
+    sys.stdout.flush()
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index 4a879047ec7e..a04d2bc83587 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -4,6 +4,7 @@
 from .config import ctx_list
 from .utils import create_workload
 from . import mobilenet
+from . import mobilenet_v2
 from . import mlp
 from . import resnet
 from . import vgg
diff --git a/nnvm/python/nnvm/testing/mobilenet_v2.py b/nnvm/python/nnvm/testing/mobilenet_v2.py
new file mode 100644
index 000000000000..dc3c7cd85660
--- /dev/null
+++ b/nnvm/python/nnvm/testing/mobilenet_v2.py
@@ -0,0 +1,51 @@
+"""
+MobileNetV2, load model from gluon model zoo
+
+Reference:
+Inverted Residuals and Linear Bottlenecks:
+Mobile Networks for Classification, Detection and Segmentation
+https://arxiv.org/abs/1801.04381
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, multiplier=1.0, dtype="float32"):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    multiplier : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision.mobilenet import MobileNetV2
+
+    image_shape = (1, 3, 224, 224)
+
+    block = MobileNetV2(multiplier=multiplier, classes=num_classes)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 3d7b249df905..4982455038fc 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -8,16 +8,22 @@
 
 import logging
 import os
-import json
 import sys
 
 from .task import ApplyHistoryBest
 from .. import target as _target
-from ..contrib.util import tempdir
 from ..contrib.download import download
 
+# root path to store TopHub files
 AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")
 
+# the version of each package
+PACKAGE_VERSION = {
+    'vta':     "v0.01",
+    'arm_cpu': "v0.01",
+    'cuda':    "v0.01",
+}
+
 logger = logging.getLogger('autotvm')
 
 def _alias(name):
@@ -30,7 +36,8 @@ def _alias(name):
 
 def context(target, extra_files=None):
     """Return the dispatch context with pre-tuned parameters.
-    The corresponding downloaded *.log files under tophub root path will be loaded.
+    This function will load the corresponding *.log files in AUTOTVM_TOPHUB_ROOT_PATH.
+    If cannot find them, it will download them from TopHub github repo.
     Users can also add their own files in argument `extra_files`.
 
     Parameters
@@ -40,21 +47,24 @@ def context(target, extra_files=None):
     extra_files: list of str, optional
         Extra log files to load
     """
-    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
     best_context = ApplyHistoryBest([])
 
     if isinstance(target, str):
         target = _target.create(target)
 
-    big_target = str(target).split()[0]
-    if os.path.isfile(os.path.join(rootpath, big_target + ".log")):
-        best_context.load(os.path.join(rootpath, big_target + ".log"))
-
+    possible_names = [str(target).split()[0]]
     for opt in target.options:
         if opt.startswith("-device"):
-            model = _alias(opt[8:])
-            if os.path.isfile(os.path.join(rootpath, model) + ".log"):
-                best_context.load(os.path.join(rootpath, model) + ".log")
+            device = _alias(opt[8:])
+            possible_names.append(device)
+
+    all_packages = list(PACKAGE_VERSION.keys())
+    for name in possible_names:
+        if name in all_packages:
+            check_backend(name)
+
+            filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
+            best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
 
     if extra_files:
         for filename in extra_files:
@@ -63,42 +73,21 @@ def context(target, extra_files=None):
     return best_context
 
 
-def download_package(backend):
-    """Download pre-tuned parameters of operators for a backend
-
-    Parameters
-    ----------
-    backend: str
-        The name of package
-    """
-    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-
-    if not os.path.isdir(rootpath):
-        # make directory
-        splits = os.path.split(rootpath)
-        for j in range(1, len(splits)+1):
-            path = os.path.join(*splits[:j])
-            if not os.path.isdir(path):
-                os.mkdir(path)
-
-    backend = _alias(backend)
-    logger.info("Download pre-tuned parameters for %s", backend)
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s.log" % backend,
-             os.path.join(rootpath, backend + ".log"), True, verbose=0)
-
-
-def check_package(backend):
+def check_backend(backend):
     """Check whether have pre-tuned parameters of the certain target.
     If not, will download it.
 
     Parameters
     ----------
     backend: str
-        The name of package
+        The name of backend.
     """
     backend = _alias(backend)
+    assert backend in PACKAGE_VERSION, 'Cannot find backend "%s" in TopHub' % backend
 
-    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
+    version = PACKAGE_VERSION[backend]
+    package_name = "%s_%s.log" % (backend, version)
+    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
         return
 
     if sys.version_info >= (3,):
@@ -106,29 +95,29 @@ def check_package(backend):
     else:
         import urllib2
     try:
-        download_package(backend)
-    except urllib2.URLError:
-        logging.warning("Failed to download tophub package for %s", backend)
+        download_package(package_name)
+    except urllib2.URLError as e:
+        logging.warning("Failed to download tophub package for %s: %s", backend, e)
 
 
-def list_packages():
-    """List all available pre-tuned op parameters for targets
+def download_package(package_name):
+    """Download pre-tuned parameters of operators for a backend
 
-    Returns
-    -------
-    ret: List
-        All available packets
+    Parameters
+    ----------
+    package_name: str
+        The name of package
     """
-    path = tempdir()
-    filename = path.relpath("info.json")
-    logger.info("Download meta info for pre-tuned parameters")
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/info.json",
-             filename, True, verbose=0)
-
-    with open(filename, "r") as fin:
-        text = "".join(fin.readlines())
-    info = json.loads(text)
-    keys = list(info.keys())
-    keys.sort()
-
-    return [(k, info[k]) for k in keys]
+    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
+
+    if not os.path.isdir(rootpath):
+        # make directory
+        splits = os.path.split(rootpath)
+        for j in range(1, len(splits)+1):
+            path = os.path.join(*splits[:j])
+            if not os.path.isdir(path):
+                os.mkdir(path)
+
+    logger.info("Download pre-tuned parameters package %s", package_name)
+    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s"
+             % package_name, os.path.join(rootpath, package_name), True, verbose=0)
diff --git a/python/tvm/exec/tophub.py b/python/tvm/exec/tophub.py
deleted file mode 100644
index 9bfd6866506d..000000000000
--- a/python/tvm/exec/tophub.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# pylint: disable=invalid-name
-"""Download pre-tuned parameters of ops"""
-
-import argparse
-import logging
-
-from ..autotvm.tophub import list_packages, download_package
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--download", type=str, nargs='+',
-                        help="The targets to download. Use 'all' to download for all targets")
-    parser.add_argument("-l", "--list", action='store_true', help="List available packages")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.INFO)
-
-    if args.list:
-        info = list_packages()
-        print("\n%-20s %-20s" % ("Target", "Size"))
-        print("-" * 41)
-        for target, info in info:
-            print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
-    elif args.download:
-        info = list_packages()
-        all_targets = [x[0] for x in info]
-        if 'all' in args.download:
-            targets = all_targets
-        else:
-            targets = args.download
-
-        for t in targets:
-            if t not in all_targets:
-                print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
-            download_package(t)
-    else:
-        parser.print_help()
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 792685b94a18..d65e21c794df 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -313,7 +313,7 @@ def __init__(self,
         self.use_popen = use_popen
 
         if silent:
-            logger.setLevel(logging.WARN)
+            logger.setLevel(logging.ERROR)
 
         if use_popen:
             cmd = [sys.executable,
diff --git a/python/tvm/target.py b/python/tvm/target.py
index 9d5200661c6c..40f9e099b3a6 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -425,8 +425,6 @@ def arm_cpu(model='unknown', options=None):
     options : str or list of str
         Additional options
     """
-    from . import autotvm
-
     trans_table = {
         "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"],
         "mate10":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
@@ -439,9 +437,6 @@ def arm_cpu(model='unknown', options=None):
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
-    # download pre-tuned parameters for arm_cpu if there is not any.
-    autotvm.tophub.check_package('arm_cpu')
-
     opts = ["-device=arm_cpu"] + pre_defined_opt
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("llvm", *opts)
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index ca2451dec614..0661d292f4e5 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -128,7 +128,7 @@ def _run(env, remote):
                 run_cpu_conv2d(env, remote, key, batch_size, wl)
 
     # load pre-tuned operator parameters for ARM CPU
-    autotvm.tophub.check_package('vta')
+    autotvm.tophub.check_backend('vta')
     with autotvm.tophub.context('llvm -device=vtacpu'):
         vta.testing.run(_run)
 
diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py
index 8d33a91d5691..8ca5eb7375b3 100644
--- a/vta/tutorials/resnet.py
+++ b/vta/tutorials/resnet.py
@@ -154,7 +154,7 @@ def generate_graph(graph_fn, params_fn, device="vta"):
 synset = eval(open(os.path.join(data_dir, categ_fn)).read())
 
 # Download pre-tuned op parameters of conv2d for ARM CPU used in VTA
-autotvm.tophub.check_package('vta')
+autotvm.tophub.check_backend('vta')
 
 
 ######################################################################

From 4f482f1ed1c43018ee4443527a19b6a851465d53 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 14:11:12 +0900
Subject: [PATCH 046/529] Remove leading "./" from include paths (#1640)

---
 include/tvm/api_registry.h                       |  6 +++---
 include/tvm/arithmetic.h                         |  2 +-
 include/tvm/attrs.h                              |  6 +++---
 include/tvm/base.h                               |  2 +-
 include/tvm/buffer.h                             |  4 ++--
 include/tvm/build_module.h                       |  6 +++---
 include/tvm/c_dsl_api.h                          |  2 +-
 include/tvm/codegen.h                            | 10 +++++-----
 include/tvm/expr.h                               |  4 ++--
 include/tvm/ir.h                                 |  6 +++---
 include/tvm/ir_functor_ext.h                     |  2 +-
 include/tvm/ir_mutator.h                         |  4 ++--
 include/tvm/ir_operator.h                        |  4 ++--
 include/tvm/ir_pass.h                            |  8 ++++----
 include/tvm/ir_visitor.h                         |  2 +-
 include/tvm/lowered_func.h                       |  6 +++---
 include/tvm/operation.h                          | 12 ++++++------
 include/tvm/packed_func_ext.h                    |  8 ++++----
 include/tvm/runtime/c_backend_api.h              |  2 +-
 include/tvm/runtime/device_api.h                 |  4 ++--
 include/tvm/runtime/module.h                     |  4 ++--
 include/tvm/runtime/ndarray.h                    |  4 ++--
 include/tvm/runtime/packed_func.h                |  6 +++---
 include/tvm/runtime/registry.h                   |  2 +-
 include/tvm/runtime/serializer.h                 |  4 ++--
 include/tvm/runtime/util.h                       |  2 +-
 include/tvm/schedule.h                           |  8 ++++----
 include/tvm/schedule_pass.h                      |  4 ++--
 include/tvm/target_info.h                        |  4 ++--
 include/tvm/tensor.h                             |  6 +++---
 include/tvm/tensor_intrin.h                      |  4 ++--
 include/tvm/tvm.h                                | 12 ++++++------
 nnvm/include/nnvm/graph.h                        |  6 +++---
 nnvm/include/nnvm/graph_attr_types.h             |  4 ++--
 nnvm/include/nnvm/node.h                         |  6 +++---
 nnvm/include/nnvm/op.h                           |  4 ++--
 nnvm/include/nnvm/op_attr_types.h                |  8 ++++----
 nnvm/include/nnvm/pass.h                         |  4 ++--
 nnvm/include/nnvm/pass_functions.h               |  6 +++---
 nnvm/include/nnvm/symbolic.h                     |  4 ++--
 nnvm/include/nnvm/top/nn.h                       |  2 +-
 nnvm/include/nnvm/tuple.h                        |  2 +-
 nnvm/src/c_api/c_api_error.cc                    |  2 +-
 nnvm/src/c_api/c_api_graph.cc                    |  2 +-
 nnvm/src/c_api/c_api_symbolic.cc                 |  2 +-
 nnvm/src/compiler/alter_op_layout.cc             |  4 ++--
 nnvm/src/compiler/compile_engine.cc              |  7 +++++--
 nnvm/src/compiler/compile_engine.h               |  2 +-
 nnvm/src/compiler/fold_scale_axis.cc             |  4 ++--
 nnvm/src/compiler/graph_fuse.cc                  |  7 ++++---
 nnvm/src/compiler/graph_hash.cc                  |  6 ++++--
 nnvm/src/compiler/graph_runtime.cc               |  2 +-
 nnvm/src/compiler/packed_func_ext.cc             |  2 +-
 nnvm/src/compiler/simplify_inference.cc          |  4 ++--
 nnvm/src/pass/plan_memory.cc                     |  2 +-
 nnvm/src/top/elemwise_op_common.h                |  2 +-
 nnvm/src/top/nn/convolution.cc                   |  2 +-
 nnvm/src/top/nn/nn.cc                            |  2 +-
 nnvm/src/top/nn/pooling.cc                       |  2 +-
 nnvm/src/top/nn/upsampling.cc                    |  2 +-
 src/arithmetic/canonical.cc                      |  8 ++++++--
 src/arithmetic/detect_linear_equation.cc         |  2 +-
 src/arithmetic/int_set.cc                        |  4 ++--
 src/arithmetic/modular.cc                        |  2 +-
 src/codegen/codegen_aocl.cc                      |  4 ++--
 src/codegen/codegen_c.cc                         |  2 +-
 src/codegen/codegen_c.h                          |  2 +-
 src/codegen/codegen_cuda.cc                      |  2 +-
 src/codegen/codegen_cuda.h                       |  2 +-
 src/codegen/codegen_metal.cc                     |  5 +++--
 src/codegen/codegen_metal.h                      |  2 +-
 src/codegen/codegen_opencl.cc                    |  4 ++--
 src/codegen/codegen_opencl.h                     |  2 +-
 src/codegen/codegen_opengl.cc                    |  4 ++--
 src/codegen/codegen_opengl.h                     |  2 +-
 src/codegen/codegen_source_base.cc               |  2 +-
 src/codegen/codegen_vhls.cc                      |  4 ++--
 src/codegen/codegen_vhls.h                       |  2 +-
 src/codegen/intrin_rule.cc                       |  2 +-
 src/codegen/intrin_rule_cuda.cc                  |  2 +-
 src/codegen/intrin_rule_metal.cc                 |  2 +-
 src/codegen/intrin_rule_opencl.cc                |  2 +-
 src/codegen/intrin_rule_opengl.cc                |  2 +-
 src/codegen/intrin_rule_vhls.cc                  |  2 +-
 src/codegen/llvm/codegen_amdgpu.cc               |  2 +-
 src/codegen/llvm/codegen_arm.cc                  |  2 +-
 src/codegen/llvm/codegen_cpu.cc                  |  2 +-
 src/codegen/llvm/codegen_cpu.h                   |  2 +-
 src/codegen/llvm/codegen_llvm.cc                 |  4 ++--
 src/codegen/llvm/codegen_llvm.h                  |  2 +-
 src/codegen/llvm/codegen_nvptx.cc                |  2 +-
 src/codegen/llvm/intrin_rule_llvm.cc             |  2 +-
 src/codegen/llvm/intrin_rule_llvm.h              |  2 +-
 src/codegen/llvm/intrin_rule_rocm.cc             |  2 +-
 src/codegen/llvm/llvm_common.cc                  |  2 +-
 src/codegen/llvm/llvm_module.cc                  |  4 ++--
 src/codegen/source_module.cc                     |  2 +-
 src/codegen/spirv/build_vulkan.cc                |  2 +-
 src/codegen/spirv/codegen_spirv.cc               |  3 ++-
 src/codegen/spirv/codegen_spirv.h                |  2 +-
 src/codegen/spirv/ir_builder.cc                  |  2 +-
 src/codegen/stackvm/codegen_stackvm.cc           |  2 +-
 src/codegen/verilog/codegen_verilog.cc           |  2 +-
 src/codegen/verilog/codegen_verilog.h            |  2 +-
 src/codegen/verilog/verilog_ir.cc                |  3 ++-
 src/codegen/verilog/verilog_module.cc            |  2 +-
 src/codegen/verilog/vpi_device_api.cc            |  2 +-
 src/codegen/verilog/vpi_session.cc               |  2 +-
 src/contrib/nnpack/convolution.cc                |  2 +-
 src/contrib/nnpack/fully_connected.cc            |  2 +-
 src/contrib/nnpack/nnpack_utils.cc               |  2 +-
 src/contrib/random/random.cc                     |  4 ++--
 src/op/compute_op.cc                             |  5 +++--
 src/op/cross_thread_reduction.cc                 |  4 ++--
 src/op/extern_op.cc                              |  2 +-
 src/op/op_util.cc                                |  3 ++-
 src/op/scan_op.cc                                |  2 +-
 src/op/tensorize.cc                              |  4 ++--
 src/pass/arg_binder.cc                           |  4 ++--
 src/pass/coproc_sync.cc                          |  4 ++--
 src/pass/inject_double_buffer.cc                 |  2 +-
 src/pass/ir_mutator.cc                           |  2 +-
 src/pass/ir_util.cc                              |  2 +-
 src/pass/lift_attr_scope.cc                      |  2 +-
 src/pass/lower_intrin.cc                         |  2 +-
 src/pass/lower_thread_allreduce.cc               |  2 +-
 src/pass/lower_tvm_builtin.cc                    |  2 +-
 src/pass/lower_warp_memory.cc                    |  2 +-
 src/pass/make_api.cc                             |  4 ++--
 src/pass/narrow_channel_access.cc                |  2 +-
 src/pass/split_pipeline.cc                       |  2 +-
 src/pass/storage_access.cc                       |  5 +++--
 src/pass/storage_flatten.cc                      |  4 ++--
 src/pass/storage_rewrite.cc                      |  2 +-
 src/pass/storage_sync.cc                         |  4 ++--
 src/runtime/c_dsl_api.cc                         |  4 ++--
 src/runtime/c_runtime_api.cc                     |  2 +-
 src/runtime/cpu_device_api.cc                    |  2 +-
 src/runtime/cuda/cuda_device_api.cc              |  2 +-
 src/runtime/cuda/cuda_module.cc                  |  4 ++--
 src/runtime/dso_module.cc                        |  2 +-
 src/runtime/file_util.cc                         |  3 ++-
 src/runtime/file_util.h                          |  2 +-
 src/runtime/graph/graph_runtime.cc               |  5 ++++-
 src/runtime/meta_data.h                          |  2 +-
 src/runtime/metal/metal_device_api.mm            |  2 +-
 src/runtime/metal/metal_module.mm                |  4 ++--
 src/runtime/module.cc                            |  2 +-
 src/runtime/module_util.cc                       |  3 ++-
 src/runtime/ndarray.cc                           |  2 +-
 src/runtime/opencl/aocl/aocl_device_api.cc       |  2 +-
 src/runtime/opencl/aocl/aocl_module.cc           |  4 ++--
 src/runtime/opencl/opencl_device_api.cc          |  2 +-
 src/runtime/opencl/opencl_module.cc              |  4 ++--
 src/runtime/opencl/sdaccel/sdaccel_device_api.cc |  2 +-
 src/runtime/opencl/sdaccel/sdaccel_module.cc     |  4 ++--
 src/runtime/opengl/opengl_device_api.cc          |  4 ++--
 src/runtime/opengl/opengl_module.cc              |  4 ++--
 src/runtime/registry.cc                          |  2 +-
 src/runtime/rocm/rocm_device_api.cc              |  2 +-
 src/runtime/rocm/rocm_module.cc                  |  4 ++--
 src/runtime/rpc/rpc_device_api.cc                |  2 +-
 src/runtime/rpc/rpc_event_impl.cc                |  2 +-
 src/runtime/rpc/rpc_module.cc                    |  2 +-
 src/runtime/rpc/rpc_session.cc                   |  4 +++-
 src/runtime/rpc/rpc_socket_impl.cc               |  2 +-
 src/runtime/sgx/trusted/runtime.cc               |  6 +++---
 src/runtime/stackvm/stackvm.cc                   |  2 +-
 src/runtime/stackvm/stackvm_module.cc            |  2 +-
 src/runtime/stackvm/stackvm_module.h             |  2 +-
 src/runtime/system_lib_module.cc                 |  2 +-
 src/runtime/vulkan/vulkan_device_api.cc          |  2 +-
 src/runtime/vulkan/vulkan_module.cc              |  4 ++--
 src/runtime/workspace_pool.cc                    |  2 +-
 src/schedule/bound.cc                            |  4 ++--
 src/schedule/graph.cc                            |  2 +-
 src/schedule/message_passing.cc                  |  2 +-
 src/schedule/schedule_dataflow_rewrite.cc        |  2 +-
 src/schedule/schedule_lang.cc                    |  2 +-
 src/schedule/schedule_ops.cc                     |  2 +-
 verilog/tvm_vpi.cc                               |  4 +++-
 vta/hardware/xilinx/src/vta.cc                   |  2 +-
 vta/include/vta/runtime.h                        |  2 +-
 vta/src/pynq/pynq_driver.cc                      |  2 +-
 vta/tests/hardware/common/test_lib.cc            |  2 +-
 185 files changed, 310 insertions(+), 285 deletions(-)

diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index 93bff2762481..e12ef423ed32 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -7,9 +7,9 @@
 #ifndef TVM_API_REGISTRY_H_
 #define TVM_API_REGISTRY_H_
 
-#include "./base.h"
-#include "./packed_func_ext.h"
-#include "./runtime/registry.h"
+#include "base.h"
+#include "packed_func_ext.h"
+#include "runtime/registry.h"
 
 /*!
  * \brief Register an API function globally.
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index 6a3c395fd404..54875bbbf474 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -9,7 +9,7 @@
 #include <vector>
 #include <unordered_map>
 #include <memory>
-#include "./expr.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 3e5169ba02b8..7cd77a92d0dd 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -31,9 +31,9 @@
 #include <vector>
 #include <type_traits>
 #include <string>
-#include "./ir.h"
-#include "./base.h"
-#include "./packed_func_ext.h"
+#include "ir.h"
+#include "base.h"
+#include "packed_func_ext.h"
 
 namespace tvm {
 /*!
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 1d7cf8add3ca..2c5c5ad54875 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -12,7 +12,7 @@
 #include <string>
 #include <memory>
 #include <functional>
-#include "./runtime/registry.h"
+#include "runtime/registry.h"
 
 namespace tvm {
 
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 41fa1fa804a8..0f591299718e 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -9,8 +9,8 @@
 #include <tvm/container.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 96b876fe92f0..5dc832041410 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -9,9 +9,9 @@
 #include <string>
 #include <vector>
 #include <utility>
-#include "./runtime/packed_func.h"
-#include "./schedule_pass.h"
-#include "./lowered_func.h"
+#include "runtime/packed_func.h"
+#include "schedule_pass.h"
+#include "lowered_func.h"
 
 namespace tvm {
 using namespace tvm::runtime;
diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h
index 6f15ef9a3e80..027a3952d9d4 100644
--- a/include/tvm/c_dsl_api.h
+++ b/include/tvm/c_dsl_api.h
@@ -14,7 +14,7 @@
 #ifndef TVM_C_DSL_API_H_
 #define TVM_C_DSL_API_H_
 
-#include "./runtime/c_runtime_api.h"
+#include "runtime/c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index 6b5116a143cc..fca88de6a238 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -7,11 +7,11 @@
 #define TVM_CODEGEN_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./lowered_func.h"
-#include "./api_registry.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "lowered_func.h"
+#include "api_registry.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 /*! \brief namespace for lowlevel IR pass and codegen */
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 8c789f8df1dc..64a112d05518 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -11,8 +11,8 @@
 #include <ir/IRPrinter.h>
 #include <string>
 #include <algorithm>
-#include "./base.h"
-#include "./runtime/c_runtime_api.h"
+#include "base.h"
+#include "runtime/c_runtime_api.h"
 
 namespace tvm {
 
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 646824332902..f73533439dba 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -10,9 +10,9 @@
 #include <ir/IR.h>
 #include <type_traits>
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./runtime/util.h"
+#include "base.h"
+#include "expr.h"
+#include "runtime/util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 3784608c8da1..a9845fdfc898 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -7,7 +7,7 @@
 #define TVM_IR_FUNCTOR_EXT_H_
 
 #include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index b8aae3638149..35c82e9f16c1 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -8,8 +8,8 @@
 
 #include <tvm/ir_functor.h>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir.h"
+#include "expr.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index 947c3b736d80..e809b06e49b5 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -7,8 +7,8 @@
 #define TVM_IR_OPERATOR_H_
 
 #include <algorithm>
-#include "./expr.h"
-#include "./ir.h"
+#include "expr.h"
+#include "ir.h"
 
 namespace tvm {
 
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index d875621a3f5e..cf20dfa1e9f3 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -14,10 +14,10 @@
 #include <unordered_map>
 #include <vector>
 #include <string>
-#include "./expr.h"
-#include "./buffer.h"
-#include "./schedule.h"
-#include "./lowered_func.h"
+#include "expr.h"
+#include "buffer.h"
+#include "schedule.h"
+#include "lowered_func.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 8919b0f7a5c2..4b2887b28885 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -7,7 +7,7 @@
 #define TVM_IR_VISITOR_H_
 
 #include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 19f7e27f1c75..acb9813339f8 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -11,9 +11,9 @@
 #include <ir/FunctionBase.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
 
 namespace tvm {
 
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index ed8be6e4a7c0..c11242c0a55d 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -9,12 +9,12 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./schedule.h"
-#include "./arithmetic.h"
-#include "./buffer.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "schedule.h"
+#include "arithmetic.h"
+#include "buffer.h"
 
 namespace tvm {
 
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 95964547ef8e..78351e094e69 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -12,10 +12,10 @@
 #include <memory>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 using runtime::TVMArgs;
diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index 60e284610494..f55748e38289 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -10,7 +10,7 @@
 #ifndef TVM_RUNTIME_C_BACKEND_API_H_
 #define TVM_RUNTIME_C_BACKEND_API_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 3458c143e662..0b91deafd9c0 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -7,8 +7,8 @@
 #define TVM_RUNTIME_DEVICE_API_H_
 
 #include <string>
-#include "./packed_func.h"
-#include "./c_runtime_api.h"
+#include "packed_func.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index de0b02500b6d..3a98820b76f3 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -13,7 +13,7 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -173,5 +173,5 @@ inline const ModuleNode* Module::operator->() const {
 }  // namespace runtime
 }  // namespace tvm
 
-#include "./packed_func.h"
+#include "packed_func.h"
 #endif  // TVM_RUNTIME_MODULE_H_
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index c288ce5f3adb..0b7c3b49ccac 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -9,8 +9,8 @@
 #include <atomic>
 #include <vector>
 #include <utility>
-#include "./c_runtime_api.h"
-#include "./serializer.h"
+#include "c_runtime_api.h"
+#include "serializer.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 758d03b5b18b..1c873a5ebccc 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -14,9 +14,9 @@
 #include <limits>
 #include <memory>
 #include <type_traits>
-#include "./c_runtime_api.h"
-#include "./module.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "module.h"
+#include "ndarray.h"
 
 namespace HalideIR {
 // Forward declare type for extensions
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 2a328c8086e0..c10a03e0604a 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -27,7 +27,7 @@
 
 #include <string>
 #include <vector>
-#include "./packed_func.h"
+#include "packed_func.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h
index b2ab5483a22d..e9a7d1db50ec 100644
--- a/include/tvm/runtime/serializer.h
+++ b/include/tvm/runtime/serializer.h
@@ -9,8 +9,8 @@
 
 #include <dmlc/io.h>
 #include <dmlc/serializer.h>
-#include "./c_runtime_api.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "ndarray.h"
 
 namespace dmlc {
 namespace serializer {
diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 7fa62be912be..5fd130737158 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -6,7 +6,7 @@
 #ifndef TVM_RUNTIME_UTIL_H_
 #define TVM_RUNTIME_UTIL_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h
index deaf74ccf222..b72eb7105faa 100644
--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -7,10 +7,10 @@
 #define TVM_SCHEDULE_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./tensor_intrin.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "tensor_intrin.h"
 
 namespace tvm {
 
diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h
index cd248f8b9b96..e2b4462b8d73 100644
--- a/include/tvm/schedule_pass.h
+++ b/include/tvm/schedule_pass.h
@@ -10,8 +10,8 @@
 #ifndef TVM_SCHEDULE_PASS_H_
 #define TVM_SCHEDULE_PASS_H_
 
-#include "./base.h"
-#include "./schedule.h"
+#include "base.h"
+#include "schedule.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/include/tvm/target_info.h b/include/tvm/target_info.h
index 8569f188a4ab..338749cf832e 100644
--- a/include/tvm/target_info.h
+++ b/include/tvm/target_info.h
@@ -7,8 +7,8 @@
 #define TVM_TARGET_INFO_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index 1a6338d9058c..ddccfce2fefb 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -12,9 +12,9 @@
 #include <vector>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./arithmetic.h"
+#include "base.h"
+#include "expr.h"
+#include "arithmetic.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index bd3fd11021b4..fa8c895ccb08 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -7,8 +7,8 @@
 #define TVM_TENSOR_INTRIN_H_
 
 #include <string>
-#include "./tensor.h"
-#include "./buffer.h"
+#include "tensor.h"
+#include "buffer.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tvm.h b/include/tvm/tvm.h
index 7e9c4305ffbb..645c68357f13 100644
--- a/include/tvm/tvm.h
+++ b/include/tvm/tvm.h
@@ -6,11 +6,11 @@
 #ifndef TVM_TVM_H_
 #define TVM_TVM_H_
 
-#include "./base.h"
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./operation.h"
-#include "./packed_func_ext.h"
+#include "base.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "operation.h"
+#include "packed_func_ext.h"
 
 #endif  // TVM_TVM_H_
diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h
index 1d3b662ff0b8..3f8a2a3642b1 100644
--- a/nnvm/include/nnvm/graph.h
+++ b/nnvm/include/nnvm/graph.h
@@ -12,9 +12,9 @@
 #include <algorithm>
 #include <unordered_map>
 #include <unordered_set>
-#include "./base.h"
-#include "./node.h"
-#include "./symbolic.h"
+#include "base.h"
+#include "node.h"
+#include "symbolic.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/graph_attr_types.h b/nnvm/include/nnvm/graph_attr_types.h
index 2bd998fedfbb..2fe82c9a7de0 100644
--- a/nnvm/include/nnvm/graph_attr_types.h
+++ b/nnvm/include/nnvm/graph_attr_types.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <string>
-#include "./tuple.h"
-#include "./layout.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/node.h b/nnvm/include/nnvm/node.h
index 57afb0c5587a..ae782f04965e 100644
--- a/nnvm/include/nnvm/node.h
+++ b/nnvm/include/nnvm/node.h
@@ -10,9 +10,9 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./base.h"
-#include "./op.h"
-#include "./c_api.h"
+#include "base.h"
+#include "op.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h
index 5bdfcaca169d..9d171bbdb2bc 100644
--- a/nnvm/include/nnvm/op.h
+++ b/nnvm/include/nnvm/op.h
@@ -13,8 +13,8 @@
 #include <typeinfo>
 #include <limits>
 #include <functional>
-#include "./base.h"
-#include "./c_api.h"
+#include "base.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op_attr_types.h b/nnvm/include/nnvm/op_attr_types.h
index b7f6be408a16..abed19f9bc7d 100644
--- a/nnvm/include/nnvm/op_attr_types.h
+++ b/nnvm/include/nnvm/op_attr_types.h
@@ -10,10 +10,10 @@
 #include <string>
 #include <utility>
 #include <functional>
-#include "./base.h"
-#include "./node.h"
-#include "./tuple.h"
-#include "./layout.h"
+#include "base.h"
+#include "node.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass.h b/nnvm/include/nnvm/pass.h
index 016d5ee2a763..2e8db6111887 100644
--- a/nnvm/include/nnvm/pass.h
+++ b/nnvm/include/nnvm/pass.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <functional>
-#include "./base.h"
-#include "./graph.h"
+#include "base.h"
+#include "graph.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass_functions.h b/nnvm/include/nnvm/pass_functions.h
index 4c29e09d813a..5a98dd456fb2 100644
--- a/nnvm/include/nnvm/pass_functions.h
+++ b/nnvm/include/nnvm/pass_functions.h
@@ -13,9 +13,9 @@
 #include <string>
 #include <memory>
 #include <vector>
-#include "./base.h"
-#include "./pass.h"
-#include "./graph_attr_types.h"
+#include "base.h"
+#include "pass.h"
+#include "graph_attr_types.h"
 
 namespace nnvm {
 namespace pass {
diff --git a/nnvm/include/nnvm/symbolic.h b/nnvm/include/nnvm/symbolic.h
index ebb2ab5d30d0..42cf5dd775c2 100644
--- a/nnvm/include/nnvm/symbolic.h
+++ b/nnvm/include/nnvm/symbolic.h
@@ -15,8 +15,8 @@
 #include <tuple>
 #include <utility>
 
-#include "./base.h"
-#include "./node.h"
+#include "base.h"
+#include "node.h"
 
 namespace nnvm {
 /*!
diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
index 865024733494..143a9548f18a 100644
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -11,7 +11,7 @@
 #include <nnvm/tuple.h>
 #include <nnvm/layout.h>
 #include <string>
-#include "./tensor.h"
+#include "tensor.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h
index 7e83aecc11f0..36b8ef13c74a 100644
--- a/nnvm/include/nnvm/tuple.h
+++ b/nnvm/include/nnvm/tuple.h
@@ -12,7 +12,7 @@
 #include <utility>
 #include <iostream>
 #include <string>
-#include "./base.h"
+#include "base.h"
 
 namespace nnvm {
 
diff --git a/nnvm/src/c_api/c_api_error.cc b/nnvm/src/c_api/c_api_error.cc
index 399268667ddd..fd91bfb8b306 100644
--- a/nnvm/src/c_api/c_api_error.cc
+++ b/nnvm/src/c_api/c_api_error.cc
@@ -4,7 +4,7 @@
  * \brief C error handling
  */
 #include <dmlc/thread_local.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 struct ErrorEntry {
   std::string last_error;
diff --git a/nnvm/src/c_api/c_api_graph.cc b/nnvm/src/c_api/c_api_graph.cc
index 831aaec33e8c..a0e84aef4482 100644
--- a/nnvm/src/c_api/c_api_graph.cc
+++ b/nnvm/src/c_api/c_api_graph.cc
@@ -9,7 +9,7 @@
 #include <nnvm/graph.h>
 #include <nnvm/pass.h>
 #include <dmlc/json.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/c_api/c_api_symbolic.cc b/nnvm/src/c_api/c_api_symbolic.cc
index 9f62dbd80b0c..e175cfc7da25 100644
--- a/nnvm/src/c_api/c_api_symbolic.cc
+++ b/nnvm/src/c_api/c_api_symbolic.cc
@@ -6,7 +6,7 @@
 #include <nnvm/c_api.h>
 #include <nnvm/op.h>
 #include <nnvm/symbolic.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/compiler/alter_op_layout.cc b/nnvm/src/compiler/alter_op_layout.cc
index bf28df3d04f8..b02655fc8925 100644
--- a/nnvm/src/compiler/alter_op_layout.cc
+++ b/nnvm/src/compiler/alter_op_layout.cc
@@ -12,8 +12,8 @@
 #include <tvm/tvm.h>
 #include <algorithm>
 #include <functional>
-#include "./compile_engine.h"
-#include "./graph_transform.h"
+#include "compile_engine.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc
index a9d4aa2d016a..b9b27621840c 100644
--- a/nnvm/src/compiler/compile_engine.cc
+++ b/nnvm/src/compiler/compile_engine.cc
@@ -11,8 +11,11 @@
 #include <nnvm/pass_functions.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <mutex>
-#include "./graph_hash.h"
-#include "./compile_engine.h"
+#include <tuple>
+#include <vector>
+#include <limits>
+#include "graph_hash.h"
+#include "compile_engine.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h
index d84fe2facbd3..7696b3b5f4eb 100644
--- a/nnvm/src/compiler/compile_engine.h
+++ b/nnvm/src/compiler/compile_engine.h
@@ -18,7 +18,7 @@
 #include <tvm/lowered_func.h>
 #include <string>
 #include <utility>
-#include "./graph_hash.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/fold_scale_axis.cc b/nnvm/src/compiler/fold_scale_axis.cc
index e38082b69916..639aba602589 100644
--- a/nnvm/src/compiler/fold_scale_axis.cc
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./pattern_util.h"
-#include "./graph_transform.h"
+#include "pattern_util.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 4999d93d1861..c9ea58affb2c 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -14,10 +14,11 @@
 #include <nnvm/tuple.h>
 #include <tvm/lowered_func.h>
 #include <tvm/runtime/packed_func.h>
+#include <limits>
 
-#include "./graph_fuse.h"
-#include "./graph_runtime.h"
-#include "./pattern_util.h"
+#include "graph_fuse.h"
+#include "graph_runtime.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index ccd2e3ce433f..ca68727ea067 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -10,8 +10,10 @@
 #include <tvm/ir.h>
 #include <tvm/runtime/packed_func.h>
 #include <functional>
-#include "./node_attr.h"
-#include "./graph_hash.h"
+#include <vector>
+#include <algorithm>
+#include "node_attr.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index 7301fd74117e..bc4a803681e4 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -4,7 +4,7 @@
  * \brief Interface code with TVM graph runtime.
 */
 #include <dmlc/memory_io.h>
-#include "./graph_runtime.h"
+#include "graph_runtime.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
index d549f9e2004f..64846fc8e247 100644
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -9,7 +9,7 @@
 #include <nnvm/compiler/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./node_attr.h"
+#include "node_attr.h"
 #include "compile_engine.h"
 
 namespace tvm {
diff --git a/nnvm/src/compiler/simplify_inference.cc b/nnvm/src/compiler/simplify_inference.cc
index a0782222aa06..bf00bcb5a894 100644
--- a/nnvm/src/compiler/simplify_inference.cc
+++ b/nnvm/src/compiler/simplify_inference.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./graph_transform.h"
-#include "./pattern_util.h"
+#include "graph_transform.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc
index 51448bcf1065..6c2fc0d087ea 100644
--- a/nnvm/src/pass/plan_memory.cc
+++ b/nnvm/src/pass/plan_memory.cc
@@ -8,7 +8,7 @@
 #include <nnvm/graph_attr_types.h>
 #include <nnvm/op_attr_types.h>
 #include <memory>
-#include "./graph_algorithm.h"
+#include "graph_algorithm.h"
 
 namespace nnvm {
 namespace pass {
diff --git a/nnvm/src/top/elemwise_op_common.h b/nnvm/src/top/elemwise_op_common.h
index e5bb0adcb078..ad8fc3d54ba8 100644
--- a/nnvm/src/top/elemwise_op_common.h
+++ b/nnvm/src/top/elemwise_op_common.h
@@ -12,7 +12,7 @@
 #include <vector>
 #include <utility>
 #include <functional>
-#include "./op_common.h"
+#include "op_common.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 229d4ac30f78..d5c9c18f68a6 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -12,7 +12,7 @@
 #include <tvm/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/tvm.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn.h"
diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index 322d77b6d032..0b5a11fdd096 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -12,7 +12,7 @@
 #include <nnvm/op_attr_types.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/dense.h"
diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc
index 8b9b7a64aa0d..6a53e1994fc1 100644
--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -10,7 +10,7 @@
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/compiler/util.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/pooling.h"
diff --git a/nnvm/src/top/nn/upsampling.cc b/nnvm/src/top/nn/upsampling.cc
index 6c5e13441406..f4bbeb62aa29 100644
--- a/nnvm/src/top/nn/upsampling.cc
+++ b/nnvm/src/top/nn/upsampling.cc
@@ -11,7 +11,7 @@
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/elemwise.h"
diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index ed6239961a3b..7acf79ff5308 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -6,8 +6,12 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/ir_pass.h>
-#include "./canonical.h"
-#include "./compute_expr.h"
+#include <algorithm>
+#include <map>
+#include <limits>
+#include <vector>
+#include "canonical.h"
+#include "compute_expr.h"
 #include "arithmetic/Simplify.h"
 
 namespace tvm {
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 642a866866d2..109cdc6d9146 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_functor_ext.h>
 #include <tvm/arithmetic.h>
-#include "./compute_expr.h"
+#include "compute_expr.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index c004b9666a58..c1b68fddd0e9 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_functor_ext.h>
 #include <arithmetic/Interval.h>
 #include <unordered_map>
-#include "./compute_expr.h"
-#include "./int_set_internal.h"
+#include "compute_expr.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc
index c0eee45cc395..1c03d0f97485 100644
--- a/src/arithmetic/modular.cc
+++ b/src/arithmetic/modular.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/arithmetic.h>
 #include <limits>
-#include "./int_set_internal.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 8830588758ef..506a4f7ed92c 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -5,8 +5,8 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/opencl/aocl/aocl_module.h"
 #include "../runtime/file_util.h"
 
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index ec27f41cc702..09a6c7e6ab4c 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -4,7 +4,7 @@
  */
 #include <iomanip>
 #include <cctype>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 0f14415f2af6..b36e37da54fe 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -14,7 +14,7 @@
 #include <vector>
 #include <unordered_map>
 #include <unordered_set>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 44c02830d0fc..7c8399cfc7b5 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -7,7 +7,7 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_cuda.h"
+#include "codegen_cuda.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index f5d9861ec6b2..cef2c77f9901 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc
index 37121ccb755c..3bbe98289439 100644
--- a/src/codegen/codegen_metal.cc
+++ b/src/codegen/codegen_metal.cc
@@ -5,8 +5,9 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_metal.h"
-#include "./build_common.h"
+#include <algorithm>
+#include "codegen_metal.h"
+#include "build_common.h"
 #include "../runtime/metal/metal_module.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h
index 6f8bef64bbcf..9779fb800ff9 100644
--- a/src/codegen/codegen_metal.h
+++ b/src/codegen/codegen_metal.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 2d5026e827e2..3d3de5e3bcf4 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -5,8 +5,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 #include "../runtime/opencl/opencl_module.h"
 
diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h
index 424bfa5ae2b3..90569d176a0b 100644
--- a/src/codegen/codegen_opencl.h
+++ b/src/codegen/codegen_opencl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc
index 5e750a39e598..7fd85d35409d 100644
--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
@@ -8,8 +8,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opengl.h"
-#include "./build_common.h"
+#include "codegen_opengl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h
index 3cae1e323ec4..aa1552dfcff7 100644
--- a/src/codegen/codegen_opengl.h
+++ b/src/codegen/codegen_opengl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../runtime/opengl/opengl_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_source_base.cc b/src/codegen/codegen_source_base.cc
index cf3a6ec5ab04..39a573ceec68 100644
--- a/src/codegen/codegen_source_base.cc
+++ b/src/codegen/codegen_source_base.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file codegen_source_base.cc
  */
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc
index b9f9f7505978..5776b895b4b3 100644
--- a/src/codegen/codegen_vhls.cc
+++ b/src/codegen/codegen_vhls.cc
@@ -5,8 +5,8 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_vhls.h"
-#include "./build_common.h"
+#include "codegen_vhls.h"
+#include "build_common.h"
 #include "../runtime/opencl/sdaccel/sdaccel_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h
index bcb7d6f49d8c..c0faefc75837 100644
--- a/src/codegen/codegen_vhls.h
+++ b/src/codegen/codegen_vhls.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc
index 5f15a879c2ed..822d515fb8a5 100644
--- a/src/codegen/intrin_rule.cc
+++ b/src/codegen/intrin_rule.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_default.cc
  * \brief Default intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc
index 43461a15932d..ee98a54329ab 100644
--- a/src/codegen/intrin_rule_cuda.cc
+++ b/src/codegen/intrin_rule_cuda.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_cuda.cc
  * \brief CUDA intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_metal.cc b/src/codegen/intrin_rule_metal.cc
index 3c210919132e..8b499fb9ea9b 100644
--- a/src/codegen/intrin_rule_metal.cc
+++ b/src/codegen/intrin_rule_metal.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_metal.cc
  * \brief Metal intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc
index d91deaeda5fe..1cb1aed01102 100644
--- a/src/codegen/intrin_rule_opencl.cc
+++ b/src/codegen/intrin_rule_opencl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_opengl.cc b/src/codegen/intrin_rule_opengl.cc
index e9728a25b40c..c9aa21c1a883 100644
--- a/src/codegen/intrin_rule_opengl.cc
+++ b/src/codegen/intrin_rule_opengl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_vhls.cc b/src/codegen/intrin_rule_vhls.cc
index b360142cd985..996c45707364 100644
--- a/src/codegen/intrin_rule_vhls.cc
+++ b/src/codegen/intrin_rule_vhls.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_vhls.cc
  * \brief VHLS intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index 9d1decb43227..dd2cf6714251 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/registry.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../codegen_source_base.h"
 #include "../../pass/ir_util.h"
diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc
index 18a0eb54e182..9f19fa1f47f0 100644
--- a/src/codegen/llvm/codegen_arm.cc
+++ b/src/codegen/llvm/codegen_arm.cc
@@ -4,7 +4,7 @@
  * \brief ARM specific code generator
  */
 #ifdef TVM_LLVM_VERSION
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc
index a8a2127febde..436c727f86f0 100644
--- a/src/codegen/llvm/codegen_cpu.cc
+++ b/src/codegen/llvm/codegen_cpu.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/ir_pass.h>
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 #include "../../pass/ir_util.h"
 
 namespace tvm {
diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h
index 5027dab911bd..b7a95a835d89 100644
--- a/src/codegen/llvm/codegen_cpu.h
+++ b/src/codegen/llvm/codegen_cpu.h
@@ -9,7 +9,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index ae576c981395..c1b1fe24f0a8 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -7,8 +7,8 @@
 
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./codegen_llvm.h"
-#include "./codegen_cpu.h"
+#include "codegen_llvm.h"
+#include "codegen_cpu.h"
 #include "../codegen_common.h"
 #include "../../pass/ir_util.h"
 #include "../../arithmetic/compute_expr.h"
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index 4e61247f4acf..d0cee581a0b6 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -15,7 +15,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index 1cca1eacfe85..fc5ad99119ae 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -6,7 +6,7 @@
 #ifdef TVM_LLVM_VERSION
 
 #include <tvm/runtime/device_api.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../../pass/ir_util.h"
 #include "../../runtime/cuda/cuda_module.h"
diff --git a/src/codegen/llvm/intrin_rule_llvm.cc b/src/codegen/llvm/intrin_rule_llvm.cc
index 4b2a3ca5bd02..307f0a3bc412 100644
--- a/src/codegen/llvm/intrin_rule_llvm.cc
+++ b/src/codegen/llvm/intrin_rule_llvm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_llvm.h b/src/codegen/llvm/intrin_rule_llvm.h
index 85641cb178e7..30e7674c3297 100644
--- a/src/codegen/llvm/intrin_rule_llvm.h
+++ b/src/codegen/llvm/intrin_rule_llvm.h
@@ -11,7 +11,7 @@
 #include <tvm/api_registry.h>
 #include <tvm/codegen.h>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc
index b9bee94e9c24..092eb77f8f59 100644
--- a/src/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/codegen/llvm/intrin_rule_rocm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 #include <tvm/ir.h>
 #include <tvm/expr.h>
 #include <tvm/api_registry.h>
diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc
index 01f2c8869dc1..9d1ba6b1068f 100644
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/base.h>
 #include <mutex>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 1b0e43f9c23a..54f986d628d3 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -7,8 +7,8 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./llvm_common.h"
-#include "./codegen_llvm.h"
+#include "llvm_common.h"
+#include "codegen_llvm.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/module_util.h"
 
diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc
index 69dbda49976b..c7100e18735e 100644
--- a/src/codegen/source_module.cc
+++ b/src/codegen/source_module.cc
@@ -4,7 +4,7 @@
  * \brief Source code module, only for viewing
  */
 #include <tvm/runtime/packed_func.h>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 #include "../runtime/file_util.h"
 #include "../runtime/meta_data.h"
 
diff --git a/src/codegen/spirv/build_vulkan.cc b/src/codegen/spirv/build_vulkan.cc
index 3cd1b56cda43..f5ec5628545a 100644
--- a/src/codegen/spirv/build_vulkan.cc
+++ b/src/codegen/spirv/build_vulkan.cc
@@ -8,7 +8,7 @@
 #include <dmlc/memory_io.h>
 #include <tvm/ir_pass.h>
 
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 #include "../build_common.h"
 #include "../../runtime/vulkan/vulkan_module.h"
 
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc
index 395bdff1477d..812fee4a114e 100644
--- a/src/codegen/spirv/codegen_spirv.cc
+++ b/src/codegen/spirv/codegen_spirv.cc
@@ -5,8 +5,9 @@
  */
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
+#include <string>
 #include "../codegen_common.h"
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h
index a6c09362ddf7..6a43182f7f2e 100644
--- a/src/codegen/spirv/codegen_spirv.h
+++ b/src/codegen/spirv/codegen_spirv.h
@@ -12,7 +12,7 @@
 
 #include <vector>
 
-#include "./ir_builder.h"
+#include "ir_builder.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc
index eb7a67228e60..41cb48c5854b 100644
--- a/src/codegen/spirv/ir_builder.cc
+++ b/src/codegen/spirv/ir_builder.cc
@@ -3,7 +3,7 @@
  * \file ir_builder.cc
  * \brief IRBuilder for SPIRV block
  */
-#include "./ir_builder.h"
+#include "ir_builder.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/stackvm/codegen_stackvm.cc b/src/codegen/stackvm/codegen_stackvm.cc
index 517793ff14a3..0bede2dc0751 100644
--- a/src/codegen/stackvm/codegen_stackvm.cc
+++ b/src/codegen/stackvm/codegen_stackvm.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/packed_func_ext.h>
 #include <limits>
-#include "./codegen_stackvm.h"
+#include "codegen_stackvm.h"
 #include "../../runtime/stackvm/stackvm_module.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/codegen_verilog.cc b/src/codegen/verilog/codegen_verilog.cc
index a4887390ad5d..d7e149257fdb 100644
--- a/src/codegen/verilog/codegen_verilog.cc
+++ b/src/codegen/verilog/codegen_verilog.cc
@@ -6,7 +6,7 @@
 #include <cctype>
 #include <sstream>
 #include <iostream>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/codegen_verilog.h b/src/codegen/verilog/codegen_verilog.h
index 7c8b811c2fa6..a38640ac3799 100644
--- a/src/codegen/verilog/codegen_verilog.h
+++ b/src/codegen/verilog/codegen_verilog.h
@@ -14,7 +14,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./verilog_ir.h"
+#include "verilog_ir.h"
 #include "../codegen_source_base.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc
index 1a03fc881665..b7576c83dfa8 100644
--- a/src/codegen/verilog/verilog_ir.cc
+++ b/src/codegen/verilog/verilog_ir.cc
@@ -5,7 +5,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./verilog_ir.h"
+#include <utility>
+#include "verilog_ir.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/verilog_module.cc b/src/codegen/verilog/verilog_module.cc
index 0319d6e6556c..0670a02e34ac 100644
--- a/src/codegen/verilog/verilog_module.cc
+++ b/src/codegen/verilog/verilog_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/meta_data.h"
 
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
index d53a12962fd7..656630351cf5 100644
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -10,7 +10,7 @@
 #include <unordered_map>
 #include <map>
 #include <queue>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/verilog/vpi_session.cc b/src/codegen/verilog/vpi_session.cc
index 6fbbbc01d32b..ac2861e8f74f 100644
--- a/src/codegen/verilog/vpi_session.cc
+++ b/src/codegen/verilog/vpi_session.cc
@@ -4,7 +4,7 @@
  * \brief IPC session call to verilog simulator via VPI.
  */
 #include <tvm/api_registry.h>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc
index 9ca02118aeb3..f658a1fe96d4 100644
--- a/src/contrib/nnpack/convolution.cc
+++ b/src/contrib/nnpack/convolution.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc
index df6356d933aa..ad2569e1f2e1 100644
--- a/src/contrib/nnpack/fully_connected.cc
+++ b/src/contrib/nnpack/fully_connected.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc
index 631f25b36647..3220d7af339f 100644
--- a/src/contrib/nnpack/nnpack_utils.cc
+++ b/src/contrib/nnpack/nnpack_utils.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file Use external nnpack library call.
  */
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/random/random.cc b/src/contrib/random/random.cc
index 27e2b065a01b..68821fe04124 100644
--- a/src/contrib/random/random.cc
+++ b/src/contrib/random/random.cc
@@ -8,9 +8,9 @@
 #include <dmlc/thread_local.h>
 #include <algorithm>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./mt_random_engine.cc"
+#include "mt_random_engine.cc"
 #else
-#include "./sgx_random_engine.cc"
+#include "sgx_random_engine.cc"
 #endif
 
 #define DLPACK_INTEGER_TYPE_SWITCH(type, DType, ...)    \
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 366ea2c78fe6..267a25ff372b 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -9,8 +9,9 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include <string>
+#include "compute_op.h"
+#include "op_util.h"
 #include "../schedule/message_passing.h"
 
 namespace tvm {
diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc
index eb320388860a..c4599dee9bd8 100644
--- a/src/op/cross_thread_reduction.cc
+++ b/src/op/cross_thread_reduction.cc
@@ -4,8 +4,8 @@
  * \file cross_thread_reduction.cc
  */
 #include <tvm/ir_pass.h>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include "compute_op.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc
index 759e258e90ef..86c1d5e74527 100644
--- a/src/op/extern_op.cc
+++ b/src/op/extern_op.cc
@@ -7,7 +7,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <unordered_set>
-#include "./op_util.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
diff --git a/src/op/op_util.cc b/src/op/op_util.cc
index 4f34d8d972ce..ba83997a0a16 100644
--- a/src/op/op_util.cc
+++ b/src/op/op_util.cc
@@ -7,7 +7,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
-#include "./op_util.h"
+#include <string>
+#include "op_util.h"
 #include "../schedule/message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc
index 626c8eba46b4..d03601709ab4 100644
--- a/src/op/scan_op.cc
+++ b/src/op/scan_op.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./op_util.h"
+#include "op_util.h"
 #include "../schedule/graph.h"
 
 namespace tvm {
diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc
index 148ad0f90fe7..6423c4e942e4 100644
--- a/src/op/tensorize.cc
+++ b/src/op/tensorize.cc
@@ -7,8 +7,8 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
-#include "./op_util.h"
-#include "./compute_op.h"
+#include "op_util.h"
+#include "compute_op.h"
 #include "../schedule/message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc
index 390c918d9692..0fac313c079b 100644
--- a/src/pass/arg_binder.cc
+++ b/src/pass/arg_binder.cc
@@ -6,8 +6,8 @@
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
 #include <tvm/runtime/device_api.h>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/coproc_sync.cc b/src/pass/coproc_sync.cc
index b3e64a989702..13dfef107e87 100644
--- a/src/pass/coproc_sync.cc
+++ b/src/pass/coproc_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc
index 03ffdb01e107..1384ea1a89ac 100644
--- a/src/pass/inject_double_buffer.cc
+++ b/src/pass/inject_double_buffer.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/ir_mutator.cc b/src/pass/ir_mutator.cc
index 9ca9ccd190ff..e82c4f554be0 100644
--- a/src/pass/ir_mutator.cc
+++ b/src/pass/ir_mutator.cc
@@ -5,7 +5,7 @@
 #include <tvm/ir.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/packed_func_ext.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/ir_util.cc b/src/pass/ir_util.cc
index 579706ca9964..d06839beca33 100644
--- a/src/pass/ir_util.cc
+++ b/src/pass/ir_util.cc
@@ -3,7 +3,7 @@
  * \file ir_util.cc
  * \brief Helper functions to construct and compose IR nodes.
  */
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lift_attr_scope.cc b/src/pass/lift_attr_scope.cc
index a3a60aaac4d1..d5fd53812b99 100644
--- a/src/pass/lift_attr_scope.cc
+++ b/src/pass/lift_attr_scope.cc
@@ -7,7 +7,7 @@
  */
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc
index 33ac6a94ecf7..b38051326d1d 100644
--- a/src/pass/lower_intrin.cc
+++ b/src/pass/lower_intrin.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lower_thread_allreduce.cc b/src/pass/lower_thread_allreduce.cc
index 8c0eb037d953..4d7f086d0534 100644
--- a/src/pass/lower_thread_allreduce.cc
+++ b/src/pass/lower_thread_allreduce.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index a63fef07bd12..46686a65803a 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc
index 8f153fd61188..85ae365f2a82 100644
--- a/src/pass/lower_warp_memory.cc
+++ b/src/pass/lower_warp_memory.cc
@@ -13,7 +13,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 206bd95010ce..8113c58f3f78 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -12,8 +12,8 @@
 #include <utility>
 #include <unordered_set>
 
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/narrow_channel_access.cc b/src/pass/narrow_channel_access.cc
index 733eeffb632e..7faf7d1b173e 100644
--- a/src/pass/narrow_channel_access.cc
+++ b/src/pass/narrow_channel_access.cc
@@ -11,7 +11,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/channel.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/split_pipeline.cc b/src/pass/split_pipeline.cc
index 38bd5f86fd68..0dd5bd65106f 100644
--- a/src/pass/split_pipeline.cc
+++ b/src/pass/split_pipeline.cc
@@ -11,7 +11,7 @@
 #include <tvm/channel.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/storage_access.cc b/src/pass/storage_access.cc
index 09be1a53da42..e7adcc75854f 100644
--- a/src/pass/storage_access.cc
+++ b/src/pass/storage_access.cc
@@ -5,8 +5,9 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/target_info.h>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include <string>
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index f5cb98495ff9..28a6ace9bfa6 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -14,8 +14,8 @@
 #include <tvm/target_info.h>
 #include <tvm/runtime/device_api.h>
 #include <unordered_map>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 877216ed7656..58b62f291d39 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -12,7 +12,7 @@
 #include <map>
 #include <unordered_set>
 #include <unordered_map>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/storage_sync.cc b/src/pass/storage_sync.cc
index 6e2d1020a6b5..43f3b94d114f 100644
--- a/src/pass/storage_sync.cc
+++ b/src/pass/storage_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/runtime/c_dsl_api.cc b/src/runtime/c_dsl_api.cc
index 6ae8b9911a4c..ae39a1266d06 100644
--- a/src/runtime/c_dsl_api.cc
+++ b/src/runtime/c_dsl_api.cc
@@ -5,8 +5,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/c_dsl_api.h>
-#include "./dsl_api.h"
-#include "./runtime_base.h"
+#include "dsl_api.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 916dfadecb4c..a081a4c1df11 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -17,7 +17,7 @@
 #include <algorithm>
 #include <string>
 #include <cstdlib>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index e3434e01813e..d166a3a43dfa 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <cstdlib>
 #include <cstring>
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 98accdf1b0aa..8309b45a7963 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -8,7 +8,7 @@
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
 #include <cuda_runtime.h>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index a0e613107bae..4984517b16c6 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file cuda_module.cc
  */
-#include "./cuda_module.h"
+#include "cuda_module.h"
 
 #include <tvm/runtime/registry.h>
 #include <cuda.h>
@@ -11,7 +11,7 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc
index 60fdb427c246..fe7c362472d1 100644
--- a/src/runtime/dso_module.cc
+++ b/src/runtime/dso_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/packed_func.h>
-#include "./module_util.h"
+#include "module_util.h"
 
 #if defined(_WIN32)
 #include <windows.h>
diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc
index 7606bf89cd92..4df335a54f25 100644
--- a/src/runtime/file_util.cc
+++ b/src/runtime/file_util.cc
@@ -6,8 +6,9 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/serializer.h>
 #include <fstream>
+#include <vector>
 
-#include "./file_util.h"
+#include "file_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h
index b3357271856e..de520fa3158c 100644
--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -7,7 +7,7 @@
 #define TVM_RUNTIME_FILE_UTIL_H_
 
 #include <string>
-#include "./meta_data.h"
+#include "meta_data.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 7a75771af23b..34bde9a89e36 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -8,7 +8,10 @@
 #include <dmlc/memory_io.h>
 #include <dmlc/json.h>
 #include <numeric>
-#include "./graph_runtime.h"
+#include <algorithm>
+#include <vector>
+#include <functional>
+#include "graph_runtime.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 381bf9f60c79..40d08015e8cd 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -11,7 +11,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <string>
 #include <vector>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 47c2899cea71..fcdbf13138a8 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./metal_common.h"
+#include "metal_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index c79e2cf11ac5..c538957ca561 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./metal_module.h"
-#include "./metal_common.h"
+#include "metal_module.h"
+#include "metal_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index dbddfde44733..80dc1f3172f8 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -9,7 +9,7 @@
 #include <unordered_set>
 #include <cstring>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./file_util.h"
+#include "file_util.h"
 #endif
 
 namespace tvm {
diff --git a/src/runtime/module_util.cc b/src/runtime/module_util.cc
index 95da78d23f09..0c6d8ae4058d 100644
--- a/src/runtime/module_util.cc
+++ b/src/runtime/module_util.cc
@@ -8,7 +8,8 @@
 #endif
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
-#include "./module_util.h"
+#include <string>
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 424a2b09cb15..04c178f25dfa 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -7,7 +7,7 @@
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 // deleter for arrays used by DLPack exporter
 extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc
index e9cbc6b4cda0..61f636df6039 100644
--- a/src/runtime/opencl/aocl/aocl_device_api.cc
+++ b/src/runtime/opencl/aocl/aocl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./aocl_common.h"
+#include "aocl_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/aocl/aocl_module.cc b/src/runtime/opencl/aocl/aocl_module.cc
index a056c5cee671..bbf2828fbd79 100644
--- a/src/runtime/opencl/aocl/aocl_module.cc
+++ b/src/runtime/opencl/aocl/aocl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./aocl_common.h"
-#include "./aocl_module.h"
+#include "aocl_common.h"
+#include "aocl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index ac9373f1375b..f1e224e5a9d1 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./opencl_common.h"
+#include "opencl_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 3efd789513ba..6d392036cc56 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./opencl_common.h"
-#include "./opencl_module.h"
+#include "opencl_common.h"
+#include "opencl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
index 4b057b7e009a..bc98759b9b3f 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./sdaccel_common.h"
+#include "sdaccel_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.cc b/src/runtime/opencl/sdaccel/sdaccel_module.cc
index c99e78c8e347..de9a710fbfe8 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_module.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./sdaccel_common.h"
-#include "./sdaccel_module.h"
+#include "sdaccel_common.h"
+#include "sdaccel_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index 3a21ed6e6d07..191b64b6ce0a 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <cstring>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_module.cc b/src/runtime/opengl/opengl_module.cc
index d800af95f053..976227a2924b 100644
--- a/src/runtime/opengl/opengl_module.cc
+++ b/src/runtime/opengl/opengl_module.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <utility>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../file_util.h"
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index 3f72828390ee..d7bbc3ce9996 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -10,7 +10,7 @@
 #include <mutex>
 #include <memory>
 #include <array>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 6aff5e56c715..355200a0cbb0 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -10,7 +10,7 @@
 #include <tvm/runtime/registry.h>
 #include <hip/hip_runtime_api.h>
 #include <hsa/hsa.h>
-#include "./rocm_common.h"
+#include "rocm_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc
index 503b04872c82..da3b04f66c49 100644
--- a/src/runtime/rocm/rocm_module.cc
+++ b/src/runtime/rocm/rocm_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./rocm_module.h"
-#include "./rocm_common.h"
+#include "rocm_module.h"
+#include "rocm_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index 5740a393c253..4242f8e1ae58 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -5,7 +5,7 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/device_api.h>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_event_impl.cc b/src/runtime/rpc/rpc_event_impl.cc
index fc5ecca1f421..e553c6fad4a0 100644
--- a/src/runtime/rpc/rpc_event_impl.cc
+++ b/src/runtime/rpc/rpc_event_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index d6c56e1b7cf4..80a8cc93ce19 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/registry.h>
 #include <memory>
 #include <cstring>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 6bb01b9bd459..0e2d637ab475 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -11,7 +11,9 @@
 #include <array>
 #include <string>
 #include <chrono>
-#include "./rpc_session.h"
+#include <vector>
+#include <utility>
+#include "rpc_session.h"
 #include "../../common/ring_buffer.h"
 
 namespace tvm {
diff --git a/src/runtime/rpc/rpc_socket_impl.cc b/src/runtime/rpc/rpc_socket_impl.cc
index 22f221d46526..6b2fa6c1f608 100644
--- a/src/runtime/rpc/rpc_socket_impl.cc
+++ b/src/runtime/rpc/rpc_socket_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 #include "../../common/socket.h"
 
 namespace tvm {
diff --git a/src/runtime/sgx/trusted/runtime.cc b/src/runtime/sgx/trusted/runtime.cc
index a863327f956c..b7f66efbc97c 100644
--- a/src/runtime/sgx/trusted/runtime.cc
+++ b/src/runtime/sgx/trusted/runtime.cc
@@ -12,9 +12,9 @@
 #include "../../system_lib_module.cc"
 #include "../../thread_pool.cc"
 #include "../../workspace_pool.cc"
-#include "./ecall_registry.h"
-#include "./runtime.h"
-#include "./threading_backend.cc"
+#include "ecall_registry.h"
+#include "runtime.h"
+#include "threading_backend.cc"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc
index f86bfec087e4..f45d83027467 100644
--- a/src/runtime/stackvm/stackvm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -7,7 +7,7 @@
 #include <tvm/runtime/util.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <algorithm>
-#include "./stackvm.h"
+#include "stackvm.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc
index 71ca9ba6c09a..7256c47862e5 100644
--- a/src/runtime/stackvm/stackvm_module.cc
+++ b/src/runtime/stackvm/stackvm_module.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/module.h>
 #include <dmlc/memory_io.h>
-#include "./stackvm_module.h"
+#include "stackvm_module.h"
 #include "../file_util.h"
 #include "../module_util.h"
 
diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h
index fcd51a64f870..918228faea1f 100644
--- a/src/runtime/stackvm/stackvm_module.h
+++ b/src/runtime/stackvm/stackvm_module.h
@@ -8,7 +8,7 @@
 
 #include <tvm/runtime/packed_func.h>
 #include <string>
-#include "./stackvm.h"
+#include "stackvm.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/system_lib_module.cc b/src/runtime/system_lib_module.cc
index 01ff99d7da87..ed48cb1a9d44 100644
--- a/src/runtime/system_lib_module.cc
+++ b/src/runtime/system_lib_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <mutex>
-#include "./module_util.h"
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 45f8549d54f2..cc89804806d2 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 #include <cstring>
-#include "./vulkan_common.h"
+#include "vulkan_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_module.cc b/src/runtime/vulkan/vulkan_module.cc
index b5425dd8fbc5..134c5fa45ba4 100644
--- a/src/runtime/vulkan/vulkan_module.cc
+++ b/src/runtime/vulkan/vulkan_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./vulkan_common.h"
-#include "./vulkan_module.h"
+#include "vulkan_common.h"
+#include "vulkan_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc
index c903a8621206..d43b4641192c 100644
--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -3,7 +3,7 @@
  * \file workspace_pool.h
  * \brief Workspace pool utility.
  */
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc
index 7929969a8502..05c04834e78c 100644
--- a/src/schedule/bound.cc
+++ b/src/schedule/bound.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_pass.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
-#include "./message_passing.h"
+#include "graph.h"
+#include "message_passing.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/schedule/graph.cc b/src/schedule/graph.cc
index da0aeb0eccaa..d92e7730b313 100644
--- a/src/schedule/graph.cc
+++ b/src/schedule/graph.cc
@@ -8,7 +8,7 @@
 #include <tvm/operation.h>
 #include <unordered_set>
 #include <unordered_map>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc
index b13dcefb1b9f..622e0b698902 100644
--- a/src/schedule/message_passing.cc
+++ b/src/schedule/message_passing.cc
@@ -6,7 +6,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc
index e9fbcba088fe..fa26aea51a2b 100644
--- a/src/schedule/schedule_dataflow_rewrite.cc
+++ b/src/schedule/schedule_dataflow_rewrite.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index eea8aa1aae80..1490c85ff786 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 
diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc
index 6fd2496aeabe..242423695464 100644
--- a/src/schedule/schedule_ops.cc
+++ b/src/schedule/schedule_ops.cc
@@ -11,7 +11,7 @@
 #include <utility>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 #include "../op/op_util.h"
 #include "../pass/ir_util.h"
 
diff --git a/verilog/tvm_vpi.cc b/verilog/tvm_vpi.cc
index c663f7df51be..949b660ce447 100644
--- a/verilog/tvm_vpi.cc
+++ b/verilog/tvm_vpi.cc
@@ -8,7 +8,9 @@
 #include <cstdlib>
 #include <memory>
 #include <queue>
-#include "./tvm_vpi.h"
+#include <string>
+#include <vector>
+#include "tvm_vpi.h"
 #include "../src/common/pipe.h"
 
 namespace tvm {
diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index 8d0432477486..5ca2cec6575d 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./vta.h"
+#include "vta.h"
 
 void fetch(
   uint32_t insn_count,
diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h
index 6d77067be931..e58d45486282 100644
--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
@@ -11,7 +11,7 @@
 extern "C" {
 #endif
 
-#include "./driver.h"
+#include "driver.h"
 
 #define VTA_MEMCPY_H2D 1
 #define VTA_MEMCPY_D2H 2
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc
index e2630b14acde..1909ed35c562 100644
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
@@ -6,7 +6,7 @@
 
 #include <vta/driver.h>
 #include <thread>
-#include "./pynq_driver.h"
+#include "pynq_driver.h"
 
 
 void* VTAMemAlloc(size_t size, int cached) {
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index 6c6d28ec0c69..95b793ea3ba1 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -4,7 +4,7 @@
  * \brief Test library for the VTA design simulation and driver tests.
  */
 
-#include "./test_lib.h"
+#include "test_lib.h"
 
 #ifdef NO_SIM
 #ifdef VTA_TARGET_PYNQ

From cebda234234685c03d942edbee82db3f49e8a944 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 08:28:55 -0700
Subject: [PATCH 047/529] [RUNTIME][PYTHON] Switch to use __new__ for
 constructing node. (#1644)

---
 python/tvm/_ffi/_ctypes/node.py  | 18 +++++++-----------
 python/tvm/_ffi/_cython/base.pxi |  4 ++--
 python/tvm/_ffi/_cython/node.pxi | 12 ++++++------
 python/tvm/_ffi/node.py          |  9 ++++++++-
 python/tvm/target.py             | 12 +++++++-----
 5 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index 01244519532b..925aa93f8f96 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -24,7 +24,13 @@ def _return_node(x):
         handle = NodeHandle(handle)
     tindex = ctypes.c_int()
     check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex)))
-    return NODE_TYPE.get(tindex.value, NodeBase)(handle)
+    cls = NODE_TYPE.get(tindex.value, NodeBase)
+    # Avoid calling __init__ of cls, instead directly call __new__
+    # This allows child class to implement their own __init__
+    node = cls.__new__(cls)
+    node.handle = handle
+    return node
+
 
 RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node
 C_TO_PY_ARG_SWITCH[TypeCode.NODE_HANDLE] = _wrap_arg_func(
@@ -34,16 +40,6 @@ def _return_node(x):
 class NodeBase(object):
     __slots__ = ["handle"]
     # pylint: disable=no-member
-    def __init__(self, handle):
-        """Initialize the function with handle
-
-        Parameters
-        ----------
-        handle : SymbolHandle
-            the handle to the underlying C++ Symbol
-        """
-        self.handle = handle
-
     def __del__(self):
         if _LIB is not None:
             check_call(_LIB.TVMNodeFree(self.handle))
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index 00173c431bb7..ac5532835c47 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -106,8 +106,8 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
 
 cdef extern from "tvm/c_dsl_api.h":
     int TVMNodeFree(NodeHandle handle)
-    TVMNodeTypeKey2Index(const char* type_key,
-                         int* out_index)
+    int TVMNodeTypeKey2Index(const char* type_key,
+                             int* out_index)
     int TVMNodeGetTypeIndex(NodeHandle handle,
                             int* out_index)
     int TVMNodeGetAttr(NodeHandle handle,
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index a563af5237f9..1ced48878803 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -1,3 +1,4 @@
+from ... import _api_internal
 from ..base import string_types
 from ..node_generic import _set_class_node_base
 
@@ -10,6 +11,7 @@ def _register_node(int index, object cls):
         NODE_TYPE.append(None)
     NODE_TYPE[index] = cls
 
+
 cdef inline object make_ret_node(void* chandle):
     global NODE_TYPE
     cdef int tindex
@@ -20,14 +22,15 @@ cdef inline object make_ret_node(void* chandle):
     if tindex < len(node_type):
         cls = node_type[tindex]
         if cls is not None:
-            obj = cls(None)
+            obj = cls.__new__(cls)
         else:
-            obj = NodeBase(None)
+            obj = NodeBase.__new__(NodeBase)
     else:
-        obj = NodeBase(None)
+        obj = NodeBase.__new__(NodeBase)
     (<NodeBase>obj).chandle = chandle
     return obj
 
+
 cdef class NodeBase:
     cdef void* chandle
 
@@ -49,9 +52,6 @@ cdef class NodeBase:
         def __set__(self, value):
             self._set_handle(value)
 
-    def __init__(self, handle):
-        self._set_handle(handle)
-
     def __dealloc__(self):
         CALL(TVMNodeFree(self.chandle))
 
diff --git a/python/tvm/_ffi/node.py b/python/tvm/_ffi/node.py
index d9e7397ae71f..98ece19f77f2 100644
--- a/python/tvm/_ffi/node.py
+++ b/python/tvm/_ffi/node.py
@@ -21,6 +21,12 @@
     # pylint: disable=wrong-import-position
     from ._ctypes.node import _register_node, NodeBase as _NodeBase
 
+
+def _new_object(cls):
+    """Helper function for pickle"""
+    return cls.__new__(cls)
+
+
 class NodeBase(_NodeBase):
     """NodeBase is the base class of all TVM language AST object."""
     def __repr__(self):
@@ -46,7 +52,8 @@ def __ne__(self, other):
         return not self.__eq__(other)
 
     def __reduce__(self):
-        return (type(self), (None,), self.__getstate__())
+        cls = type(self)
+        return (_new_object, (cls, ), self.__getstate__())
 
     def __getstate__(self):
         handle = self.handle
diff --git a/python/tvm/target.py b/python/tvm/target.py
index 40f9e099b3a6..07200058a021 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -79,11 +79,13 @@ class Target(NodeBase):
     - :any:`tvm.target.mali` create Mali target
     - :any:`tvm.target.intel_graphics` create Intel Graphics target
     """
-    def __init__(self, handle):
-        super(Target, self).__init__(handle)
-        self._keys = None
-        self._options = None
-        self._libs = None
+    def __new__(cls):
+        # Always override new to enable class
+        obj = NodeBase.__new__(cls)
+        obj._keys = None
+        obj._options = None
+        obj._libs = None
+        return obj
 
     @property
     def keys(self):

From a438e2bcf3d81b89dcab026a19b85ab18b453ac6 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 23 Aug 2018 13:43:23 -0700
Subject: [PATCH 048/529] fix CO CI problem (#1641)

---
 tutorials/autotvm/tune_conv2d_cuda.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 375d1a9b755e..3ff26a05064d 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -64,7 +64,7 @@
 #
 
 @autotvm.template
-def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
+def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     assert N == 1, "Only consider batch_size = 1 in this template"
 
     data = tvm.placeholder((N, CI, H, W), name='data')
@@ -206,8 +206,8 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 
 np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
 
-# Evaluate running time. Here we choose a large repeat number (200) to reduce the noise
+# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
 # and the overhead of kernel launch. You can also use nvprof to validate the result.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=200)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
 print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
 

From ce67586407c3ed08dc4fd32fc539070b0b1510c1 Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Fri, 24 Aug 2018 05:44:02 +0900
Subject: [PATCH 049/529] [RUNTIME] [OPENCL] Fix access modifiers (#1643)

---
 src/runtime/opencl/opencl_common.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index c37dbaa94d7a..d42cc669e742 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -260,14 +260,12 @@ class OpenCLModuleNode : public ModuleNode {
                           const std::string& func_name,
                           const KTRefEntry& e);
 
- protected:
+ private:
   // The workspace, need to keep reference to use it in destructor.
   // In case of static destruction order problem.
   std::shared_ptr<cl::OpenCLWorkspace> workspace_;
   // the binary data
   std::string data_;
-
- private:
   // The format
   std::string fmt_;
   // function information table.

From 2a381abe7f573a45e0c2cedff911ee5ff7d303af Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Thu, 23 Aug 2018 14:57:53 -0700
Subject: [PATCH 050/529] [FIX] Fix issue with TypedPackedFunc template
 instatition (#1649)

---
 include/tvm/runtime/packed_func.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 1c873a5ebccc..b7351274a350 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -180,9 +180,7 @@ class TypedPackedFunc<R(Args...)> {
    *
    * \param packed The packed function
    */
-  explicit TypedPackedFunc(PackedFunc packed)
-      : packed_(packed) {
-  }
+  inline explicit TypedPackedFunc(PackedFunc packed);
   /*!
    * \brief construct from a lambda function with the same signature.
    *
@@ -925,6 +923,8 @@ inline PackedFunc::FType PackedFunc::body() const {
   return body_;
 }
 
+
+
 // internal namespace
 namespace detail {
 
@@ -1131,6 +1131,10 @@ struct typed_packed_call_dispatcher<void> {
 };
 }  // namespace detail
 
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(PackedFunc packed)
+  : packed_(packed) {}
+
 template<typename R, typename ...Args>
 template<typename FType>
 inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {

From f0e77dad0be0ce43e107644f3dab1dd980185aa9 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Fri, 24 Aug 2018 07:57:28 +0900
Subject: [PATCH 051/529] [NNVM][KERAS] Support multiple outputs (#1648)

---
 nnvm/python/nnvm/frontend/keras.py            | 10 ++++-----
 .../python/frontend/keras/test_forward.py     | 21 ++++++++++++++++---
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 3a0a25aa4979..bb2ad783000c 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -532,15 +532,15 @@ def from_keras(model):
                 # they are named uniquely to input_1, input_2, input_3 ... by default.
                 for pred_idx, pred in zip(node.node_indices, node.inbound_layers):
                     if isinstance(pred, keras.engine.InputLayer):
-                        _sym = symtab.get_var(pred.name, must_contain=True)
+                        sym = symtab.get_var(pred.name, must_contain=True)
                     else:
-                        _sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
-                    insym.append(_sym)
+                        sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
+                    insym.append(sym)
 
                 if len(insym) == 1:
                     insym = insym[0]
                 keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(my_idx), symtab)
 
-    outsym = symtab.get_var(model._output_layers[0].name + ':0')
+    outsym = [symtab.get_var(layer.name + ':0') for layer in model._output_layers]
     tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()}
-    return outsym, tvmparams
+    return _sym.Group(outsym), tvmparams
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 3e80c74399cc..c8c9b2c784e8 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -20,7 +20,9 @@ def verify_keras_frontend(keras_model):
     in_shapes = []
     for layer in keras_model._input_layers:
         in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
-    out_shape = [dim.value if dim.value is not None else 1 for dim in keras_model._output_layers[0].output.shape]
+    out_shapes = []
+    for layer in keras_model._output_layers:
+        out_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.output.shape))
 
     def get_keras_output(xs, dtype='float32'):
         return keras_model.predict(xs)
@@ -35,8 +37,10 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
             m.set_input(name, tvm.nd.array(x.astype(dtype)))
         m.set_input(**params)
         m.run()
-        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
-        return out.asnumpy()
+
+        out = [m.get_output(i, tvm.nd.empty(shape, dtype)).asnumpy()
+                   for i, shape in enumerate(out_shapes)]
+        return out if len(out) > 1 else out[0]
 
     xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
@@ -192,6 +196,16 @@ def test_forward_multi_inputs():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_multi_outputs():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, [x, y])
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_reuse_layers():
     # reuse conv2d
     data = keras.layers.Input(shape=(32,32,3))
@@ -230,4 +244,5 @@ def test_forward_reuse_layers():
     test_forward_mobilenet()
 
     test_forward_multi_inputs()
+    test_forward_multi_outputs()
     test_forward_reuse_layers()

From 93a1e72dd5315eed35dc2e35d1aef39a194f829e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 16:28:46 -0700
Subject: [PATCH 052/529] [PYTHON] Enable constructors in Node (#1647)

---
 python/tvm/_ffi/_ctypes/function.py           |  19 +
 python/tvm/_ffi/_ctypes/node.py               |  25 +-
 python/tvm/_ffi/_cython/function.pxi          |  44 +-
 python/tvm/_ffi/_cython/node.pxi              |  23 +
 python/tvm/_ffi/function.py                   |  18 +-
 python/tvm/api.py                             |  16 +-
 python/tvm/expr.py                            | 478 ++++++++++++++++--
 python/tvm/make.py                            |  43 +-
 python/tvm/stmt.py                            | 337 +++++++++++-
 src/api/api_ir.cc                             |   1 +
 .../python/unittest/test_lang_constructor.py  | 202 ++++++++
 11 files changed, 1084 insertions(+), 122 deletions(-)
 create mode 100644 tests/python/unittest/test_lang_constructor.py

diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py
index 79f3c6033a1f..61679f0018c0 100644
--- a/python/tvm/_ffi/_ctypes/function.py
+++ b/python/tvm/_ffi/_ctypes/function.py
@@ -17,6 +17,7 @@
 from .types import TVMPackedCFunc, TVMCFuncFinalizer
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
 from .node import NodeBase
+from . import node as _node
 
 FunctionHandle = ctypes.c_void_p
 ModuleHandle = ctypes.c_void_p
@@ -186,6 +187,23 @@ def __call__(self, *args):
         _ = args
         return RETURN_SWITCH[ret_tcode.value](ret_val)
 
+
+def __init_handle_by_constructor__(fconstructor, args):
+    """Initialize handle by constructor"""
+    temp_args = []
+    values, tcodes, num_args = _make_tvm_args(args, temp_args)
+    ret_val = TVMValue()
+    ret_tcode = ctypes.c_int()
+    check_call(_LIB.TVMFuncCall(
+        fconstructor.handle, values, tcodes, ctypes.c_int(num_args),
+        ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
+    _ = temp_args
+    _ = args
+    assert ret_tcode.value == TypeCode.NODE_HANDLE
+    handle = ret_val.v_handle
+    return handle
+
+
 def _return_module(x):
     """Return function"""
     handle = x.v_handle
@@ -202,6 +220,7 @@ def _handle_return_func(x):
 
 
 # setup return handle for function type
+_node.__init_by_constructor__ = __init_handle_by_constructor__
 RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func
 RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module
 RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False)
diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index 925aa93f8f96..eb9e930b30eb 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -1,5 +1,5 @@
 # pylint: disable=invalid-name, protected-access
-# pylint: disable=no-member, missing-docstring
+# pylint: disable=no-member, missing-docstring, not-callable
 from __future__ import absolute_import
 
 import ctypes
@@ -9,6 +9,7 @@
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
 
 NodeHandle = ctypes.c_void_p
+__init_by_constructor__ = None
 
 """Maps node type to its constructor"""
 NODE_TYPE = {}
@@ -58,4 +59,26 @@ def __getattr__(self, name):
                 "'%s' object has no attribute '%s'" % (str(type(self)), name))
         return RETURN_SWITCH[ret_type_code.value](ret_val)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        handle = __init_by_constructor__(fconstructor, args)
+        if not isinstance(handle, NodeHandle):
+            handle = NodeHandle(handle)
+        self.handle = handle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi
index 989f5b8e7b47..dcbf4c665e66 100644
--- a/python/tvm/_ffi/_cython/function.pxi
+++ b/python/tvm/_ffi/_cython/function.pxi
@@ -196,37 +196,54 @@ cdef inline object make_ret(TVMValue value, int tcode):
     raise ValueError("Unhandled type code %d" % tcode)
 
 
-cdef inline object FuncCall3(void* chandle, tuple args, int nargs):
+cdef inline int FuncCall3(void* chandle,
+                          tuple args,
+                          int nargs,
+                          TVMValue* ret_val,
+                          int* ret_tcode) except -1:
     cdef TVMValue[3] values
     cdef int[3] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     nargs = len(args)
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
 
-cdef inline object FuncCall(void* chandle, tuple args):
+cdef inline int FuncCall(void* chandle,
+                         tuple args,
+                         TVMValue* ret_val,
+                         int* ret_tcode) except -1:
     cdef int nargs
     nargs = len(args)
     if nargs <= 3:
-        return FuncCall3(chandle, args, nargs)
+        FuncCall3(chandle, args, nargs, ret_val, ret_tcode)
+        return 0
 
     cdef vector[TVMValue] values
     cdef vector[int] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     values.resize(max(nargs, 1))
     tcodes.resize(max(nargs, 1))
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
+
+
+cdef inline int ConstructorCall(void* constructor_handle,
+                                int type_code,
+                                tuple args,
+                                void** handle) except -1:
+    """Call contructor of a handle function"""
+    cdef TVMValue ret_val
+    cdef int ret_tcode
+    FuncCall(constructor_handle, args, &ret_val, &ret_tcode)
+    assert ret_tcode == type_code
+    handle[0] = ret_val.v_handle
+    return 0
 
 
 cdef class FunctionBase:
@@ -264,7 +281,10 @@ cdef class FunctionBase:
             CALL(TVMFuncFree(self.chandle))
 
     def __call__(self, *args):
-        return FuncCall(self.chandle, args)
+        cdef TVMValue ret_val
+        cdef int ret_tcode
+        FuncCall(self.chandle, args, &ret_val, &ret_tcode)
+        return make_ret(ret_val, ret_tcode)
 
 _CLASS_FUNCTION = None
 _CLASS_MODULE = None
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index 1ced48878803..c62e4ab44cef 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -65,4 +65,27 @@ cdef class NodeBase:
                 "'%s' object has no attribute '%s'" % (type(self), name))
         return make_ret(ret_val, ret_type_code)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        cdef void* chandle
+        ConstructorCall(
+            (<FunctionBase>fconstructor).chandle,
+            kNodeHandle, args, &chandle)
+        self.chandle = chandle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py
index cfda2a35f9b9..ca1812d4109a 100644
--- a/python/tvm/_ffi/function.py
+++ b/python/tvm/_ffi/function.py
@@ -262,23 +262,7 @@ def _list(name, func):
 def _get_api(f):
     flocal = f
     flocal.is_global = True
-    def my_api_func(*args):
-        """
-
-        This is a type erased API that calls into Global PackedFunc.
-        These APIs corresponds to functions registered from C++ backend
-        and can be used as developer functions.
-
-        args : list
-          The positional arguments to the function call.
-
-        Returns
-        -------
-        value : int, float, None, Node or Function
-        The result of the API function call.
-        """
-        return flocal(*args)
-    return my_api_func
+    return flocal
 
 def _init_api(namespace, target_module_name=None):
     """Initialize api for a given module name
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 75debc33db66..2bcb003ee7e5 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -134,9 +134,9 @@ def any(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.Or(args[0], args[1])
+    ret = _expr.Or(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.Or(ret, args[i])
+        ret = _expr.Or(ret, args[i])
     return ret
 
 
@@ -158,9 +158,9 @@ def all(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.And(args[0], args[1])
+    ret = _expr.And(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.And(ret, args[i])
+        ret = _expr.And(ret, args[i])
     return ret
 
 
@@ -616,7 +616,7 @@ def select(cond, t, f):
     node : Node
         The tvm.expr.Select node
     """
-    return _make.Select(convert(cond), convert(t), convert(f))
+    return _expr.Select(convert(cond), convert(t), convert(f))
 
 
 def comm_reducer(fcombine, fidentity, name="reduce"):
@@ -699,7 +699,7 @@ def _make_reduce(expr, axis, where=None):
         axis = convert(axis if isinstance(axis, (list, tuple)) else [axis])
         if where is None:
             where = convert(True)
-        outputs = tuple(_make.Reduce(combiner, expr, axis, where, i)
+        outputs = tuple(_expr.Reduce(combiner, expr, axis, where, i)
                         for i in range(size))
         return outputs[0] if size == 1 else outputs
 
@@ -751,5 +751,5 @@ def reducer(expr, axis, where=None, *args):
 _init_api("tvm.api")
 #pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum")
-min = comm_reducer(lambda x, y: _make.Min(x, y), max_value, name='min')
-max = comm_reducer(lambda x, y: _make.Max(x, y), min_value, name='max')
+min = comm_reducer(lambda x, y: _expr.Min(x, y), max_value, name='min')
+max = comm_reducer(lambda x, y: _expr.Max(x, y), min_value, name='max')
diff --git a/python/tvm/expr.py b/python/tvm/expr.py
index 8bf46b7eee62..1c1c9f82cb97 100644
--- a/python/tvm/expr.py
+++ b/python/tvm/expr.py
@@ -225,127 +225,545 @@ class LogicalExpr(Expr):
 
 @register_node("Variable")
 class Var(Expr):
-    """Symbolic variable."""
-    pass
+    """Symbolic variable.
+
+    Parameters
+    ----------
+    name : str
+        The name
+
+    dtype : int
+        The data type
+    """
+    def __init__(self, name, dtype):
+        self.__init_handle_by_constructor__(
+            _api_internal._Var, name, dtype)
+
 
 @register_node
 class Reduce(Expr):
-    pass
+    """Reduce node.
+
+    Parameters
+    ----------
+    combiner : CommReducer
+        The combiner.
+
+    src : list of Expr
+        The source expression.
+
+    rdom : list of IterVar
+        The iteration domain
+
+    condition : Expr
+        The reduce condition.
+
+    value_index : int
+        The value index.
+    """
+    def __init__(self, combiner, src, rdom, condition, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Reduce, combiner, src, rdom,
+            condition, value_index)
+
 
 @register_node
 class FloatImm(ConstExpr):
-    pass
+    """Float constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : float
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.FloatImm, dtype, value)
 
 @register_node
 class IntImm(ConstExpr):
-    pass
+    """Int constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.IntImm, dtype, value)
+
 
 @register_node
 class UIntImm(ConstExpr):
-    pass
+    """UInt constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.UIntImm, dtype, value)
+
 
 @register_node
 class StringImm(ConstExpr):
-    pass
+    """String constant.
+
+    Parameters
+    ----------
+    value : str
+        The value of the function.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.StringImm, value)
+
 
 @register_node
 class Cast(Expr):
-    pass
+    """Cast expression.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : Expr
+        The value of the function.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.Cast, dtype, value)
+
 
 @register_node
 class Add(BinaryOpExpr):
-    pass
+    """Add node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Add, a, b)
+
 
 @register_node
 class Sub(BinaryOpExpr):
-    pass
+    """Sub node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Sub, a, b)
+
 
 @register_node
 class Mul(BinaryOpExpr):
-    pass
+    """Mul node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mul, a, b)
+
 
 @register_node
 class Div(BinaryOpExpr):
-    pass
+    """Div node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Div, a, b)
+
 
 @register_node
 class Mod(BinaryOpExpr):
-    pass
+    """Mod node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mod, a, b)
+
 
 @register_node
 class Min(BinaryOpExpr):
-    pass
+    """Min node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Min, a, b)
+
 
 @register_node
 class Max(BinaryOpExpr):
-    pass
+    """Max node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Max, a, b)
+
 
 @register_node
 class EQ(CmpExpr):
-    pass
+    """EQ node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.EQ, a, b)
+
 
 @register_node
 class NE(CmpExpr):
-    pass
+    """NE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.NE, a, b)
+
 
 @register_node
 class LT(CmpExpr):
-    pass
+    """LT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LT, a, b)
+
 
 @register_node
 class LE(CmpExpr):
-    pass
+    """LE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LE, a, b)
+
 
 @register_node
 class GT(CmpExpr):
-    pass
+    """GT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GT, a, b)
+
 
 @register_node
 class GE(CmpExpr):
-    pass
+    """GE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GE, a, b)
+
 
 @register_node
 class And(LogicalExpr):
-    pass
+    """And node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.And, a, b)
+
 
 @register_node
 class Or(LogicalExpr):
-    pass
+    """Or node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Or, a, b)
+
 
 @register_node
 class Not(LogicalExpr):
-    pass
+    """Not node.
+
+    Parameters
+    ----------
+    a : Expr
+        The input value
+    """
+    def __init__(self, a):
+        self.__init_handle_by_constructor__(
+            _make.Not, a)
+
 
 @register_node
 class Select(Expr):
-    pass
+    """Select node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The condition expression.
+
+    true_value : Expr
+        The value to take when condition is true.
+
+    false_value : Expr
+        The value to take when condition is false.
+    """
+    def __init__(self, condition, true_value, false_value):
+        self.__init_handle_by_constructor__(
+            _make.Select, condition, true_value, false_value)
+
 
 @register_node
 class Load(Expr):
-    pass
+    """Load node.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type.
+
+    buffer_var : Var
+        The buffer variable in the load expression.
+
+    index : Expr
+        The index in the load.
+
+    predicate : Expr
+        The load predicate.
+    """
+    def __init__(self, dtype, buffer_var, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Load, dtype, buffer_var, index, predicate)
+
 
 @register_node
 class Ramp(Expr):
-    pass
+    """Ramp node.
+
+    Parameters
+    ----------
+    base : Expr
+        The base expression.
+
+    stride : ramp stride
+        The stride of the ramp.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, base, stride, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Ramp, base, stride, lanes)
+
 
 @register_node
 class Broadcast(Expr):
-    pass
+    """Broadcast node.
+
+    Parameters
+    ----------
+    value : Expr
+        The value of the expression.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, value, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Broadcast, value, lanes)
+
 
 @register_node
 class Shuffle(Expr):
-    pass
+    """Shuffle node.
+
+    Parameters
+    ----------
+    vectors : Array of Expr
+        The vectors
+
+    indices : Array of indices
+        The indices
+    """
+    def __init__(self, vectors, indices):
+        self.__init_handle_by_constructor__(
+            _make.Shuffle, vectors, indices)
+
 
 @register_node
 class Call(Expr):
+    """Call node.
+
+    Parameters
+    ----------
+    dtype : str
+        The return data type
+
+    name : str
+        The name of the function
+
+    args : list of Expr
+        The input arguments to the call
+
+    call_type : int
+        The type of the call
+
+    func : Operation, optional
+        Operation if call_type is Halide
+
+    value_index : int
+        The output value index
+    """
     Extern = 0
     ExternCPlusPlus = 1
     PureExtern = 2
     Halide = 3
     Intrinsic = 4
     PureIntrinsic = 5
+    def __init__(self, dtype, name, args, call_type, func, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Call, dtype, name, args, call_type, func, value_index)
 
 
 @register_node
 class Let(Expr):
-    pass
+    """Let node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value in to be binded.
+
+    body : Expr
+        The body expression.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.Let, var, value, body)
diff --git a/python/tvm/make.py b/python/tvm/make.py
index 19949509778b..6238fd7f1789 100644
--- a/python/tvm/make.py
+++ b/python/tvm/make.py
@@ -6,9 +6,10 @@
 Each api is a PackedFunc that can be called in a positional argument manner.
 You can use make function to build the IR node.
 """
+from __future__ import absolute_import as _abs
 from ._ffi.function import _init_api
 from ._ffi.runtime_ctypes import TVMType
-from . import stmt as _stmt
+
 
 def range_by_min_extent(min_value, extent):
     """Construct a Range by min and extent.
@@ -98,44 +99,4 @@ def node(type_key, **kwargs):
     return _Node(*args)
 
 
-def stmt_seq(*args):
-    """Make sequence of statements
-
-    Parameters
-    ----------
-    args : list of Expr or Var
-        List of statements to be combined as sequence.
-
-    Returns
-    -------
-    stmt : Stmt
-        The combined statement.
-    """
-    ret = None
-    for value in args:
-        if not isinstance(value, _stmt.Stmt):
-            value = Evaluate(value)
-        ret = value if ret is None else Block(ret, value)
-    return ret if ret else Evaluate(0)
-
-
-def stmt_list(stmt):
-    """Make list of stmt from blocks.
-
-    Parameters
-    ----------
-    stmt : A block statement
-
-    Returns
-    -------
-    stmt_list : list of Stmt
-         The unpacked list of statements
-    """
-    if isinstance(stmt, _stmt.Block):
-        return stmt_list(stmt.first) + stmt_list(stmt.rest)
-    elif isinstance(stmt, _stmt.ProducerConsumer):
-        return stmt_list(stmt.body)
-    return [stmt]
-
-
 _init_api("tvm.make")
diff --git a/python/tvm/stmt.py b/python/tvm/stmt.py
index 1f5fea11a472..48d91dfa8044 100644
--- a/python/tvm/stmt.py
+++ b/python/tvm/stmt.py
@@ -15,65 +15,376 @@
 """
 from __future__ import absolute_import as _abs
 from ._ffi.node import NodeBase, register_node
+from . import make as _make
+
 
 class Stmt(NodeBase):
     pass
 
 @register_node
 class LetStmt(Stmt):
-    pass
+    """LetStmt node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value in to be binded.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.LetStmt, var, value, body)
+
 
 @register_node
 class AssertStmt(Stmt):
-    pass
+    """AssertStmt node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The assert condition.
+
+    message : Expr
+        The error message.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, condition, message, body):
+        self.__init_handle_by_constructor__(
+            _make.AssertStmt, condition, message, body)
+
 
 @register_node
 class ProducerConsumer(Stmt):
-    pass
+    """ProducerConsumer node.
+
+    Parameters
+    ----------
+    func : Operation
+        The Operation.
+
+    is_producer : bool
+        Whether if the node is producer.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, func, is_producer, body):
+        self.__init_handle_by_constructor__(
+            _make.ProducerConsumer, func, is_producer, body)
+
 
 @register_node
 class For(Stmt):
+    """For node.
+
+    Parameters
+    ----------
+    loop_var : Var
+        The loop variable.
+
+    min_val : Expr
+        The begining value.
+
+    extent : Expr
+        The length of the loop.
+
+    for_type : int
+        The for type.
+
+    device_api : int
+        The device api type.
+
+    body : Stmt
+        The body statement.
+    """
     Serial = 0
     Parallel = 1
     Vectorized = 2
     Unrolled = 3
+    def __init__(self,
+                 loop_var,
+                 min_val,
+                 extent,
+                 for_type,
+                 device_api,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.For, loop_var, min_val, extent,
+            for_type, device_api, body)
+
 
 @register_node
 class Store(Stmt):
-    pass
+    """Store node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer Variable.
+
+    value : Expr
+        The value we want to store.
+
+    index : Expr
+        The index in the store expression.
+
+    predicate : Expr
+        The store predicate.
+    """
+    def __init__(self, buffer_var, value, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Store, buffer_var, value, index, predicate)
+
 
 @register_node
 class Provide(Stmt):
-    pass
+    """Provide node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    value : Expr
+        The value to be stored.
+
+    args : list of Expr
+        The index arguments of the Provide.
+    """
+    def __init__(self, func, value_index, value, args):
+        self.__init_handle_by_constructor__(
+            _make.Provide, func, value_index, value, args)
+
 
 @register_node
 class Allocate(Stmt):
-    pass
+    """Allocate node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+
+    dtype : str
+        The data type of the buffer.
+
+    extents : list of Expr
+        The extents of the allocate
+
+    condition : Expr
+        The condition.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self,
+                 buffer_var,
+                 dtype,
+                 extents,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Allocate, buffer_var, dtype,
+            extents, condition, body)
+
 
 @register_node
 class AttrStmt(Stmt):
-    pass
+    """AttrStmt node.
+
+    Parameters
+    ----------
+    node : Node
+        The node to annotate the attribute
+
+    attr_key : str
+        Attribute type key.
+
+    value : Expr
+        The value of the attribute
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, node, attr_key, value, body):
+        self.__init_handle_by_constructor__(
+            _make.AttrStmt, node, attr_key, value, body)
+
 
 @register_node
 class Free(Stmt):
-    pass
+    """Free node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+    """
+    def __init__(self, buffer_var):
+        self.__init_handle_by_constructor__(
+            _make.Free, buffer_var)
+
 
 @register_node
 class Realize(Stmt):
-    pass
+    """Realize node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type of the operation.
+
+    bounds : list of range
+        The bound of realize
+
+    condition : Expr
+        The realize condition.
+
+    body : Stmt
+        The realize body
+    """
+    def __init__(self,
+                 func,
+                 value_index,
+                 dtype,
+                 bounds,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Realize, func, value_index, dtype,
+            bounds, condition, body)
+
 
 @register_node
 class Block(Stmt):
-    pass
+    """Block node.
+
+    Parameters
+    ----------
+    first : Stmt
+        The first statement.
+
+    rest : Stmt
+        The following statement.
+    """
+    def __init__(self, first, rest):
+        self.__init_handle_by_constructor__(
+            _make.Block, first, rest)
+
 
 @register_node
 class IfThenElse(Stmt):
-    pass
+    """IfThenElse node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The expression
+
+    then_case : Stmt
+        The statement to execute if condition is true.
+
+    else_case : Stmt
+        The statement to execute if condition is false.
+    """
+    def __init__(self, condition, then_case, else_case):
+        self.__init_handle_by_constructor__(
+            _make.IfThenElse, condition, then_case, else_case)
+
 
 @register_node
 class Evaluate(Stmt):
-    pass
+    """Evaluate node.
+
+    Parameters
+    ----------
+    value : Expr
+        The expression to be evalued.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.Evaluate, value)
+
 
 @register_node
 class Prefetch(Stmt):
-    pass
+    """Prefetch node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type to be prefetched.
+
+    bounds : list of Range
+        The bounds to be prefetched.
+    """
+    def __init__(self, func, value_index, dtype, bounds):
+        self.__init_handle_by_constructor__(
+            _make.Prefetch, func, value_index, dtype, bounds)
+
+
+def stmt_seq(*args):
+    """Make sequence of statements
+
+    Parameters
+    ----------
+    args : list of Expr or Var
+        List of statements to be combined as sequence.
+
+    Returns
+    -------
+    stmt : Stmt
+        The combined statement.
+    """
+    ret = None
+    for value in args:
+        if not isinstance(value, Stmt):
+            value = Evaluate(value)
+        ret = value if ret is None else Block(ret, value)
+    return ret if ret else Evaluate(0)
+
+
+def stmt_list(stmt):
+    """Make list of stmt from blocks.
+
+    Parameters
+    ----------
+    stmt : A block statement
+
+    Returns
+    -------
+    stmt_list : list of Stmt
+         The unpacked list of statements
+    """
+    if isinstance(stmt, Block):
+        return stmt_list(stmt.first) + stmt_list(stmt.rest)
+    elif isinstance(stmt, ProducerConsumer):
+        return stmt_list(stmt.body)
+    return [stmt]
+
+
+_make.stmt_list = stmt_list
+_make.stmt_seq = stmt_seq
diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc
index bc9293c20b7a..8a65260a0f58 100644
--- a/src/api/api_ir.cc
+++ b/src/api/api_ir.cc
@@ -170,6 +170,7 @@ REGISTER_MAKE3(Select);
 REGISTER_MAKE3(Ramp);
 REGISTER_MAKE2(Cast);
 REGISTER_MAKE2(Broadcast);
+REGISTER_MAKE2(Shuffle);
 REGISTER_MAKE3(Let);
 REGISTER_MAKE3(LetStmt);
 REGISTER_MAKE3(AssertStmt);
diff --git a/tests/python/unittest/test_lang_constructor.py b/tests/python/unittest/test_lang_constructor.py
new file mode 100644
index 000000000000..caca08afa804
--- /dev/null
+++ b/tests/python/unittest/test_lang_constructor.py
@@ -0,0 +1,202 @@
+import tvm
+
+def test_expr_constructor():
+    x = tvm.expr.Var("xx", "float32")
+    assert isinstance(x, tvm.expr.Var)
+    assert x.name == "xx"
+
+    x = tvm.expr.Reduce(None, [1],
+                        [tvm.api._IterVar((0, 1), "x", 2)],
+                        None, 0)
+    assert isinstance(x, tvm.expr.Reduce)
+    assert x.combiner == None
+    assert x.value_index == 0
+
+    x = tvm.expr.FloatImm("float32", 1.0)
+    assert isinstance(x, tvm.expr.FloatImm)
+    assert x.value == 1.0
+    assert x.dtype == "float32"
+
+    x = tvm.expr.IntImm("int64", 2)
+    assert isinstance(x, tvm.expr.IntImm)
+    assert x.value == 2
+    assert x.dtype == "int64"
+
+    x = tvm.expr.UIntImm("uint16", 2)
+    assert isinstance(x, tvm.expr.UIntImm)
+    assert x.value == 2
+    assert x.dtype == "uint16"
+
+    x = tvm.expr.StringImm("xyza")
+    assert isinstance(x, tvm.expr.StringImm)
+    assert x.value == "xyza"
+
+    x = tvm.expr.Cast("float32", tvm.expr.IntImm("int32", 1))
+    assert isinstance(x, tvm.expr.Cast)
+    assert x.dtype == "float32"
+    assert x.value.value == 1
+
+    a = tvm.const(1.0, dtype="float32")
+    b = tvm.var("x", dtype="float32")
+
+    for cls in [tvm.expr.Add,
+                tvm.expr.Sub,
+                tvm.expr.Mul,
+                tvm.expr.Div,
+                tvm.expr.Mod,
+                tvm.expr.Min,
+                tvm.expr.Max,
+                tvm.expr.LT,
+                tvm.expr.LE,
+                tvm.expr.GT,
+                tvm.expr.GE]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+
+    a = tvm.convert(tvm.var("x") > 1)
+    b = tvm.convert(tvm.var("x") == 1)
+
+    for cls in [tvm.expr.And,
+                tvm.expr.Or]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+    x = tvm.expr.Not(a)
+    assert isinstance(x, tvm.expr.Not)
+    assert x.a == a
+
+    x = tvm.expr.Select(a, a, b)
+    assert isinstance(x, tvm.expr.Select)
+    assert x.true_value == a
+    assert x.false_value == b
+    assert x.condition == a
+
+    buffer_var = tvm.var("x", dtype="handle")
+    x = tvm.expr.Load("float32", buffer_var, 1, a)
+    assert isinstance(x, tvm.expr.Load)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 1
+    assert x.predicate == a
+
+    x = tvm.expr.Ramp(1, 2, 10)
+    assert isinstance(x, tvm.expr.Ramp)
+    assert x.base.value == 1
+    assert x.stride.value == 2
+    assert x.lanes == 10
+
+    x = tvm.expr.Broadcast(a, 10)
+    assert isinstance(x, tvm.expr.Broadcast)
+    assert x.value == a
+    assert x.lanes == 10
+
+    x = tvm.expr.Shuffle([a], [0])
+    assert isinstance(x, tvm.expr.Shuffle)
+    assert x.vectors[0] == a
+    assert x.indices[0].value == 0
+
+    x = tvm.expr.Call("float32", "xyz", [a], tvm.expr.Call.Extern, None, 0)
+    assert isinstance(x, tvm.expr.Call)
+    assert x.dtype == "float32"
+    assert x.name == "xyz"
+    assert x.args[0] == a
+    assert x.call_type == tvm.expr.Call.Extern
+    assert x.func == None
+    assert x.value_index == 0
+
+    v = tvm.var("aa")
+    x = tvm.expr.Let(v, 1, v)
+    assert x.var == v
+    assert x.value.value == 1
+    assert x.body == v
+
+
+def test_stmt_constructor():
+    v = tvm.var("aa")
+    buffer_var = tvm.var("buf", dtype="handle")
+    nop = tvm.stmt.Evaluate(1)
+    x = tvm.stmt.LetStmt(v, 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.LetStmt)
+    assert x.var == v
+    assert x.value.value == 1
+    assert isinstance(x.body, tvm.stmt.Evaluate)
+
+    x = tvm.stmt.AttrStmt(v == 1, "xx", 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.value.value == 1
+
+    x = tvm.stmt.Block(tvm.stmt.Evaluate(11),
+                       nop)
+    assert isinstance(x, tvm.stmt.Block)
+    assert x.first.value.value == 11
+    assert x.rest == nop
+
+    x = tvm.stmt.AssertStmt(tvm.const(1, "uint1"),
+                            tvm.convert("hellow"),
+                            nop)
+    assert isinstance(x, tvm.stmt.AssertStmt)
+    assert x.body == nop
+
+    x = tvm.stmt.ProducerConsumer(None, True, nop)
+    assert isinstance(x, tvm.stmt.ProducerConsumer)
+    assert x.body == nop
+
+    x = tvm.stmt.For(tvm.var("x"), 0, 10, 0, 0, nop)
+    assert isinstance(x, tvm.stmt.For)
+    assert x.min.value == 0
+    assert x.extent.value == 10
+    assert x.body == nop
+
+    x = tvm.stmt.Store(buffer_var, 1, 10, tvm.const(1, "uint1"))
+    assert isinstance(x, tvm.stmt.Store)
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 10
+    assert x.value.value == 1
+
+    tensor = tvm.placeholder((), dtype="float32")
+    x = tvm.stmt.Provide(tensor.op, 0, 10, [])
+    assert isinstance(x, tvm.stmt.Provide)
+    assert x.value_index == 0
+    assert x.value.value == 10
+
+    x = tvm.stmt.Allocate(buffer_var, "float32", [10],
+                          tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Allocate)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.body == nop
+
+    x = tvm.stmt.AttrStmt(buffer_var, "xyz", 1, nop)
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.node == buffer_var
+    assert x.attr_key == "xyz"
+    assert x.body == nop
+
+    x = tvm.stmt.Free(buffer_var)
+    assert isinstance(x, tvm.stmt.Free)
+    assert x.buffer_var == buffer_var
+
+    x = tvm.stmt.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Realize)
+    assert x.body == nop
+
+    x = tvm.stmt.IfThenElse(tvm.const(1, "uint1"),
+                            tvm.stmt.Evaluate(11),
+                            nop)
+    assert isinstance(x, tvm.stmt.IfThenElse)
+    assert x.then_case.value.value == 11
+    assert x.else_case == nop
+
+    x = tvm.stmt.Prefetch(None, 1, "float32", [])
+    assert isinstance(x, tvm.stmt.Prefetch)
+    assert x.value_index == 1
+
+
+if __name__ == "__main__":
+    test_expr_constructor()
+    test_stmt_constructor()

From 82943650b74f5fe4a75e46107e73a1543ac6fde8 Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Fri, 24 Aug 2018 02:49:36 +0300
Subject: [PATCH 053/529] [NNVM][TEST] Test against numerical grad (#1505)

* [NNVM][TEST] Numerical gradient testing

* [NNVM][TEST] Make some tests a little faster

* Fix the failing test_top_level3

* Target exclusion for the check_function

* Try to ignore singularities

* grad_input_vars now can't contain shapes

* Don't pass unnecessary grad_input_vars to check_function

* Multiple outputs; fixes; testing of check_function

* Use numerical_grads_params to pass parameters to numgrad checker

* Fail when no action is requested excplicitly

* Pass additional params to functions

* Silence the linter issue

* Simplified numgrad checking

* Improved docs for check_function

* Fixed the error message when no dtype is provided

* Several fixes

* Tests with shape/dtype inference for inputs

* Don't check dense's grads on cuda

* Raise an error if output dtypes haven't been inferred

* Moved shape/dtype inference into a separate function; use float32 as fallback

* Remove redundant dtype=float32

* Fix multiple outputs

* Use check_function in the rest of the test_top_level1
---
 docs/api/python/nnvm/index.rst                |   1 +
 docs/api/python/nnvm/testing.rst              |  14 +
 docs/api/python/nnvm/testing_new_ops.rst      | 135 ++++
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/check_computation.py | 641 ++++++++++++++++++
 nnvm/tests/python/compiler/test_top_level1.py | 468 +++++++------
 nnvm/tests/python/compiler/test_top_level3.py |   9 +-
 nnvm/tests/python/compiler/test_top_level4.py | 153 ++---
 8 files changed, 1095 insertions(+), 327 deletions(-)
 create mode 100644 docs/api/python/nnvm/testing.rst
 create mode 100644 docs/api/python/nnvm/testing_new_ops.rst
 create mode 100644 nnvm/python/nnvm/testing/check_computation.py

diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst
index c0e5912c76be..64447bb793fb 100644
--- a/docs/api/python/nnvm/index.rst
+++ b/docs/api/python/nnvm/index.rst
@@ -11,3 +11,4 @@ This document contains the python API to NNVM compiler toolchain.
    symbol
    graph
    top
+   testing
diff --git a/docs/api/python/nnvm/testing.rst b/docs/api/python/nnvm/testing.rst
new file mode 100644
index 000000000000..56783622648d
--- /dev/null
+++ b/docs/api/python/nnvm/testing.rst
@@ -0,0 +1,14 @@
+nnvm.testing
+------------
+
+.. automodule:: nnvm.testing
+
+.. autofunction:: nnvm.testing.ctx_list
+
+nnvm.testing.check_computation
+------------------------------
+
+.. automodule:: nnvm.testing.check_computation
+    :members:
+
+.. include:: testing_new_ops.rst
diff --git a/docs/api/python/nnvm/testing_new_ops.rst b/docs/api/python/nnvm/testing_new_ops.rst
new file mode 100644
index 000000000000..dfe7df485b78
--- /dev/null
+++ b/docs/api/python/nnvm/testing_new_ops.rst
@@ -0,0 +1,135 @@
+Testing new operations
+----------------------
+
+When adding new operations, it is a good idea to test them. Testing
+should be done with the function ``nnvm.testing.check_function``. You
+should provide it with the symbol representing the result of a
+computation and a reference numpy implementation. By default, it will
+also check analytical gradients against numerical gradients if
+analytical gradients are implemented for your operation. You can also
+pass a reference implementation for the gradients, but numerical
+gradients will still be checked. Numerical gradient checking may be
+switched off explicitly, but doing this is not a good idea generally.
+Here is an example testing the logarithm operation:
+
+.. code:: python
+
+    import numpy as np
+    import nnvm
+    import nnvm.symbol as sym
+    from nnvm.testing.check_computation import check_function
+
+    x = sym.Variable("x")
+    y = sym.log(x)
+
+    def forward(x):
+        return np.log(x)
+
+    def backward(head_grads, x):
+        return [1. / x * head_grads]
+
+    dtype = "float32"
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above, you might get an ``AssertionError`` in rare
+cases. That’s why it is recommended to run new tests a lot of times.
+
+.. code:: python
+
+    for _ in range(10000):
+        check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above then sooner or later you will get an exception
+which may look like this:
+
+.. code-block:: text
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [
+            ...
+        ]
+    numerical grad = [
+            ...
+        ]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 308.50885009765625 > 0.01*55.42562584220407 + 0.1*2167.70703125
+
+It means that either you have a mistake in the ``FGradient`` function or
+the numerical error is too high. Generally, if you look at the printed
+gradients and see that they differ only slightly or just in a single
+position, then it is a numerical error. But if the gradients look
+completely different, especially if many corresponding positions have
+different signs, then it must be something wrong with the analytical
+gradient implementation.
+
+Then try to make this error reproducible, and also try to reduce the
+shape of inputs, but not too much, a vector of 10 elements is a
+reasonable choice. Also you won’t need reference functions ``forward``
+and ``backward``, and restricting the number of targets might also be a
+good idea. Since the error may manifest itself only in rare cases, you
+might want to run it in a loop.
+
+.. code:: python
+
+    shape = {'x': (10,)}
+    np.random.seed(42)
+
+    for _ in range(1000):
+        check_function(y, in_range=(0.001, 2.0), dtype=dtype, shape=shape,
+                       numerical_grads=True, only_targets=['llvm'])
+
+Running this code will result in the following:
+
+.. code-block:: text
+
+    check_function failed while checking gradients numerically, here is the main graph
+    Graph(%x, %head_grads_0) {
+      %x, shape=[10], dtype=0
+      %head_grads_0, shape=[10], dtype=0
+      %1 = log(%x), shape=[10], dtype=0
+      %3 = elemwise_div(%head_grads_0, %x), shape=[10], dtype=0
+      ret %1, %3, %head_grads_0
+    }
+    graph_attr_keys = [layout_inputs, dtype_num_unknown_nodes, dtype, shape_num_unknown_nodes, shape]
+
+    Generated inputs:
+    {'x': array([2.5660574e-01, 1.5313280e+00, 1.0232578e-03, 8.3371508e-01,
+           1.0454979e+00, 1.1021420e-01, 1.9461832e+00, 4.5302454e-01,
+           6.0909325e-01, 6.0858107e-01], dtype=float32), 'head_grads_0': array([0.4616029 , 0.00394617, 1.4589603 , 1.9337242 , 0.44936267,
+           1.3264314 , 1.4840508 , 1.6970023 , 0.84583575, 0.60655886],
+          dtype=float32)}
+
+    ...
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [1.7988799e+00 2.5769596e-03 1.4257993e+03 2.3194065e+00 4.2980734e-01
+     1.2035031e+01 7.6254421e-01 3.7459390e+00 1.3886802e+00 9.9667716e-01]
+     numerical grad = [1.7948151e+00 1.9073486e-03 9.9268610e+02 2.3174286e+00 4.2915344e-01
+     1.1980057e+01 7.6198578e-01 3.7412643e+00 1.3866425e+00 9.9563599e-01]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 433.11322021484375 > 0.01*3.1622776601683795 + 0.1*992.7716674804688
+
+In this case the largest difference is in the 2nd position (starting
+from 0) which corresponds to input value ``1.0232578e-03``. This value
+is too close to the singularity, so the numerical derivative gets too
+imprecise. The solution is to shrink the range for ``x``, here, for
+example, ``(0.002, 2.0)`` turned out to be enough. Don’t forget to run
+lots of tests, so that other people don’t get false positives.
+
+.. code:: python
+
+    for _ in range(100):
+        check_function(y, in_range={x: (0.002, 2.0)}, dtype=dtype, shape=(1, 3, 32, 32),
+                       numerical_grads=True, only_targets=['llvm'])
+
+If you need a more precise control over which values get passed to the
+checking function, you can use ``values={x: ...}``:
+
+.. code:: python
+
+    x_val = np.array([1.2594858e+00, 1.0960974e-01, 1.4975418e+00, 6.3585603e-01,
+           1.2692513e-03, 1.0227472e+00, 9.4656967e-02, 5.5306298e-01,
+           1.4142460e+00, 1.2631655e-01], dtype=np.float32)
+    check_function(y, values={x: x_val}, dtype=dtype, shape=shape,
+                   numerical_grads=True, only_targets=['llvm'])
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index a04d2bc83587..acf37999cc15 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -13,3 +13,4 @@
 from . import dcgan
 from . import dqn
 from . import yolo2_detection
+from . import check_computation
diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
new file mode 100644
index 000000000000..a207e8eb8ceb
--- /dev/null
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -0,0 +1,641 @@
+# pylint: disable=cell-var-from-loop,no-else-return
+"""Helper utilities to check functions and their gradients."""
+from __future__ import absolute_import as _abs
+
+import logging
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime
+
+import nnvm
+from nnvm.compiler import graph_util
+from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE
+from .config import ctx_list
+
+def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None):
+    """Runs dtype and shape inference passes on a graph and returns the resulting graph
+    along with the inferred information.
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to run inference on.
+
+    shape : Dict[str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    fallback_dtype : str, optional
+        A dtype that will be used for variables whose dtype can't be inferred from other
+        variables' dtypes.
+
+    Returns
+    -------
+    graph : nnvm.graph.Graph
+        The resulting graph with dtype and shape information on its nodes.
+
+    input_shapes : Dict[str, Tuple[int]]
+        The inferred shapes of input variables merged with the `shape` dictionary.
+
+    input_dtypes : Dict[str, str]
+        The inferred dtypes of input variables merged with the `dtype` dictionary.
+
+    output_shapes : List[Tuple[int]]
+        The inferred shapes of outputs.
+
+    output_dtypes : List[str]
+        The inferred dtypes of outputs.
+    """
+    # Preprocess input parameters
+    if shape is None:
+        shape = {}
+
+    if dtype is None:
+        dtype = {}
+
+    if not isinstance(shape, dict):
+        shape = {x: shape for x in graph.symbol.list_input_variables()}
+
+    if not isinstance(dtype, dict):
+        dtype = {x: dtype for x in graph.symbol.list_input_variables()}
+
+    shape = _dict_var_to_dict_str(shape)
+    dtype = _dict_var_to_dict_str(dtype)
+
+    # The graph may already contain shape and dtype info, so extract it and merge with
+    # the user-specified shapes and dtypes (use the user-specified one on contradiction)
+    all_initial_shapes = graph.json_attr('shape')
+    all_initial_dtypes = graph.json_attr('dtype')
+
+    if all_initial_shapes:
+        for x in graph.index.input_names:
+            if x not in shape:
+                x_shape = tuple(all_initial_shapes[graph.index.entry_id(x)])
+                shape[x] = x_shape
+
+    if all_initial_dtypes:
+        for x in graph.index.input_names:
+            if x not in dtype:
+                x_dtype = TCODE_TO_DTYPE[all_initial_dtypes[graph.index.entry_id(x)]]
+                dtype[x] = x_dtype
+
+    # Perform inference
+    nnvm.compiler.graph_attr.set_shape_inputs(graph, shape)
+    nnvm.compiler.graph_attr.set_dtype_inputs(graph, dtype)
+
+    graph = graph.apply('InferShape').apply('InferType')
+
+    shapes = graph.json_attr('shape')
+    dtypes = graph.json_attr('dtype')
+
+    out_len = len(graph.symbol.list_output_names())
+
+    index = graph.index
+
+    output_shapes = \
+        [tuple(shapes[index.entry_id(index.output_entries[i])]) for i in range(out_len)]
+    output_dtypes = \
+        [TCODE_TO_DTYPE[dtypes[index.entry_id(index.output_entries[i])]] for i in range(out_len)]
+
+    # Postprocess the results
+    input_shapes = shape.copy()
+    input_dtypes = dtype.copy()
+
+    for x in graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_node_id = graph.index.node_id(x_name)
+        input_shapes[x_name] = tuple(shapes[x_node_id])
+        input_dtypes[x_name] = TCODE_TO_DTYPE[dtypes[x_node_id]]
+
+    # Merge the original user-specified shapes in case some of them are specified for non-existing
+    # variables
+    for x_name, x_shape in shape.items():
+        x_shape = tuple(x_shape)
+        if input_shapes.get(x_name, x_shape) != x_shape:
+            raise RuntimeError("Inferred shape differs from the provided shape.\n"
+                               "Provided shapes: {}\nInferred shapes: {}"
+                               .format(shapes, input_shapes))
+        else:
+            input_shapes[x_name] = x_shape
+
+    # Merge the original user-specified dtypes
+    for x_name, x_dtype in dtype.items():
+        if not isinstance(x_dtype, str):
+            x_dtype = TCODE_TO_DTYPE[x_dtype]
+        if input_dtypes.get(x_name, x_dtype) != x_dtype:
+            raise RuntimeError("Inferred dtype differs from the provided dtype.\n"
+                               "Provided dtypes: {}\nInferred dtypes: {}"
+                               .format(dtypes, input_dtypes))
+        else:
+            input_dtypes[x_name] = x_dtype
+
+    # If some dtypes weren't inferred and there is a fallback dtype, assign it to those varibles
+    # and repeat the inference
+    if fallback_dtype is not None and not all(input_dtypes.values()):
+        input_dtypes = {x: input_dtypes[x] if input_dtypes[x] else fallback_dtype
+                        for x in input_dtypes}
+        return infer_shapes_dtypes(graph, input_shapes, input_dtypes, fallback_dtype=None)
+
+    return graph, input_shapes, input_dtypes, output_shapes, output_dtypes
+
+def graph_to_function(graph, target, ctx, shape=None, dtype=None):
+    """Convert a graph to a function taking a keyword args and returning a list of results
+    (both args and results are numpy arrays).
+
+    Example::
+
+        fun = graph_to_function(graph, llvm, cpu(0))
+        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to convert to a function.
+
+    target : str or :any:`tvm.target.Target`
+        The build target
+
+    ctx : TVMContext
+        The context to deploy the module.
+
+    shape : Dict[str, Tuple[int]], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    Returns
+    -------
+    function : Callable[..., List[numpy.ndarray]]
+    """
+    # Infer missing shapes and dtypes
+    graph, shape, dtype, output_shapes, output_dtypes = \
+        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)
+
+    if None in dtype.values():
+        raise ValueError("Input variables with no type: {}".format(dtype))
+
+    if not all(shape.values()):
+        raise ValueError("Input variables with no shape: {}".format(shape))
+
+    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
+    module = graph_runtime.create(compute_graph, lib, ctx)
+
+    if params:
+        module.set_inputs(**params)
+
+    def run(**kwargs):
+        module.run(**kwargs)
+        res = []
+        for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
+            res.append(module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
+        return res
+
+    return run
+
+def _dict_var_to_dict_str(dictionary):
+    """Convert a Dict[nnvm.Symbol, T] to Dict[str, T]"""
+    if isinstance(dictionary, dict):
+        return {s.attr('name') if isinstance(s, nnvm.symbol.Symbol) else s:
+                dictionary[s] for s in dictionary}
+    else:
+        return dictionary
+
+def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
+                   shape=None, dtype=None, in_range=None, values=None,
+                   exclude_targets=None, only_targets=None,
+                   additional_params=None,
+                   numerical_grads=None, numerical_grads_params=None,
+                   atol=1e-5, rtol=1e-5, quiet=False):
+    """Compute the function and/or its gradients on a random input and raise
+    an exception if the result doesn't match the reference implementation.
+
+    Parameters
+    ----------
+    symbol : nnvm.Symbol
+        A symbol representing the output.
+
+    forward : Callable[..., List[numpy.ndarray]], optional
+        A reference implementation to compare with.
+
+    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
+        A reference implementation of gradients. Should also accept head_grads besides
+        normal inputs which is a list of gradients of some scalar wrt the outputs or just a
+        single gradient if there are multiple outputs.
+        Should return either a dict mapping input variable names to the respective
+        gradients or a list of gradients wrt variables from grad_input_vars in
+        exactly the same order (in alphabetical order by default).
+
+    grad_input_vars : List[nnvm.Symbol or str], optional
+        A list of variables with respect to which the gradients will be computed.
+        None (default) means that all input variables will be used in an alphabetical order.
+
+    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes, or just a single shape.
+        By default shapes will be inferred from variables' attributes (see the Examples).
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[nnvm.Symbol or str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes (see the Examples).
+        If dtypes cannot be inferred for some variables then float32 will be used as a fallback.
+        Note that this parameter takes precedence over variables' attributes.
+
+    in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional
+        A dict mapping input variable names to ranges or just a single range
+        (the same for all variables). Input values will be generated from
+        uniform distributions on these ranges. `head_grads` can also be
+        assigned a range this way.
+
+    values : Dict[nnvm.Symbol or str, numpy.ndarray], optional
+        A dict explicitly providing values for some variables instead of random generation.
+
+    exclude_targets : Set[str], optional
+        Skip compiling and running anything for these targets.
+
+    only_targets : Set[str], optional
+        Test only for those targets from `ctx_list()` that are also in this set.
+
+    additional_params : dict, optional
+        A dict of additional parameters which will be passed to forward and backward.
+
+    numerical_grads : bool or 'if_possible', optional
+        Whether to additionally check against numerically computed gradients. If 'if_possible' or
+        None is passed (which is the default) then it will try to create a gradient computation
+        graph and then check gradients numerically only if this graph can be created (i.e. if there
+        are some operations with unimplemented gradients, it will just issue a warning).
+        Checking against numerical gradients is done via the `check_numerical_grads` function.
+
+    numerical_grads_params : dict, optional
+        Additional parameters for `check_numerical_grads`.
+
+    atol : float, optional
+        Absolute tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+
+    rtol : float, optional
+        Relative tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+
+    quiet : bool, optional
+        Don't dump additional information to stdout on failure.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        x = sym.Variable("x", shape=(1, 2))
+        y = sym.Variable("y", shape=(1, 2))
+
+        # check the function and its gradients both numerically and using a reference function
+        check_function(x + 2*y,
+                       lambda x, y: x + 2*y,
+                       lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads})
+
+        # just check gradients numerically
+        check_function(x + 2*y, numerical_grads=True)
+
+        # just check the forward computation
+        check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False)
+
+        # specifying dtype
+        check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64')
+
+        # dtypes can also be specified during variable creation with dtype codes
+        x = sym.Variable("x", dtype=0)
+        check_function(x + 1, shape=(2, 2), numerical_grads=True)
+    """
+    # validate and preprocess the input params
+    if numerical_grads is None and forward is None and backward is None:
+        raise ValueError("No reference function was passed to check_function. If you only want to "
+                         "check gradients numerically, pass numerical_grads=True explicitly.")
+
+    if numerical_grads is None:
+        numerical_grads = 'if_possible'
+
+    if numerical_grads not in [False, True, 'if_possible']:
+        raise ValueError("numerical_grads must be a bool or 'if_possible', not {}"
+                         .format(numerical_grads))
+
+    if additional_params is None:
+        additional_params = {}
+
+    input_vars = symbol.list_input_variables()
+    input_dict = {x.attr('name'): x for x in input_vars}
+
+    if grad_input_vars is None:
+        grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name'))
+    else:
+        grad_input_vars = [input_dict[x] if isinstance(x, str) else x for x in grad_input_vars]
+
+    in_range = _dict_var_to_dict_str(in_range)
+    values = _dict_var_to_dict_str(values)
+
+    out_len = len(symbol.list_output_names())
+
+    # Infer the output shapes and dtypes, and preprocess the shape and dtype params
+    forward_graph, shape, dtype, out_shapes, out_dtypes = \
+        infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype,
+                            fallback_dtype='float32')
+
+    if not all(out_shapes) or not all(out_dtypes):
+        if not quiet:
+            print(forward_graph.ir(join_node_attrs=['shape', 'dtype']))
+        raise ValueError("Could not infer shapes or dtypes for outputs.\n"
+                         "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes))
+
+    backward_graph = None
+
+    # If we want gradients, we have to recreate the graph, but now with gradient computations
+    # Note that here we need out_shapes for defining the shape of head grads, so we have to
+    # create the graph twice
+    if backward is not None or numerical_grads:
+        try:
+            head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i),
+                                                       shape=out_shapes[i],
+                                                       dtype=DTYPE_TO_TCODE[out_dtypes[i]])
+                                  for i in range(out_len)]
+            grad_symbols = graph_util.gradients([symbol], grad_input_vars,
+                                                grad_ys=head_grads_symbols)
+            # Sometimes grads do not depend on head_grads, so head_grads does not appear
+            # in the variable list; adding it manually prevents this, making things a bit easier
+            backward_graph = \
+                nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols + head_grads_symbols))
+
+            backward_graph, shape, dtype, out_shapes, out_dtypes = \
+                infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype,
+                                    fallback_dtype='float32')
+        except nnvm._base.NNVMError as err:
+            if backward is None and numerical_grads == "if_possible":
+                logging.warning("Won't check gradients because: %s", str(err).split('\n', 1)[0])
+                numerical_grads = False
+                backward_graph = None
+            else:
+                raise
+
+    main_graph = backward_graph if backward_graph is not None else forward_graph
+
+    # Generate random data for inputs (including head_grads)
+
+    np_inputs = {}
+
+    for x in main_graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_shape = shape[x_name]
+        x_dtype = dtype[x_name]
+
+        if values is not None and x_name in values:
+            np_inputs[x_name] = values[x_name].astype(x_dtype)
+            continue
+
+        low = -1.0
+        high = 1.0
+        if in_range is not None:
+            if isinstance(in_range, dict):
+                if x_name in in_range:
+                    low = in_range[x_name][0]
+                    high = in_range[x_name][1]
+            else:
+                low = in_range[0]
+                high = in_range[1]
+
+        np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype)
+
+    np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs
+                                    if not k.startswith('head_grads_')}
+
+    nothing_was_done = True
+
+    # Compute and compare the results
+    for target, ctx in ctx_list():
+        if exclude_targets is not None:
+            if target in exclude_targets or str(target) in exclude_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+        if only_targets is not None:
+            if target not in only_targets and str(target) not in only_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+
+        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)
+
+        debug_stage = None
+
+        try:
+            nnvm_res = None
+
+            debug_stage = "compiling"
+            main_function = graph_to_function(main_graph, target, ctx)
+
+            # nnvm_res contains the output and gradients (if they are needed)
+            debug_stage = "running"
+            nnvm_res = main_function(**np_inputs)
+
+            if backward_graph is not None:
+                grad_var_names = [x.attr('name') for x in grad_input_vars]
+                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}
+
+            if forward is not None:
+                nothing_was_done = False
+                debug_stage = "checking forward computation"
+                logging.debug(debug_stage)
+
+                params = {}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_res = forward(**params)
+
+                if isinstance(numpy_res, tuple):
+                    numpy_res = list(numpy_res)
+
+                if not isinstance(numpy_res, list):
+                    numpy_res = [numpy_res]
+
+                if len(numpy_res) != out_len:
+                    raise ValueError("Forward function returned {} values, but "
+                                     "the nnvm graph returns {} values"
+                                     .format(len(numpy_res), out_len))
+
+                for i in range(out_len):
+                    np.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
+
+            if backward is not None:
+                nothing_was_done = False
+                debug_stage = "checking gradients"
+                logging.debug(debug_stage)
+
+                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]
+
+                if out_len == 1:
+                    np_head_grads = np_head_grads[0]
+
+                params = {'head_grads': np_head_grads}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_grads = backward(**params)
+
+                if not isinstance(numpy_grads, dict):
+                    if isinstance(numpy_grads, tuple):
+                        numpy_grads = list(numpy_grads)
+                    if not isinstance(numpy_grads, list):
+                        numpy_grads = [numpy_grads]
+                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
+                    if len(numpy_grads) != len(grad_var_names):
+                        raise ValueError("The backward function returns a list of gradients which "
+                                         "does not contain gradients for these variables: {}"
+                                         .format(set(grad_var_names) - set(numpy_grads)))
+
+                for x_name in numpy_grads:
+                    np.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
+                                               atol=atol, rtol=rtol)
+
+            if numerical_grads:
+                nothing_was_done = False
+                debug_stage = "checking gradients numerically"
+                logging.debug(debug_stage)
+
+                forward_function = graph_to_function(forward_graph, target, ctx)
+
+                # Since the result may be non-scalar, we have to put another operation on the top,
+                # so we just multiple by the randomly generated head_grads and then sum everything.
+                # This way we can reuse the gradient values which has been already computed.
+                def scalar_function(**kwargs):
+                    res = forward_function(**kwargs)
+                    return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel())
+                                   for i in range(out_len)])
+
+                if numerical_grads_params is None:
+                    numerical_grads_params = {}
+
+                check_numerical_grads(
+                    scalar_function,
+                    input_values=np_inputs_without_head_grads,
+                    grad_values=nnvm_grads,
+                    **numerical_grads_params)
+
+        except:
+            if not quiet:
+                print("\ncheck_function failed while {}, here is the main graph"
+                      .format(debug_stage))
+                print(main_graph.ir(join_node_attrs=['shape', 'dtype']))
+                if nnvm_res is not None:
+                    print("Generated inputs:")
+                    print(np_inputs)
+                    print()
+            raise
+
+    if nothing_was_done:
+        logging.warning("Nothing was done in check_function. Check ctx_list().")
+
+
+def check_numerical_grads(function, input_values, grad_values, function_value=None,
+                          delta=1e-3, atol=1e-2, rtol=0.1):
+    """A helper function that checks that numerical gradients of a function are equal to
+    gradients computed in some different way (analytical gradients).
+
+    Numerical gradients are computed using finite difference approximation. To reduce the number of
+    function evaluations, the number of points used is gradually increased if the error value is
+    too high (up to 5 points).
+
+    Parameters
+    ----------
+    function
+        A function that takes inputs as keyword arguments (like `function(**input_values)`) and
+        returns a scalar result. Should accept numpy ndarrays.
+
+    input_values : Dict[str, numpy.ndarray]
+        A dict assigning values to variables. Represents the point at which gradients should be
+        computed.
+
+    grad_values : Dict[str, numpy.ndarray]
+        Gradients computed using a different method.
+
+    function_value : float, optional
+        Should be equal to `function(**input_values)`.
+
+    delta : float, optional
+        A small number used for numerical computation of partial derivatives. The default 1e-3 is a
+        good choice for float32.
+
+    atol : float, optional
+        Absolute tolerance.
+
+    rtol : float, optional
+        Relative tolerance.
+    """
+
+    if function_value is None:
+        function_value = function(**input_values)
+
+    # a helper to modify j-th element of val by a_delta
+    def modify(val, j, a_delta):
+        val = val.copy()
+        val.reshape(-1)[j] = val.reshape(-1)[j] + a_delta
+        return val
+
+    # numerically compute a partial derivative with respect to j-th element of the var `name`
+    def derivative(x_name, j, a_delta):
+        modified_values = {n: modify(val, j, a_delta) if n == x_name else val
+                           for n, val in input_values.items()}
+        return (function(**modified_values) - function_value)/a_delta
+
+    def compare_derivative(j, n_der, grad):
+        der = grad.reshape(-1)[j]
+        return np.abs(n_der - der) < atol + rtol*np.abs(n_der)
+
+    for x_name, grad in grad_values.items():
+        if grad.shape != input_values[x_name].shape:
+            raise AssertionError(
+                "Gradient wrt '{}' has unexpected shape {}, expected {} "
+                .format(x_name, grad.shape, input_values[x_name].shape))
+
+        ngrad = np.zeros_like(grad)
+
+        # compute partial derivatives for each position in this variable
+        for j in range(np.prod(grad.shape)):
+            # forward difference approximation
+            nder = derivative(x_name, j, delta)
+
+            # if the derivative is not equal to the analytical one, try to use more
+            # precise and expensive methods
+            if not compare_derivative(j, nder, grad):
+                # central difference approximation
+                nder = (derivative(x_name, j, -delta) + nder)/2
+
+                if not compare_derivative(j, nder, grad):
+                    # central difference approximation using h = delta/2
+                    cnder2 = (derivative(x_name, j, delta/2) + derivative(x_name, j, -delta/2))/2
+                    # five-point derivative
+                    nder = (4*cnder2 - nder)/3
+
+            ngrad.reshape(-1)[j] = nder
+
+        dist = np.sqrt(np.sum((ngrad - grad)**2))
+        grad_norm = np.sqrt(np.sum(ngrad**2))
+
+        # we multiple atol by this number to make it more universal for different sizes
+        sqrt_n = np.sqrt(float(np.prod(grad.shape)))
+
+        if dist > atol*sqrt_n + rtol*grad_norm:
+            raise AssertionError(
+                "Analytical and numerical grads wrt {} differ too much\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                "distance > atol*sqrt(n) + rtol*grad_norm\n"
+                "distance {} > {}*{} + {}*{}"
+                .format(x_name, grad, ngrad,
+                        dist, atol, sqrt_n, rtol, grad_norm))
+
+        max_diff = np.max(np.abs(ngrad - grad))
+        avg_diff = np.mean(np.abs(ngrad - grad))
+        logging.info("Numerical grad test wrt %s of shape %s passes, "
+                     "dist = %f, max_diff = %f, avg_diff = %f",
+                     x_name, grad.shape, dist, max_diff, avg_diff)
diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py
index d9c6655fea1d..ba6280dd9b14 100644
--- a/nnvm/tests/python/compiler/test_top_level1.py
+++ b/nnvm/tests/python/compiler/test_top_level1.py
@@ -5,49 +5,162 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
+from nnvm.testing.check_computation import check_function
 
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True,
-           rnd_min=-1, rnd_max=1):
-    ishapes = {}
-    itypes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        itypes.update({name: dtype})
-        np_inputs.update({name: np.random.uniform(rnd_min, rnd_max, size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, itypes)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
+def test_check_function():
+    # test the testing function
 
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+
+    # different styles of returning gradients from the backward function
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [head_grads, 2*head_grads],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: (head_grads, 2*head_grads),
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [2*head_grads],
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float64')
+
+    # test just numerical gradients
+    # different styles of shape and dtype passing
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype='float32',
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype={x: 'float32', 'y': 'float32'},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape=(1, 2), dtype='float32',
+                   numerical_grads=True)
+
+    # specifying variable attributes on variable creation
+    # (in this case type codes must be used)
+    x = sym.Variable("x", dtype=0, shape=(1, 2))
+    check_function(x + 2*y, shape={y: (1, 2)}, dtype={'y': 'float32'}, numerical_grads=True)
+    y = sym.Variable("y", dtype=0, shape=(1, 2))
+
+    # shape overriding
+    def _fwd1(x, y):
+        assert x.shape == (1, 1)
+        assert y.shape == (1, 2)
+        return x + 2*y
+    check_function(x + 2*y, _fwd1, shape={x: (1, 1)})
+
+    # in_range
+    def _fwd2(x, y):
+        assert x.shape == (100,)
+        assert (x <= 0.9).all()
+        assert (x >= 0.8).all()
+        return x + 2*y
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range=(0.8, 0.9), numerical_grads=False)
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range={'x': (0.8, 0.9)}, numerical_grads=False)
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   in_range={'head_grads_0': (1.0, 1.0)})
+    # explicit passing of values
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   values={'head_grads_0': np.full((1, 2), 1.0)})
+
+    # check that the function reports errors
+    def _check_function_must_fail(*args, **kwargs):
+        error = AssertionError
+        if 'error' in kwargs:
+            error = kwargs['error']
+            del kwargs['error']
+        try:
+            check_function(*args, quiet=True, **kwargs)
+        except error:
+            pass
+        else:
+            raise AssertionError("check_function didn't raise an exception")
+
+    _check_function_must_fail(x + 2*y, error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: x + y)
+    _check_function_must_fail(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0])
+    _check_function_must_fail(sym.block_grad(x + 2*y), numerical_grads=True)
+    _check_function_must_fail(x*x, numerical_grads=True,
+                              numerical_grads_params={'atol': 0.0, 'rtol': 0.0})
+
+    # different styles of returning results from the forward function
+    check_function(x + 2*y, lambda x, y: [x + 2*y], numerical_grads=False)
+    _check_function_must_fail(x + 2*y, lambda x, y: [x + 2*y, x], numerical_grads=False,
+                              error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: [], numerical_grads=False,
+                              error=ValueError)
+
+    # multiple outputs
+    z = sym.Group([2*x + y, x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y])
+    check_function(z, lambda x, y: (2*x + y, x + 2*y))
+    check_function(z, backward=lambda x, y, head_grads: [2*head_grads[0] + head_grads[1],
+                                                         head_grads[0] + 2*head_grads[1]])
+    _check_function_must_fail(z, backward=lambda x, y, head_grads: [2*head_grads[0],
+                                                                    2*head_grads[1]])
+    check_function(z, backward=lambda x, y, head_grads: [head_grads[1], 2*head_grads[1]],
+                   in_range={'head_grads_0': (0, 0)})
+    check_function(z, numerical_grads=True)
+
+    z = sym.Group([sym.block_grad(2*x + y), x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y], numerical_grads=False)
+    _check_function_must_fail(z, lambda x, y: [2*x + y, x + 2*y])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, sym.block_grad(x + 2*y)])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, x + 2*y, x, y, sym.sum(x)])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y, x, y, np.sum(x)])
+
+    # passing additional parameters to forward and backward
+    def _fwd3(x, p):
+        assert p == 'v'
+        return x + 1
+    def _bwd3(x, p, head_grads):
+        assert p == 'v'
+        return head_grads
+    check_function(x + 1, _fwd3, _bwd3, additional_params={'p': 'v'})
+
+    # implicitly created variables and shape/dtype inference for inputs
+    x = sym.Variable("x", shape=(2, 3), dtype=0)
+    b = sym.Variable("b")
+    y = sym.dense(data=x, bias=b, units=4)
+    # Don't check gradients on cuda because is doesn't yet support ewise after reduce
+    check_function(y, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, shape={'x': (3, 4)}, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, dtype={'x': 'float64'}, exclude_targets={'cuda'}, numerical_grads=True)
+
+    x = sym.Variable("x")
+    b = sym.Variable("b")
+    w = sym.Variable("w")
+    y = sym.dense(data=x, bias=b, weight=w, units=4)
+    def _fwd_dense(x, w, b):
+        return np.dot(x, w.T) + b
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'x': 'float32'}, numerical_grads=False)
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'w': 'float64'}, numerical_grads=False)
+    _check_function_must_fail(y, _fwd_dense, shape={'x': (1,2)},
+                              dtype={'w': 'float64', 'b': 'float32'},
+                              numerical_grads=False,
+                              error=nnvm._base.NNVMError)
+    # fails because no shape
+    _check_function_must_fail(y, _fwd_dense, numerical_grads=False, error=ValueError)
+    # ok because type is float32 by default
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, numerical_grads=False)
 
 def test_relu():
     x = sym.Variable("x")
@@ -62,10 +175,8 @@ def backward(head_grads, x):
         return [(sub > 0).astype("float") * \
                 ((x > 0).astype("float") + 0.3 * (x < 0).astype("float")) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 def test_prelu_nchw():
     x = sym.Variable("x")
@@ -75,15 +186,8 @@ def test_prelu_nchw():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(3, 1, 1)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 3, 32, 32)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 32, 32), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_prelu_nhwc():
     x = sym.Variable("x")
@@ -93,17 +197,8 @@ def test_prelu_nhwc():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(1, 1, 3)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 32, 32, 3)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-
-
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 32, 32, 3), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_sym_scalar_pow():
     scalar = 3
@@ -116,10 +211,8 @@ def forward(x):
     def backward(head_grads, x):
         return [scalar * x**(scalar -  1) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_scalar_sym_pow():
@@ -133,10 +226,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.log(scalar) * scalar**x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_exp():
@@ -149,10 +240,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.exp(x) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_log():
@@ -165,10 +254,8 @@ def forward(x):
     def backward(head_grads, x):
         return [1. / x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward, rnd_min=0.001)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.002, 2.0), shape=shape)
 
 
 def test_tanh():
@@ -182,10 +269,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [(1 - y_np**2) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_sigmoid():
@@ -199,10 +284,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [y_np *(1 - y_np) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_softmax():
@@ -217,10 +300,10 @@ def backward(head_grads, x):
         grad = y * (head_grads - np.sum(y * head_grads, axis=1, keepdims=True))
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_log_softmax():
@@ -235,10 +318,10 @@ def backward(head_grads, x):
         grad = head_grads - np.exp(y) * np.sum(head_grads, axis=1, keepdims=True)
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_dense():
@@ -250,13 +333,16 @@ def test_dense():
 
     def forward(x, dense_weight, dense_bias):
         return np.dot(x, dense_weight.T) + dense_bias
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 100), x),
-        ('dense_weight', (3, 100), w),
-        ('dense_bias', (3,), b)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {
+        'x': (10, 100),
+        'w': (3, 100),
+        'b': (3,)
+    }
+    # Don't check gradients on cuda because is doesn't yet support ewise after reduce
+    check_function(y, forward, shape=shape,
+                   exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, forward, shape=shape,
+                   only_targets={'cuda'}, numerical_grads=False)
 
 
 def test_batchnorm():
@@ -272,35 +358,25 @@ def test_batchnorm():
     def forward(x, gamma, beta, moving_mean, moving_var):
         return (x - moving_mean) / np.sqrt(moving_var + eps) * gamma + beta
 
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 20), x),
-        ('gamma', (20,), gamma),
-        ('beta', (20,), beta),
-        ('moving_mean', (20,), moving_var),
-        ('moving_var', (20,), moving_mean)
-    ]
+    shape = {
+        'x': (10, 20),
+        'gamma': (20,),
+        'beta': (20,),
+        'moving_mean': (20,),
+        'moving_var': (20,)
+    }
 
-    helper(y, inputs,  dtype, forward, rnd_min=0.001)
+    check_function(y, forward, in_range=(0.001, 1.0), shape=shape)
 
 
 def verify_concatenate(ishape, axis):
-    x = [sym.Variable("x%d" % i) for i in range(len(ishape))]
+    x = [sym.Variable("x%d" % i, shape=ishape[i]) for i in range(len(ishape))]
     y = sym.concatenate(*x, axis=axis) + 1
-    dtype = "float32"
-    for target, ctx in ctx_list():
-        # set input
-        data = []
-        for i, shape in enumerate(ishape):
-            data.append(np.random.uniform(size=shape).astype(dtype))
-        pdict = {"x%d" % i :  v for i, v in enumerate(data)}
-        shape = {"x%d" % i :  v.shape for i, v in enumerate(data)}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**pdict)
-        out_np = np.concatenate(data, axis=axis) + 1
-        out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(**kwargs):
+        return np.concatenate(list(kwargs.values()), axis=axis) + 1
+
+    check_function(y, forward)
 
 
 def test_concatenate():
@@ -309,19 +385,13 @@ def test_concatenate():
 
 
 def verify_split(ishape, indices_or_sections, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.split(x, indices_or_sections=indices_or_sections, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
-    res = np.split(x_np, indices_or_sections, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        for i, arr  in enumerate(res):
-            out = m.get_output(i, tvm.nd.empty(arr.shape))
-            np.testing.assert_allclose(out.asnumpy(), arr, atol=1e-5, rtol=1e-5)
+
+    def forward(x):
+        return np.split(x, indices_or_sections, axis=axis)
+
+    check_function(y, forward)
 
 
 def test_split():
@@ -331,28 +401,22 @@ def test_split():
 
 def verify_strided_slice(ishape, begin, end, strideinp=None):
     stride = strideinp if strideinp else [1, 1, 1]
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     if strideinp:
         y = sym.strided_slice(x, begin = begin, end = end, stride = stride) + 1
     else:
         y = sym.strided_slice(x, begin = begin, end = end) + 1
-    x_np = np.random.uniform(size=ishape).astype("float32")
+
     for i in range(len(begin), 3):
         begin.append(0)
     for i in range(len(end), 3):
         end.append(ishape[i])
-    def test_forward(x, begin, end, stride):
+
+    def test_forward(x):
         return x[begin[0]:end[0]:stride[0],
                     begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
 
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        res = test_forward(x_np, begin, end, stride)
-        out = m.get_output(0, tvm.nd.empty(res.shape))
-        np.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
+    check_function(y, test_forward)
 
 def test_strided_slice():
     verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
@@ -369,24 +433,18 @@ def verify_take(src_shape, indices_src, axis=None):
     src_dtype = "float32"
     indices_dtype = "int32"
     indices_src = np.array(indices_src, dtype=indices_dtype)
-    a = sym.Variable("a")
-    indices = sym.Variable("indices")
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
     y = sym.take(a, indices, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        shape_dict = {"a":src_shape, "indices":indices_src.shape}
-        type_dict = {"a":src_dtype, "indices":indices_dtype}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape=shape_dict, dtype=type_dict)
-        m = graph_runtime.create(graph, lib, ctx)
-
-        shape_size = 1
-        for i in range(len(src_shape)):
-            shape_size = shape_size * src_shape[i]
-        a_src = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
-        out_np = np.take(a_src, indices_src, axis=axis)
-        m.run(a=a_src, indices=indices_src)
-        out = m.get_output(0, tvm.nd.empty(out_np.shape, dtype=src_dtype))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(a, indices):
+        return np.take(a, indices=indices, axis=axis)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src, 'indices': indices_src})
 
 def test_take():
     verify_take((4,), [1])
@@ -399,9 +457,9 @@ def test_take():
     verify_take((4,3,5,6), [[2,1,0,0]], -2)
 
 
-def verify_squeeze(dshape, axis):
+def verify_squeeze(shape, axis):
     x = sym.Variable("x")
-    if axis:
+    if axis is not None:
         y = sym.squeeze(x, axis=axis)
     else:
         y = sym.squeeze(x)
@@ -413,9 +471,7 @@ def forward(x):
     def backward(head_grads, x):
         return [np.reshape(head_grads, x.shape)]
 
-    dtype = "float32"
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_squeeze():
@@ -433,61 +489,40 @@ def forward(x):
                       pad_width=((0, 0), (0, 0), (0, 1), (2, 3)),
                       mode='constant', constant_values=1.)
 
-    dtype = "float32"
-    inputs = [('x', (1, 3, 28, 28), x)]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 28, 28)}
+    check_function(y, forward, shape=shape)
 
 def verify_lrn(ishape, size, axis, bias, alpha, beta):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.lrn_python(x, size, axis, bias, alpha, beta)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking LRN op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def verify_l2_normalize(ishape, eps, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.l2_normalize(x, eps=eps, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.l2_normalize_python(x, eps, axis)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking L2 normalization op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def test_lrn():
     verify_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -498,6 +533,7 @@ def test_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2))
 
 if __name__ == "__main__":
+    test_check_function()
     test_split()
     test_concatenate()
     test_log_softmax()
diff --git a/nnvm/tests/python/compiler/test_top_level3.py b/nnvm/tests/python/compiler/test_top_level3.py
index c8bd37c38e5b..11af2d0bc9c4 100644
--- a/nnvm/tests/python/compiler/test_top_level3.py
+++ b/nnvm/tests/python/compiler/test_top_level3.py
@@ -5,15 +5,14 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-from test_top_level1 import helper
+from nnvm.testing.check_computation import check_function
 
 def check_map(symfunc, np_func, np_backward=None, dtype="float32", rnd_min=-1, rnd_max=1):
     x = sym.Variable("x")
     y = symfunc(x)
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, lambda x: np_func(x), np_backward,
-           rnd_min=rnd_min, rnd_max=rnd_max)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, lambda x: np_func(x), np_backward,
+                   dtype=dtype, shape=shape, in_range=(rnd_min, rnd_max))
 
 
 def test_floor():
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 5bf134b49a7b..50ce1571e1a8 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -6,52 +6,7 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-
-
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True, in_range={}):
-    ishapes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        if name in in_range:
-            np_inputs.update({name: np.random.uniform(size=shape,
-                                                      low=in_range[name][0],
-                                                      high=in_range[name][1]).astype(dtype)})
-        else:
-            np_inputs.update({name: np.random.uniform(size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, dtype=dtype)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
-
+from nnvm.testing.check_computation import check_function
 
 def verify_transpose(dshape, axes):
     x = sym.Variable("x")
@@ -228,93 +183,92 @@ def backward(head_grads, x):
         mask2 = np.less_equal(x, a_max).astype("float")
         return [head_grads * mask1 * mask2]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_broadcast():
     a = sym.Variable("a")
     b = sym.Variable("b")
-    inputs = [('a', (3, 4, 5), a),
-              ('b', (1, 5), b)]
-    dtype = "float32"
+    shape = {'a': (3, 4, 5), 'b': (1, 5)}
 
     def _collapse(g):
-        return g.reshape(-1, inputs[-1][1][-1]).sum(0, keepdims=True)
+        return g.reshape(-1, shape['b'][-1]).sum(0, keepdims=True)
 
     y = sym.broadcast_add(a, b)
     def _backward_add(head_grads, a, b):
         da = head_grads
         db = _collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a + b, _backward_add)
+    check_function(y, lambda a, b: a + b, _backward_add, shape=shape)
 
     y = sym.broadcast_sub(a, b)
     def _backward_sub(head_grads, a, b):
         da = head_grads
         db = -_collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a - b, _backward_sub)
+    check_function(y, lambda a, b: a - b, _backward_sub, shape=shape)
 
     y = sym.broadcast_mul(a, b)
     def _backward_mul(head_grads, a, b):
         da = head_grads * b
         db = _collapse(head_grads * a)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a * b, _backward_mul)
+    check_function(y, lambda a, b: a * b, _backward_mul, shape=shape)
 
     y = sym.broadcast_div(a, b)
     def _backward_div(head_grads, a, b):
         da = head_grads / b
         db = _collapse(- head_grads * a / b**2)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a / b, _backward_div)
+    # We avoid computing numerical derivatives too close to zero here
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape, numerical_grads=False)
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape,
+                   in_range={'b': (0.1, 20)})
 
     y = sym.broadcast_mod(a, b)
-    helper(y, inputs, 'int32',
-           lambda a, b: np.mod(a, b),
-           in_range={'a': (0.001, 100), 'b': (1, 100)})
+    check_function(y,
+                   lambda a, b: np.mod(a, b),
+                   in_range={'a': (0.001, 100), 'b': (1, 100)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_max(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.maximum(a, b))
+    check_function(y, lambda a, b: np.maximum(a, b), shape=shape)
 
     y = sym.broadcast_min(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.minimum(a, b))
+    check_function(y, lambda a, b: np.minimum(a, b), shape=shape)
 
     y = sym.broadcast_pow(a, b)
-    helper(y, inputs, dtype,
-           lambda a, b: np.power(a, b),
-           in_range={'a': (0.001, 100), 'b': (0.001, 2)})
+    check_function(y,
+                   lambda a, b: np.power(a, b),
+                   in_range={'a': (0.001, 100), 'b': (0.001, 2)}, shape=shape)
 
     y = sym.broadcast_left_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a << b)
+    check_function(y, lambda a, b: a << b, dtype='int32', shape=shape)
 
     y = sym.broadcast_right_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a >> b)
+    check_function(y, lambda a, b: a >> b, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.greater(a, b))
+    check_function(y, lambda a, b: np.greater(a, b), shape=shape)
 
     y = sym.broadcast_less(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.less(a, b))
+    check_function(y, lambda a, b: np.less(a, b), shape=shape)
 
     y = sym.broadcast_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_not_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.not_equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.not_equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.greater_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.greater_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_less_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.less_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.less_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
 def test_greater():
     l = sym.Variable("l")
@@ -325,13 +279,10 @@ def forward(l, r):
         return np.greater(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
+        return {'l': np.zeros_like(l)}
 
-
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_less():
@@ -343,13 +294,10 @@ def forward(l, r):
         return np.less(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
+        return {'l': np.zeros_like(l)}
 
-
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_reshape_like():
@@ -364,11 +312,8 @@ def backward(head_grads, x, y):
         return [np.reshape(head_grads, x.shape),
                 np.zeros_like(y)]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x),
-              ('y', (5, 4, 3), y)]
-    helper(z, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5), 'y': (5, 4, 3)}
+    check_function(z, forward, backward, shape=shape)
 
 
 def verify_expand_like(in_shape, out_shape, axis, exclude):
@@ -412,10 +357,8 @@ def backward(head_grads, x, y):
                 np.zeros_like(y)]
 
 
-    dtype = "float32"
-    inputs = [('x', in_shape, x),
-              ('y', out_shape, y)]
-    helper(z, inputs, dtype, forward, backward, need_input=False)
+    shape = {'x': in_shape, 'y': out_shape}
+    check_function(z, forward, backward, shape=shape)
 
 
 def test_expand_like():
@@ -440,10 +383,8 @@ def forward(**inputs):
     def backward(head_grads, **inputs):
         return [head_grads] * num_args
 
-    dtype = "float32"
-    inputs = [("input" + str(i), (3, 4, 5), s[i])
-              for i in range(num_args)]
-    helper(y, inputs, dtype, forward, backward, need_input=False)
+    shape = {s[i]: (3, 4, 5) for i in range(num_args)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_elemwise_sum():
@@ -463,9 +404,9 @@ def backward(head_grads, x):
         return [np.zeros_like(head_grads)]
 
 
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'x': (3, 4, 5)}
+    # Numerical grad checking would fail for this function
+    check_function(y, forward, backward, shape=shape, numerical_grads=False)
 
 
 def test_full():

From 32fdda214a932f472b0f3147f3cec4141d03145d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 23 Aug 2018 19:37:03 -0700
Subject: [PATCH 054/529] [AUTOTVM] Fix local executor (#1651)

The old queue size is too small. It will stall the executor due to race condition.
---
 python/tvm/autotvm/measure/local_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index 8a045ecfb4c0..55f1dc75fc5c 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -133,7 +133,7 @@ def submit(self, func, *args, **kwargs):
         if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))
 
-        queue = Queue(1)
+        queue = Queue(2)
         process = Process(target=timeout_monitor,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()

From b3e958dc1b44ab98211413d28bd06187df6d6999 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 21:59:34 -0700
Subject: [PATCH 055/529] [NODE] Enable global singleton object, allow
 set_body_typed in function registry, default fallback of IRPrinter. (#1652)

---
 HalideIR                           |  2 +-
 include/tvm/base.h                 | 72 ++++++++++++++++++++++++------
 include/tvm/runtime/registry.h     | 18 ++++++++
 nnvm/src/compiler/graph_runtime.cc |  2 +-
 src/api/api_base.cc                | 12 +----
 src/lang/reflection.cc             | 30 +++++++++++--
 6 files changed, 108 insertions(+), 28 deletions(-)

diff --git a/HalideIR b/HalideIR
index a0b9563f4571..6f64f7866747 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
+Subproject commit 6f64f7866747a2a81bec84aea9bde0479c5b72c1
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 2c5c5ad54875..c2d796b6002c 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -68,26 +68,72 @@ inline NodeType LoadJSON(const std::string& json_str) {
   return NodeType(LoadJSON_(json_str));
 }
 
-/*! \brief typedef the factory function of data iterator */
-using NodeFactory = std::function<std::shared_ptr<Node> ()>;
 /*!
- * \brief Registry entry for NodeFactory
+ * \brief Registry entry for NodeFactory.
+ *
+ *  There are two types of Nodes that can be serialized.
+ *  The normal node requires a registration a creator function that
+ *  constructs an empty Node of the corresponding type.
+ *
+ *  The global singleton(e.g. global operator) where only global_key need to be serialized,
+ *  in this case, FGlobalKey need to be defined.
  */
-struct NodeFactoryReg
-    : public dmlc::FunctionRegEntryBase<NodeFactoryReg,
-                                        NodeFactory> {
+struct NodeFactoryReg {
+  /*!
+   * \brief creator function.
+   * \param global_key Key that identifies a global single object.
+   *        If this is not empty then FGlobalKey
+   * \return The created function.
+   */
+  using FCreate = std::function<std::shared_ptr<Node>(const std::string& global_key)>;
+  /*!
+   * \brief Global key function, only needed by global objects.
+   * \param node The node pointer.
+   * \return node The global key to the node.
+   */
+  using FGlobalKey = std::function<std::string(const Node* node)>;
+  /*! \brief registered name */
+  std::string name;
+  /*!
+   * \brief The creator function
+   */
+  FCreate fcreator = nullptr;
+  /*!
+   * \brief The global key function.
+   */
+  FGlobalKey fglobal_key = nullptr;
+  // setter of creator
+  NodeFactoryReg& set_creator(FCreate f) {  // NOLINT(*)
+    this->fcreator = f;
+    return *this;
+  }
+  // setter of creator
+  NodeFactoryReg& set_global_key(FGlobalKey f) {  // NOLINT(*)
+    this->fglobal_key = f;
+    return *this;
+  }
+  // global registry singleton
+  TVM_DLL static ::dmlc::Registry<::tvm::NodeFactoryReg> *Registry();
 };
 
+/*!
+ * \brief Register a Node type
+ * \note This is necessary to enable serialization of the Node.
+ */
 #define TVM_REGISTER_NODE_TYPE(TypeName)                                \
   static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::dmlc::Registry<::tvm::NodeFactoryReg>::Get()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
+      ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \
+      .set_creator([](const std::string&) { return std::make_shared<TypeName>(); })
+
+
+#define TVM_STRINGIZE_DETAIL(x) #x
+#define TVM_STRINGIZE(x) TVM_STRINGIZE_DETAIL(x)
+#define TVM_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" TVM_STRINGIZE(__LINE__))
+/*!
+ * \brief Macro to include current line as string
+ */
+#define TVM_ADD_FILELINE "\n\nDefined in " __FILE__ ":L" TVM_STRINGIZE(__LINE__)
 
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry();
 
-#define TVM_EXTERNAL_REGISTER_NODE_TYPE(TypeName)                                \
-  static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::tvm::GetTVMNodeFactoryRegistry()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
 }  // namespace tvm
 #endif  // TVM_BASE_H_
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index c10a03e0604a..9466056a1282 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -47,6 +47,24 @@ class Registry {
   Registry& set_body(PackedFunc::FType f) {  // NOLINT(*)
     return set_body(PackedFunc(f));
   }
+  /*!
+   * \brief set the body of the function to be TypedPackedFunc.
+   *
+   * \code
+   *
+   * TVM_REGISTER_API("addone")
+   * .set_body_typed<int(int)>([](int x) { return x + 1; });
+   *
+   * \endcode
+   *
+   * \param f The body of the function.
+   * \tparam FType the signature of the function.
+   * \tparam FLambda The type of f.
+   */
+  template<typename FType, typename FLambda>
+  Registry& set_body_typed(FLambda f) {
+    return set_body(TypedPackedFunc<FType>(f).packed());
+  }
   /*!
    * \brief Register a function with given name
    * \param name The name of the function.
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index bc4a803681e4..c680e82dd936 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -100,6 +100,6 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     *rv = ret;
   });
 
-TVM_EXTERNAL_REGISTER_NODE_TYPE(NDArrayWrapperNode);
+TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode);
 }  // namespace compiler
 }  // namespace nnvm
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 3583f42a00c9..a385d170070a 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -24,21 +24,13 @@ TVM_REGISTER_API("_raw_ptr")
   });
 
 TVM_REGISTER_API("_save_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = SaveJSON(args[0]);
-  });
+.set_body_typed<std::string(NodeRef)>(SaveJSON);
 
 TVM_REGISTER_API("_load_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = LoadJSON<NodeRef>(args[0]);
-  });
+.set_body_typed<NodeRef(std::string)>(LoadJSON<NodeRef>);
 
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
   });
-
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry() {
-  return ::dmlc::Registry<::tvm::NodeFactoryReg>::Get();
-}
 }  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 9fb9143aa7f4..93e2defd5aef 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -20,6 +20,10 @@ DMLC_REGISTRY_ENABLE(::tvm::NodeFactoryReg);
 
 namespace tvm {
 
+::dmlc::Registry<NodeFactoryReg>* NodeFactoryReg::Registry() {
+  return ::dmlc::Registry<NodeFactoryReg>::Get();
+}
+
 inline std::string Type2String(const Type& t) {
   if (t.code()  ==Type::Handle) return "handle";
   std::ostringstream os;
@@ -115,6 +119,8 @@ using AttrMap = std::map<std::string, std::string>;
 struct JSONNode {
   // The type key of the data
   std::string type_key;
+  // The global key for global object
+  std::string global_key;
   // the attributes
   AttrMap attrs;
   // container keys
@@ -125,6 +131,9 @@ struct JSONNode {
   void Save(dmlc::JSONWriter *writer) const {
     writer->BeginObject();
     writer->WriteObjectKeyValue("type_key", type_key);
+    if (global_key.size() != 0) {
+      writer->WriteObjectKeyValue("global_key", global_key);
+    }
     if (attrs.size() != 0) {
       writer->WriteObjectKeyValue("attrs", attrs);
     }
@@ -140,9 +149,11 @@ struct JSONNode {
   void Load(dmlc::JSONReader *reader) {
     attrs.clear();
     data.clear();
+    global_key.clear();
     type_key.clear();
     dmlc::JSONObjectReadHelper helper;
     helper.DeclareOptionalField("type_key", &type_key);
+    helper.DeclareOptionalField("global_key", &global_key);
     helper.DeclareOptionalField("attrs", &attrs);
     helper.DeclareOptionalField("keys", &keys);
     helper.DeclareOptionalField("data", &data);
@@ -195,6 +206,12 @@ class JSONAttrGetter : public AttrVisitor {
       return;
     }
     node_->type_key = node->type_key();
+    // sepcially handle global object
+    auto* f = dmlc::Registry<NodeFactoryReg>::Find(node_->type_key);
+    if (f->fglobal_key != nullptr) {
+      node_->global_key = f->fglobal_key(node);
+      return;
+    }
     node_->attrs.clear();
     node_->data.clear();
     if (node->is_type<ArrayNode>()) {
@@ -403,7 +420,7 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
       auto* f = dmlc::Registry<NodeFactoryReg>::Find(jnode.type_key);
       CHECK(f != nullptr)
           << "Node type \'" << jnode.type_key << "\' is not registered in TVM";
-      nodes.emplace_back(f->body());
+      nodes.emplace_back(f->fcreator(jnode.global_key));
     } else {
       nodes.emplace_back(std::shared_ptr<Node>());
     }
@@ -415,7 +432,11 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
 
   for (size_t i = 0; i < nodes.size(); ++i) {
     setter.node_ = &jgraph.nodes[i];
-    setter.Set(nodes[i].get());
+    // do not need to recover content of global singleton object
+    // they are registered via the environment
+    if (setter.node_->global_key.length() == 0) {
+      setter.Set(nodes[i].get());
+    }
   }
   return nodes.at(jgraph.root);
 }
@@ -493,11 +514,14 @@ void InitNodeByPackedArgs(Node* n, const TVMArgs& args) {
 //   key1, value1, ..., key_n, value_n
 void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
   std::string type_key = args[0];
+  std::string empty_str;
   auto* f = dmlc::Registry<NodeFactoryReg>::Find(type_key);
   CHECK(f != nullptr)
       << "Node type \'" << type_key << "\' is not registered in TVM";
   TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);
-  std::shared_ptr<Node> n = f->body();
+  CHECK(f->fglobal_key == nullptr)
+      << "Cannot make node type \'" << type_key << "\' with global_key.";
+  std::shared_ptr<Node> n = f->fcreator(empty_str);
   if (n->derived_from<BaseAttrsNode>()) {
     static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
   } else {

From b0aae880acd87a593ad5ddbc533d343a1f80e0ce Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 24 Aug 2018 09:37:05 -0700
Subject: [PATCH 056/529] improve text summary (#1655)

---
 apps/android_rpc/README.md                | 10 +--
 apps/benchmark/README.md                  | 16 ++---
 python/tvm/rpc/client.py                  | 25 ++++---
 python/tvm/rpc/tornado_util.py            |  2 +
 python/tvm/rpc/tracker.py                 | 31 ++++++++-
 tests/python/unittest/test_runtime_rpc.py | 80 ++++++++++++++++++++++-
 tutorials/autotvm/tune_nnvm_arm.py        | 14 ++--
 7 files changed, 146 insertions(+), 32 deletions(-)

diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index eef22f3c7010..453263aa824e 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -104,11 +104,11 @@ You are supposed to find a free "android" in the queue status.
 ...
 
 Queue Status
-----------------------------
-key    	free	pending
-----------------------------
-android	1	0
-----------------------------
+-------------------------------
+key       total  free  pending
+-------------------------------
+android   1      1     0
+-------------------------------
 ```
 
 
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index ee22f90dc435..95742e0decf4 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -40,14 +40,14 @@ python3 -m tvm.exec.rpc_tracker
   For our test environment, one sample output can be 
   ```bash
   Queue Status                
-  ------------------------------
-  key            free    pending    
-  ------------------------------
-  mate10pro      1       0   
-  p20pro         2       0  
-  pixel2         2       0 
-  rk3399         2       0
-  rasp3b         8       0
+  ----------------------------------
+  key          total  free  pending    
+  ----------------------------------
+  mate10pro    1      1     0
+  p20pro       2      2     0 
+  pixel2       2      2     0
+  rk3399       2      2     0
+  rasp3b       8      8     0
   ```
 
  4. Run benchmark  
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index ffbe6eeab6ee..ae44e5a79933 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -218,6 +218,9 @@ def summary(self):
     def text_summary(self):
         """Get a text summary of the tracker."""
         data = self.summary()
+
+        total_ct = {}
+
         res = ""
         res += "Server List\n"
         res += "----------------------------\n"
@@ -225,8 +228,12 @@ def text_summary(self):
         res += "----------------------------\n"
         for item in data["server_info"]:
             addr = item["addr"]
-            res += addr[0] + ":" + str(addr[1])+ "\t"
+            res += addr[0] + ":" + str(addr[1]) + "\t"
             res += item["key"] + "\n"
+            key = item['key'].split(':')[1]   # 'server:rasp3b` -> 'rasp3b'
+            if key not in total_ct:
+                total_ct[key] = 0
+            total_ct[key] += 1
         res += "----------------------------\n"
         res += "\n"
 
@@ -240,14 +247,16 @@ def text_summary(self):
             max_key_len = 0
 
         res += "Queue Status\n"
-        res += "----------------------------\n"
-        res += ("%%-%ds" % max_key_len + "\tfree\tpending\n") % 'key'
-        res += "----------------------------\n"
+        title = ("%%-%ds" % max_key_len + "   total  free  pending\n") % 'key'
+        separate_line = '-' * len(title) + '\n'
+        res += separate_line + title + separate_line
         for k in keys:
-            res += ("%%-%ds" % max_key_len + "\t%d\t%g\n") % \
-                   (k, queue_info[k]["free"], queue_info[k]["pending"])
-
-        res += "----------------------------\n"
+            total = total_ct.get(k, 0)
+            free, pending = queue_info[k]["free"], queue_info[k]["pending"]
+            if total or pending:
+                res += ("%%-%ds" % max_key_len + "   %-5d  %-4d  %-7d\n") % \
+                       (k, total, free, pending)
+        res += separate_line
         return res
 
     def request(self, key, priority=1, session_timeout=0, max_retry=5):
diff --git a/python/tvm/rpc/tornado_util.py b/python/tvm/rpc/tornado_util.py
index 00e1fd13865b..eafea2e85394 100644
--- a/python/tvm/rpc/tornado_util.py
+++ b/python/tvm/rpc/tornado_util.py
@@ -66,6 +66,8 @@ def _update_write(self):
         while self._pending_write:
             try:
                 msg = self._pending_write[0]
+                if self._sock is None:
+                    return
                 nsend = self._sock.send(msg)
                 if nsend != len(msg):
                     self._pending_write[0] = msg[nsend:]
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index de39c97b5000..88868ad6e978 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -78,6 +78,16 @@ def request(self, user, priority, callback):
         """
         raise NotImplementedError()
 
+    def remove(self, value):
+        """Remove a resource in the scheduler
+
+        Parameters
+        ----------
+        value: object
+            The resource to remove
+        """
+        pass
+
     def summary(self):
         """Get summary information of the scheduler."""
         raise NotImplementedError()
@@ -108,6 +118,11 @@ def request(self, user, priority, callback):
         heapq.heappush(self._requests, (-priority, time.time(), callback))
         self._schedule()
 
+    def remove(self, value):
+        if value in self._values:
+            self._values.remove(value)
+            self._schedule()
+
     def summary(self):
         """Get summary information of the scheduler."""
         return {"free": len(self._values),
@@ -132,6 +147,7 @@ def __init__(self, tracker, sock, addr):
         # list of pending match keys that has not been used.
         self.pending_matchkeys = set()
         self._tracker._connections.add(self)
+        self.put_values = []
 
     def name(self):
         """name of connection"""
@@ -199,9 +215,11 @@ def call_handler(self, args):
             self.pending_matchkeys.add(matchkey)
             # got custom address (from rpc server)
             if args[3] is not None:
-                self._tracker.put(key, (self, args[3], port, matchkey))
+                value = (self, args[3], port, matchkey)
             else:
-                self._tracker.put(key, (self, self._addr[0], port, matchkey))
+                value = (self, self._addr[0], port, matchkey)
+            self._tracker.put(key, value)
+            self.put_values.append(value)
             self.ret_value(TrackerCode.SUCCESS)
         elif code == TrackerCode.REQUEST:
             key = args[1]
@@ -239,7 +257,7 @@ def _cb(value):
             self.close()
 
     def on_close(self):
-        self._tracker._connections.remove(self)
+        self._tracker.close(self)
 
     def on_error(self, err):
         logger.warning("%s: Error in RPC Tracker: %s", self.name(), err)
@@ -285,6 +303,13 @@ def request(self, key, user, priority, callback):
             self._scheduler_map[key] = self.create_scheduler(key)
         self._scheduler_map[key].request(user, priority, callback)
 
+    def close(self, conn):
+        self._connections.remove(conn)
+        if 'key' in conn._info:
+            key = conn._info['key'].split(':')[1]  # 'server:rasp3b' -> 'rasp3b'
+            for value in conn.put_values:
+                self._scheduler_map[key].remove(value)
+
     def stop(self):
         """Safely stop tracker."""
         for conn in list(self._connections):
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index 0de788068b6b..756b2867184d 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -1,10 +1,13 @@
 import tvm
 import os
 import logging
-import numpy as np
 import time
+import multiprocessing
+
+import numpy as np
 from tvm import rpc
 from tvm.contrib import util
+from tvm.rpc.tracker import Tracker
 
 
 def test_bigendian_rpc():
@@ -237,6 +240,79 @@ def addone(x):
     rev = client.download("dat.bin")
     assert rev == blob
 
+def test_rpc_tracker_register():
+    # test registration
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    time.sleep(1)
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    remote = client.request(device_key)
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    del remote
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    server.terminate()
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    tracker.terminate()
+
+def test_rpc_tracker_request():
+    # test concurrent request
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    def target(host, port, device_key, timeout):
+        client = rpc.connect_tracker(host, port)
+        remote = client.request(device_key, session_timeout=timeout)
+        while True:
+            pass
+        remote.cpu()
+
+    proc1 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 4))
+    proc2 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 200))
+    proc1.start()
+    time.sleep(0.5)
+    proc2.start()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 1
+
+    proc1.terminate()
+    proc1.join()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 0
+
+    proc2.terminate()
+    proc2.join()
+    server.terminate()
+    tracker.terminate()
+
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
@@ -248,3 +324,5 @@ def addone(x):
     test_rpc_array()
     test_rpc_simple()
     test_local_func()
+    test_rpc_tracker_register()
+    test_rpc_tracker_request()
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index e85786037477..a080681f4ca3 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -151,13 +151,13 @@ def get_network(name, batch_size):
 # .. code-block:: bash
 #
 #    Queue Status
-#    ----------------------------
-#    key          free    pending
-#    ----------------------------
-#    mate10pro    2       0
-#    rk3399       2       0
-#    rpi3b        11      0
-#    ----------------------------
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
 
 ###########################################
 # Set Tuning Options

From be80d2db5447b6913d853fbaf277c588f6de7717 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 24 Aug 2018 21:06:15 -0700
Subject: [PATCH 057/529] [LANG] Improve serializer (#1658)

---
 src/lang/attrs.cc                             | 2 ++
 src/lang/reflection.cc                        | 5 +++++
 tests/python/unittest/test_lang_reflection.py | 3 ++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 0d8d1f3c9ece..360c5b2e9833 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -42,4 +42,6 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 
 TVM_REGISTER_NODE_TYPE(DictAttrsNode);
 
+TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
+
 }  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 93e2defd5aef..a33594107a69 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -236,6 +236,11 @@ class JSONAttrGetter : public AttrVisitor {
             node_index_->at(kv.second.get()));
       }
     } else {
+      // do not need to recover content of global singleton object
+      // they are registered via the environment
+      auto* f = dmlc::Registry<NodeFactoryReg>::Find(node->type_key());
+      if (f != nullptr && f->fglobal_key != nullptr) return;
+      // recursively index normal object.
       node->VisitAttrs(this);
     }
   }
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index 2ba67b8d9c86..9678fff8ef9b 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -58,7 +58,8 @@ def test_make_attrs():
 
     dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
     assert dattr.x.value == 1
-
+    datrr = tvm.load_json(tvm.save_json(dattr))
+    assert dattr.name.value == "xyz"
 
 
 def test_make_sum():

From 4a1441490406a3f7c463b604d05f1fb2601a7f3c Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Sat, 25 Aug 2018 13:15:35 +0900
Subject: [PATCH 058/529] [RUNTIME][OPENCL] delay device check (#1657)

---
 docs/deploy/aws_fpga.md             | 4 ++--
 src/runtime/opencl/opencl_module.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
index 7554ce7f64cd..86918ca90a80 100644
--- a/docs/deploy/aws_fpga.md
+++ b/docs/deploy/aws_fpga.md
@@ -108,10 +108,10 @@ python run.py
 Synthesis
 ---------
 
-- Run synthesis with the following script. `XCL_EMULATION_MODE` must be set to 1 at this stage.
+- Run synthesis with the following script.
 
 ```bash
-export XCL_EMULATION_MODE=1
+unset XCL_EMULATION_MODE
 export XCL_TARGET=hw
 
 python build.py
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 6d392036cc56..ed5c3c235ac1 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -34,6 +34,7 @@ class OpenCLWrappedFunc {
   void operator()(TVMArgs args,
                   TVMRetValue* rv,
                   void** void_args) const {
+    CHECK(w_->context != nullptr) << "No OpenCL device";
     cl::OpenCLThreadEntry* t = w_->GetThreadEntry();
     // get the kernel from thread local kernel table.
     if (entry_.kernel_id >= t->kernel_table.size()) {
@@ -157,7 +158,6 @@ std::string OpenCLModuleNode::GetSource(const std::string& format) {
 void OpenCLModuleNode::Init() {
   workspace_ = GetGlobalWorkspace();
   workspace_->Init();
-  CHECK(workspace_->context != nullptr) << "No OpenCL device";
   device_built_flag_.resize(workspace_->devices.size(), false);
   // initialize the kernel id, need to lock global table.
   std::lock_guard<std::mutex> lock(workspace_->mu);

From 8d10743f74ea08b89bd331d65a32090b93d6ff24 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Sat, 25 Aug 2018 17:05:06 +0900
Subject: [PATCH 059/529] [CODEGEN][AOCL] Add math intrinsic rules (#1653)

* [CODEGEN][AOCL] Add math intrinsic rules

* introduce aocl_emu target for AOCL emulation

* rename aocl_emu with aocl_sw_emu

* update docs
---
 docs/deploy/aocl_fpga.md                    |  4 +-
 python/tvm/_ffi/runtime_ctypes.py           |  1 +
 src/codegen/build_module.cc                 |  2 +-
 src/codegen/codegen_aocl.cc                 | 23 +++---
 src/codegen/intrin_rule_aocl.cc             | 82 +++++++++++++++++++++
 tests/python/integration/test_ewise_fpga.py |  4 +-
 topi/tests/python/test_topi_math.py         |  3 +-
 7 files changed, 103 insertions(+), 16 deletions(-)
 create mode 100644 src/codegen/intrin_rule_aocl.cc

diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
index bd0dae97879d..f29fc9ef1ace 100644
--- a/docs/deploy/aocl_fpga.md
+++ b/docs/deploy/aocl_fpga.md
@@ -12,7 +12,7 @@ We use two python scripts for this tutorial.
 import tvm
 
 tgt_host="llvm"
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 n = tvm.var("n")
 A = tvm.placeholder((n,), name='A')
@@ -38,7 +38,7 @@ import tvm
 import numpy as np
 import os
 
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 fadd = tvm.module.load("myadd.so")
 fadd_dev = tvm.module.load("myadd.aocx")
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 4c36e82a81ec..4f94e0e62d0a 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -115,6 +115,7 @@ class TVMContext(ctypes.Structure):
         'cl': 4,
         'opencl': 4,
         'aocl' : 5,
+        'aocl_sw_emu' : 5,
         'sdaccel': 6,
         'vulkan': 7,
         'metal': 8,
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 0cb0ec3cc4be..fef5a28b1d21 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -92,7 +92,7 @@ Target CreateTarget(const std::string& target_name,
     t->device_type = kDLOpenCL;
     t->keys_array.push_back(ir::StringImm::make("sdaccel"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
-  } else if (target_name == "aocl") {
+  } else if (target_name == "aocl" || target_name == "aocl_sw_emu") {
     t->device_type = kDLAOCL;
     t->keys_array.push_back(ir::StringImm::make("aocl"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 506a4f7ed92c..6ae89fecf6bf 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -13,7 +13,8 @@
 namespace tvm {
 namespace codegen {
 
-runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
+runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str,
+                          bool emulation) {
   // Get code.
   using tvm::runtime::Registry;
   bool output_ssa = false;
@@ -31,17 +32,14 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
   runtime::SaveBinaryToFile("aocl.cl", code.c_str());
 
   // Compile the .cl file.
+  std::string cmd = "aoc aocl.cl";
   Target target = Target::create(target_str);
-  if (target->device_name == "") {
-    LOG(FATAL) << "AOCL device name not specified in build target.";
+  if (target->device_name != "") {
+    cmd += " -board=" + target->device_name;
   }
-  std::string cmd = "aoc aocl.cl";
-  for (std::string option : target->options()) {
-    if (option == "-mattr=emulator") {
-      cmd += " -march=emulator";
-    }
+  if (emulation) {
+    cmd += " -march=emulator";
   }
-  cmd += " -board=" + target->device_name;
   if (system(cmd.c_str()) != 0) {
     LOG(FATAL) << "OpenCL offline compilation error.";
   }
@@ -55,7 +53,12 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
 
 TVM_REGISTER_API("codegen.build_aocl")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildAOCL(args[0], args[1]);
+    *rv = BuildAOCL(args[0], args[1], false);
+  });
+
+TVM_REGISTER_API("codegen.build_aocl_sw_emu")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildAOCL(args[0], args[1], true);
   });
 
 }  // namespace codegen
diff --git a/src/codegen/intrin_rule_aocl.cc b/src/codegen/intrin_rule_aocl.cc
new file mode 100644
index 000000000000..fc5dbe741d63
--- /dev/null
+++ b/src/codegen/intrin_rule_aocl.cc
@@ -0,0 +1,82 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file intrin_rule_aocl.cc
+ * \brief AOCL intrinsic rules.
+ */
+#include "intrin_rule.h"
+
+namespace tvm {
+namespace codegen {
+namespace intrin {
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+}  // namespace intrin
+}  // namespace codegen
+}  // namespace tvm
diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
index 0abefff02778..2524e2d230b1 100644
--- a/tests/python/integration/test_ewise_fpga.py
+++ b/tests/python/integration/test_ewise_fpga.py
@@ -44,7 +44,7 @@ def check_device(device, host="llvm"):
     if "AWS_PLATFORM" in os.environ:
         check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
 
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 def test_multi_kernel():
     # graph
@@ -82,7 +82,7 @@ def check_device(device, host="llvm"):
             d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
 
     check_device("sdaccel")
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 8d82dbe5bf82..5d606d507387 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -39,7 +39,8 @@ def check_device(device):
             foo(a, b)
             np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
-        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel']:
+        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel',
+                       'aocl_sw_emu']:
             check_device(device)
 
 

From 35a58aeac83dabeb785e83951f3e799bd645aa67 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sun, 26 Aug 2018 12:20:55 +0530
Subject: [PATCH 060/529] [FRONTEND][ONNX]HardSigmoid, min, max, mean ops
 support (#1645)

---
 nnvm/python/nnvm/frontend/onnx.py             |  56 +++++++-
 .../python/frontend/onnx/test_forward.py      | 125 ++++++++++++++++++
 2 files changed, 176 insertions(+), 5 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index f62202a37dff..5127dfd299bd 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -529,6 +529,53 @@ def _impl_v1(cls, inputs, attr, params):
         return _sym.lrn(inputs[0], size=nsize, axis=axis,
                         alpha=alpha, beta=beta, bias=bias)
 
+class Maximum(OnnxOpConverter):
+    """ Operator converter for Maximum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _max = inputs[0]
+        for i in range(1, len(inputs)):
+            _max = AttrCvt(op_name='broadcast_max')([_max, inputs[i]], {})
+        return _max
+
+class Minimum(OnnxOpConverter):
+    """ Operator converter for Minimum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _min = inputs[0]
+        for i in range(1, len(inputs)):
+            _min = AttrCvt(op_name='broadcast_min')([_min, inputs[i]], {})
+        return _min
+
+class Mean(OnnxOpConverter):
+    """ Operator converter for Mean.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        count = len(inputs)
+        _sum = inputs[0]
+        for i in range(1, count):
+            _sum = AttrCvt(op_name='broadcast_add')([_sum, inputs[i]], {})
+        return _sum / count
+
+class HardSigmoid(OnnxOpConverter):
+    """ Operator converter for HardSigmoid.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        alpha = attr.get('alpha', 0.2)
+        beta = attr.get('beta', 0.5)
+        transformX = (inputs[0] * alpha) + beta
+        attr = {'a_min':0, 'a_max':1}
+        return AttrCvt(op_name='clip')([transformX], attr)
 
 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -557,7 +604,6 @@ def _get_convert_map(opset):
         # 'MeanVarianceNormalization'
         # 'Crop'
         # 'Embedding'
-        # 'Upsample'
         'Upsample' : Upsample.get_converter(opset),
         'SpatialBN': BatchNorm.get_converter(opset),
 
@@ -591,11 +637,11 @@ def _get_convert_map(opset):
         'Pow': Renamer('broadcast_pow'),
         'PRelu': Prelu.get_converter(opset),
         'Sigmoid': Renamer('sigmoid'),
-        # 'HardSigmoid'
-        # 'Max' : this is the elemwise maximum
-        # 'Min' : this is the elemwise minimum
+        'HardSigmoid': HardSigmoid.get_converter(opset),
+        'Max': Maximum.get_converter(opset),
+        'Min': Minimum.get_converter(opset),
         'Sum': Sum.get_converter(opset),
-        # 'Mean'
+        'Mean': Mean.get_converter(opset),
         'Clip': AttrCvt('clip', transforms={'min': 'a_min', 'max': 'a_max'}),
         # softmax default axis is different in onnx
         'Softmax': AttrCvt('softmax', {'axis': ('axis', 1)}),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 3f2fbb144289..49cf58fa1aa5 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -426,6 +426,127 @@ def test_upsample():
     _test_upsample_nearest()
     _test_upsample_bilinear()
 
+def verify_min(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+
+    min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([min_node],
+                              "Min_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Min_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))
+    verify_min((20, 20))
+
+def verify_max(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+
+    max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([max_node],
+                              "Max_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Max_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))
+    verify_max((20, 20))
+
+def verify_mean(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.mean((a_np1, a_np2, a_np3), axis=0)
+
+    mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([mean_node],
+                              "Mean_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Mean_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mean():
+    verify_mean((1, 3, 20, 20))
+    verify_mean((20, 20))
+
+def verify_hardsigmoid(input_dim, alpha, beta):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.clip(a_np1 * alpha + beta, 0, 1)
+
+    hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta)
+
+    graph = helper.make_graph([hardsigmoid_node],
+                              "HardSigmoid_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='HardSigmoid_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_hardsigmoid():
+    verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6)
+    verify_hardsigmoid((20, 20), 0.3, 0.4)
 
 if __name__ == '__main__':
     # verify_super_resolution_example()
@@ -445,3 +566,7 @@ def test_upsample():
     test_gather()
     test_lrn()
     test_upsample()
+    test_forward_min()
+    test_forward_max()
+    test_forward_mean()
+    test_forward_hardsigmoid()

From 857c0bd941b6cefe611638d338274e93795078a0 Mon Sep 17 00:00:00 2001
From: Tatsuya Nishiyama <nishiyama.tatsuya0@gmail.com>
Date: Tue, 28 Aug 2018 05:05:50 +0900
Subject: [PATCH 061/529] [FRONTEND][TENSORFLOW] fix the convertion of sum and
 add testcase for it (#1654)

* [TENSORFLOW] fix the convertion of sum and add testcase for it

* delete checking tyoe of axis and divide reduce test
---
 nnvm/python/nnvm/frontend/tensorflow.py       |  2 ++
 .../frontend/tensorflow/test_forward.py       | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 65dd3619b5b2..6be5333ccee6 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -444,6 +444,8 @@ def _impl(inputs, attr, params):
 def _sum():
     def _impl(inputs, attr, params):
         axis = params.pop(inputs[1].list_output_names()[0]).asnumpy()
+        # convert to tuple for preventing invalid parameter format error
+        axis = tuple(axis)
         return AttrCvt(
             op_name='sum',
             extras={'axis': axis},
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 6fa020a03444..e0e18d1bdb06 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -348,6 +348,26 @@ def test_forward_argminmax():
         _test_argx(tf.argmax, data=data, axis=axis)
         _test_argx(tf.argmin, data=data, axis=axis)
 
+#######################################################################
+# Reduce
+# ------
+
+def _test_reduce(func, data, **kwargs):
+    """ One iteration of a reduce operation"""
+
+    with tf.Graph().as_default():
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="reducex0", **kwargs)
+
+        compare_tf_with_tvm(data, 'c0:0', 'reducex0:0')
+
+def test_forward_reduce():
+    data = np.random.uniform(size=(8,4,9)).astype('float32')
+    _test_reduce(tf.reduce_sum, data=data)
+    _test_reduce(tf.reduce_sum, data=data, axis=0)
+    _test_reduce(tf.reduce_sum, data=data, axis=(0,1))    
+
+
 #######################################################################
 # Variable
 # --------
@@ -844,6 +864,7 @@ def test_forward_l2_normalize():
     test_forward_squeeze()
     test_forward_sigmoid()
     test_forward_argminmax()
+    test_forward_reduce()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
     test_forward_multi_input()

From 100cf485f98c6ed6eda612242d1b884fb30c9ebc Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Mon, 27 Aug 2018 13:33:27 -0700
Subject: [PATCH 062/529] add docstring skip in hybrid script (#1668)

* add docstring skip in hybrid script

* fix lint
---
 python/tvm/hybrid/parser.py                 | 4 ++--
 python/tvm/hybrid/util.py                   | 6 ++++++
 tests/python/unittest/test_hybrid_script.py | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index 1e532367a321..cf21ea950549 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -3,7 +3,7 @@
 import ast
 import operator
 import sys
-from .util import make_nop, halide_imm_types
+from .util import make_nop, halide_imm_types, is_docstring
 from .intrin import LOOP_INTRIN, MATH_INTRIN
 from .var_decl import determine_variable_usage
 from ..api import thread_axis
@@ -15,7 +15,7 @@
 
 def list_to_block(visit, lst):
     """Convert a list of Python IR nodes to HalideIR Block"""
-    lst = list(map(visit, lst))
+    lst = [visit(stmt) for stmt in lst if not is_docstring(stmt)]
     lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, make_nop())]
     if not lst:
         return make_nop()
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index 43d26e859560..2a43957e9706 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -1,5 +1,6 @@
 """Internal utilities for parsing Python subset to HalideIR"""
 
+import ast
 import inspect
 import numpy
 from .intrin import HYBRID_GLOBALS
@@ -22,6 +23,11 @@ def make_nop():
     return _make.Evaluate(_api.const(0, dtype='int32'))
 
 
+def is_docstring(node):
+    """Checks if a Python AST node is a docstring"""
+    return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str)
+
+
 def _pruned_source(func):
     """Prune source code's extra leading spaces"""
     lines = inspect.getsource(func).split('\n')
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 0f500d7c704f..ef0bcf8f72e5 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -43,6 +43,7 @@ def tvm_val_2_py_val(val):
 
 @script
 def outer_product(n, m, a, b, c):
+    """This is a simple outer product"""
     for i in range(n):
         for j in range(m):
             c[i, j] = a[i] * b[j]

From bfed21f05a993eeca14fe127c2d0607595195228 Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Tue, 28 Aug 2018 14:02:24 +0900
Subject: [PATCH 063/529] [DOCS][NNVM] Delete duplicated tensor operators from
 list (#1669)

---
 docs/nnvm_top.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 927990647a69..663c85ac789e 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -136,11 +136,9 @@ This level enables typical convnet models.
    nnvm.symbol.collapse_sum
    nnvm.symbol.broadcast_equal
    nnvm.symbol.broadcast_greater_equal
-   nnvm.symbol.broadcast_greater_equal
    nnvm.symbol.broadcast_greater
    nnvm.symbol.broadcast_left_shift
    nnvm.symbol.broadcast_less_equal
-   nnvm.symbol.broadcast_less_equal
    nnvm.symbol.broadcast_less
    nnvm.symbol.broadcast_max
    nnvm.symbol.broadcast_min
@@ -252,11 +250,9 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.collapse_sum
 .. autofunction:: nnvm.symbol.broadcast_equal
 .. autofunction:: nnvm.symbol.broadcast_greater_equal
-.. autofunction:: nnvm.symbol.broadcast_greater_equal
 .. autofunction:: nnvm.symbol.broadcast_greater
 .. autofunction:: nnvm.symbol.broadcast_left_shift
 .. autofunction:: nnvm.symbol.broadcast_less_equal
-.. autofunction:: nnvm.symbol.broadcast_less_equal
 .. autofunction:: nnvm.symbol.broadcast_less
 .. autofunction:: nnvm.symbol.broadcast_max
 .. autofunction:: nnvm.symbol.broadcast_min

From cd73fc4166100923877cc12d73f20d8b3ad17d6c Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 28 Aug 2018 17:01:57 -0500
Subject: [PATCH 064/529] Fix incorrect stride in conv2d_nhwc_python (#1670)

---
 topi/python/topi/testing/conv2d_nhwc_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index 880088a6f89f..461d7a6a0e06 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -63,5 +63,5 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
                     apad = at[n, c]
                 out = scipy.signal.convolve2d(
                     apad, np.rot90(np.rot90(wt[f, c])), mode='valid')
-                bt[n, f] += out[::stride, ::stride]
+                bt[n, f] += out[::stride_h, ::stride_w]
     return bt.transpose((0, 2, 3, 1))

From ffbd44277973db66e456c04321b88f9b6567d30d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 28 Aug 2018 22:56:49 -0700
Subject: [PATCH 065/529] [AUTOTVM] Decouple build and run in measurement
 (#1661)

---
 docs/api/python/autotvm.rst                   |   5 +
 python/tvm/autotvm/__init__.py                |   3 +-
 python/tvm/autotvm/measure/__init__.py        |   8 +-
 python/tvm/autotvm/measure/local_executor.py  |  14 +-
 python/tvm/autotvm/measure/measure.py         | 251 +++--
 python/tvm/autotvm/measure/measure_methods.py | 896 ++++++++++--------
 python/tvm/autotvm/tuner/ga_tuner.py          |   2 +-
 .../tvm/autotvm/tuner/sa_model_optimizer.py   |   2 +-
 tests/python/integration/test_tuning.py       |  41 +-
 tests/python/unittest/test_autotvm_common.py  |  19 +
 .../python/unittest/test_autotvm_database.py  | 151 +--
 tests/python/unittest/test_autotvm_measure.py |  97 ++
 topi/recipe/gemm/gemm_int8.py                 |   7 +-
 tutorials/autotvm/tune_conv2d_cuda.py         |  12 +-
 tutorials/autotvm/tune_nnvm_arm.py            | 123 +--
 tutorials/autotvm/tune_simple_template.py     |   9 +-
 16 files changed, 880 insertions(+), 760 deletions(-)
 create mode 100644 tests/python/unittest/test_autotvm_measure.py

diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst
index f03406dbc720..93d6905077fb 100644
--- a/docs/api/python/autotvm.rst
+++ b/docs/api/python/autotvm.rst
@@ -16,6 +16,11 @@ tvm.autotvm.measure
 
 .. autofunction:: tvm.autotvm.measure.create_measure_batch
 
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalBuilder
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.RPCRunner
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalRunner
 
 tvm.autotvm.tuner
 ~~~~~~~~~~~~~~~~~
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 625b50c10853..7170dbdd8565 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -22,7 +22,8 @@
 from . import tophub
 
 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, \
+    LocalBuilder, LocalRunner, RPCRunner
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
     register_topi_compute, register_topi_schedule, \
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index 880dfd1ffe29..8a6126641a99 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -1,7 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""
 
-from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
-from .measure_methods import request_remote, check_remote, create_measure_batch, rpc
-
+from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option, \
+    create_measure_batch
+from .measure_methods import LocalBuilder, LocalRunner, RPCRunner, request_remote
+from .executor import Executor
 from .local_executor import LocalExecutor
-from .executor import Future, Executor
diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index 55f1dc75fc5c..63d995c3580c 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -37,7 +37,8 @@ def _execute_func(func, queue, args, kwargs):
         res = exc
     queue.put(res)
 
-def timeout_monitor(queue, timeout, func, args, kwargs):
+
+def call_with_timeout(queue, timeout, func, args, kwargs):
     """A wrapper to support timeout of a function call"""
 
     # start a new process for timeout (cannot use thread because we have c function)
@@ -45,17 +46,12 @@ def timeout_monitor(queue, timeout, func, args, kwargs):
     p.start()
     p.join(timeout=timeout)
 
-    alive = p.is_alive()
+    queue.put(executor.TimeoutError())
+
     kill_child_processes(p.pid)
     p.terminate()
     p.join()
 
-    if alive:
-        queue.put(executor.TimeoutError())
-    else:
-        if queue.empty():
-            queue.put(executor.ExecutionError("Fatal error in local executor"))
-
 
 class LocalFuture(executor.Future):
     """Local wrapper for the future
@@ -134,7 +130,7 @@ def submit(self, func, *args, **kwargs):
             return LocalFutureNoFork(func(*args, **kwargs))
 
         queue = Queue(2)
-        process = Process(target=timeout_monitor,
+        process = Process(target=call_with_timeout,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()
         return LocalFuture(process, queue)
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 2d780eeaf004..38b5f99eacb9 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -1,5 +1,6 @@
 # pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
 """User facing API for specifying how to measure the generated code"""
+import multiprocessing
 from collections import namedtuple
 
 class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
@@ -16,6 +17,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
         Specific configuration.
     """
 
+
 class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])):
     """
     Stores all the results of a measurement
@@ -23,8 +25,8 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
     Parameters
     ----------
     costs: Array of float or Array of Exception
-        If no error occurs for this measurement, it is an array of measured running times.
-        If some error occurs during the measurement, it is an array of the exception objections.
+        If no error occurs during measurement, it is an array of measured running times.
+        If an error occurs during measurement, it is an array of the exception objections.
     error_no: int
         Denote error type, defined by MeasureErrorNo
     all_cost: float
@@ -37,92 +39,185 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
 class MeasureErrorNo(object):
     """Error type for MeasureResult"""
     NO_ERROR = 0              # no error
-    INSTANTIATION_ERROR = 1   # error when calling template function
+    INSTANTIATION_ERROR = 1   # actively detected error in instantiating a template with a config
     COMPILE_HOST = 2          # error when compiling code on host (e.g. tvm.build)
-    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. opencl JIT on device)
+    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. OpenCL JIT on the device)
     RUNTIME_DEVICE = 4        # error when run program on device
     WRONG_ANSWER = 5          # answer is wrong when compared to a golden output
-    FLEET_ERROR = 6           # error of measure infrastructure
+    BUILD_TIMEOUT = 6         # timeout during compilation
+    RUN_TIMEOUT = 7           # timeout during run
+    UNKNOWN_ERROR = 8         # unknown error
+
 
+class Builder(object):
+    """Builder that builds programs in tuning
 
-def measure_option(measure_func,
-                   number=1,
-                   repeat=1,
-                   timeout=60,
-                   n_parallel=1,
-                   do_fork=True,
-                   build_func='default',
-                   check_correctness=False,
-                   replay_db=None):
-    """Configure how to do measurement
+    Parameters
+    ----------
+    timeout: float, optional
+        The timeout of a build task
+    n_parallel: int, optional
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=10, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.build_kwargs = {}
+        self.task = None
+
+    def set_task(self, task, build_kwargs=None):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        build_kwargs: dict, optional
+            The additional kwargs for build function
+        """
+        self.task = task
+        self.build_kwargs = build_kwargs
+
+    def build(self, measure_inputs):
+        """Build programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The measure input
+
+        Returns
+        -------
+        build_results: List of BuildResult
+            The build result.
+        """
+        raise NotImplementedError()
+
+
+class Runner(object):
+    """Runner that runs and measures the time cost of a generated program in tuning
 
     Parameters
     ----------
-    measure_func: str or callable
-        'local': use the local device for measurement. The tuner will start a tracker
-        and a RPC server silently for the user.
-
-        callable: It is a callable function for measurement.
-                  See the return value of measure/measure_methods.py::rpc for example.
-    number : int, optional
-        Number of times to do the measurement for average
-    repeat : int, optional
-        Number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    timeout: int, optional
-        Timeout for a whole batch. TimeoutError will be returned as the result if a
-        task timeouts.
+    timeout: float, optional
+        The timeout of a build task
     n_parallel: int, optional
-        The number of measurement task that can run in parallel.
-        Set this according to the number of cpu cores (for compilation) and
-        the number of devices you have (for measuring generate code).
-    do_fork: bool, optional
-        Whether use multiprocessing (based on fork) for running measure jobs in parallel.
-        Set this to False if you want to debug (see trackback) or using fork is not suitable.
-        NOTE: If this is False, parallel and timeout do not work.
-    build_func: str or callable, optional
-        'default': call default builder. This works for normal target (llvm, cuda)
-
-        'ndk': use Android NDK to create shared library. Use this for android target.
-
-        callable: customized build function for other backends (e.g. VTA).
-                  See measure/measure_methods.py::default_build_func for example.
-    check_correctness: bool, optional
-        Whether check correctness after measurement. This will use llvm cpu target to generate
-        reference output.
-    replay_db : Database, optional
-        The database that we retrieve saved MeasureResult from.
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=5, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.task = None
+
+    def set_task(self, task):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        """
+        self.task = task
+
+    def get_build_kwargs(self):
+        """
+        Get device specific build arguments (e.g. maximum shared memory size)
+
+        Returns
+        ----------
+        kwargs: dict
+            The additional keyword arguments
+        """
+        raise NotImplementedError()
+
+    def run(self, measure_inputs, build_results):
+        """Run amd measure built programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The raw measure input
+        build_results: List of BuildResults
+            The build results
+
+        Returns
+        -------
+        measure_results: List of MeasureResult
+            The final results of measurement
+        """
+        raise NotImplementedError()
+
+
+def measure_option(builder, runner):
+    """
+    Set options for measure. To measure a config, we will build it and run it.
+    So we have to set options for these two steps.
+    They have their own options on timeout, parallel, etc.
+
+    Parameters
+    ----------
+    builder: Builder
+        Specify how to build programs
+    runner: Runner
+        Specify how to run programs
+    """
+    from .measure_methods import LocalBuilder, LocalRunner
+
+    if isinstance(builder, str):
+        if builder == 'local':
+            builder = LocalBuilder()
+        else:
+            raise ValueError("Invalid builder: " + builder)
+
+    if isinstance(runner, str):
+        if runner == 'local':
+            runner = LocalRunner()
+        else:
+            raise ValueError("Invalid runner: " + runner)
+
+    opt = {
+        'builder': builder,
+        'runner': runner,
+    }
+
+    return opt
+
+
+def create_measure_batch(task, option):
+    """Get a standard measure_batch function.
+
+    Parameters
+    ----------
+    task: tvm.autotvm.task.Task
+        The tuning task
+    option: dict
+        The option for measuring generated code.
+        You should use the return value of function :any:`measure_option` for this argument.
 
     Returns
     -------
-    options: dict
-        A dict to store all options
-
-    Note
-    ----
-    To support customized measure, you can pass callable `measure_func` or
-    `build_func` in. The `measure_func` will call `build_func` to build binary library
-    and handle the logic of measurement.
-
-    Signature:
-    * measure_func (see the return value of measure/measure_methods.py::rpc for example)
-    def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        return measure_results
-
-    * build_func (see measure/measure_methods.py::default_build_func for example)
-    def build_func(inp, tmp_dir, **kwargs):
-        return func, args, filename
+    measure_batch: callable
+        a callback function to measure a batch of configs
     """
-    return {
-        'measure_func': measure_func,
-        'number': number,
-        'repeat': repeat,
-        'timeout': timeout,
-        'n_parallel': n_parallel,
-        'do_fork': do_fork,
-        'build_func': build_func,
-        'check_correctness': check_correctness,
-        'replay_db': replay_db,
-    }
+    builder = option['builder']
+    runner = option['runner']
+
+    attach_objects = runner.set_task(task)
+
+    # feed device related information from runner to builder
+    # (e.g. max shared memory for validity checking)
+    build_kwargs = runner.get_build_kwargs()
+    builder.set_task(task, build_kwargs)
+
+    def measure_batch(measure_inputs):
+        build_results = builder.build(measure_inputs)
+        results = runner.run(measure_inputs, build_results)
+        return results
+
+    measure_batch.n_parallel = builder.n_parallel
+    measure_batch.attach_objects = attach_objects
+    return measure_batch
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 2d740b9493b2..6a3cd028393c 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -1,129 +1,339 @@
-# pylint: disable=consider-using-enumerate,invalid-name,too-many-function-args
+# pylint: disable=invalid-name,too-many-function-args,too-many-nested-blocks
 """
 Functions that run on executor for measurement.
-These functions are responsible for building tvm module, uploading it to
-remote devices, recording the running time costs and checking the correctness of output
+
+These functions are responsible for building the tvm module, uploading it to
+remote devices, recording the running time costs, and checking the correctness of the output.
 """
 
 import logging
+import shutil
 import os
+import threading
 import time
 from random import getrandbits
-import threading
+from collections import namedtuple
+import tempfile
 
 import numpy as np
 
-from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \
-    target as _target, rpc as _rpc
-from ...contrib import nvcc, util, ndk
+from ... import ir_pass, build, build_config, nd, TVMError, register_func, \
+    rpc as _rpc, target as _target
+from ...contrib import nvcc, ndk
 
 from ..util import get_const_tuple
 from ..env import AutotvmGlobalScope
 from ..task.space import InstantiationError
 
-from .measure import MeasureResult, MeasureErrorNo
+from .measure import MeasureResult, MeasureErrorNo, Builder, Runner
 from .local_executor import LocalExecutor
 
 logger = logging.getLogger('autotvm')
 
-class HashMismatchError(ValueError):
-    """Raised when the code hash of a submitted config doesn't match that on the
-       measure side """
-    pass
+class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))):
+    """
+    Stores all the necessary inputs for a measurement.
 
+    Parameters
+    ----------
+    filename : str
+        The filename of generated library
+    arg_info : Tuple
+        The shape and dtype information of tvm tensor arguments
+    error : Exception
+        The error happens during compilation.
+    time_cost : float
+        The time cost of building
+    """
 
-def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
-    """request a remote session
+class LocalBuilder(Builder):
+    """Run compilation on local machine
 
     Parameters
     ----------
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this session (units: seconds)
-
-    Returns
-    ------
-    session: RPCSession
+    timeout: float
+        The timeout of a compilation
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    build_func: callable or str
+        If is 'default', use default build function
+        If is 'ndk', use function for android ndk
+        If is callable, use it as custom build function
     """
-    # connect to the tracker
-    if tracker_addr:
-        host = tracker_addr[0] or os.environ['TVM_TRACKER_HOST']
-        port = tracker_addr[1] or int(os.environ['TVM_TRACKER_PORT'])
-    else:
-        host = os.environ['TVM_TRACKER_HOST']
-        port = int(os.environ['TVM_TRACKER_PORT'])
+    def __init__(self, timeout=10, n_parallel=None, build_func='default'):
+        super(LocalBuilder, self).__init__(timeout, n_parallel)
+
+        if isinstance(build_func, str):
+            if build_func == 'default':
+                build_func = default_build_func
+            elif build_func == 'ndk':
+                build_func = android_ndk_build_func
+            else:
+                raise ValueError("Invalid build_func" + build_func)
 
-    tracker = _rpc.connect_tracker(host, port)
-    remote = tracker.request(device_key, priority=priority,
-                             session_timeout=timeout)
-    return remote
+        self.build_func = build_func
+        self.tmp_dir = tempfile.mkdtemp()
+        self.executor = LocalExecutor(timeout=timeout)
 
-def check_remote(target, device_key, tracker_addr=None, priority=2, timeout=10):
-    """
-    Check the availability of a remote device
+    def build(self, measure_inputs):
+        results = []
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for inp in measure_inputs[i:i + self.n_parallel]:
+                ret = self.executor.submit(self.build_func,
+                                           inp,
+                                           self.tmp_dir,
+                                           **self.build_kwargs)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+
+                if isinstance(res, Exception):
+                    # timeout or fleet error, return MeasureResult directly
+                    results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT,
+                                                 self.timeout, time.time()))
+                elif res.error is not None:
+                    # instantiation errorD
+                    if isinstance(res.error, InstantiationError):
+                        results.append(MeasureResult((res.error,),
+                                                     MeasureErrorNo.INSTANTIATION_ERROR,
+                                                     res.time_cost, time.time()))
+                    else:
+                        if "InstantiationError" in str(res.error):
+                            msg = str(res.error)
+                            try:
+                                msg = msg.split('\n')[-2].split(": ")[1]
+                            except Exception:  # pylint: disable=broad-except
+                                pass
+                            results.append(MeasureResult((InstantiationError(msg),),
+                                                         MeasureErrorNo.INSTANTIATION_ERROR,
+                                                         res.time_cost, time.time()))
+                        else:  # tvm error
+                            results.append(MeasureResult((res.error,),
+                                                         MeasureErrorNo.COMPILE_HOST,
+                                                         res.time_cost, time.time()))
+                else:
+                    # return BuildResult
+                    results.append(res)
+
+        return results
+
+    def __del__(self):
+        shutil.rmtree(self.tmp_dir)
+
+
+class RPCRunner(Runner):
+    """Run generated code on remove devices.
+    This function will ask a RPC Tracker to get device for measurement.
 
     Parameters
     ----------
-    target: Target
-        The wanted compilation target
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this check (units: seconds).
-        If time is out, a RuntimeError will be raised.
+    timeout: float
+        The timeout of a compilation
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    key: str
+        The key of the device registered in the tracker
+    host: str
+        The host address of RPC Tracker
+    port: int
+        The port of RPC Tracker
+    number : int, optional
+        Number of times to do measurement for tasking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
     """
-    def _check():
-        remote = request_remote(device_key, tracker_addr, priority)
-        remote.context(str(target))
-    t = threading.Thread(target=_check,)
-    t.start()
-    t.join(timeout)
-    return not t.is_alive()
+    def __init__(self,
+                 key, host, port, priority=1,
+                 timeout=10, n_parallel=None,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(RPCRunner, self).__init__(timeout, n_parallel)
+
+        self.key = key
+        self.host = host
+        self.port = port
+        self.priority = priority
+        self.timeout = timeout
+
+        self.number = number
+        self.repeat = repeat
+        self.min_repeat_ms = min_repeat_ms
+        self.cur_number = number
+
+        self.ref_input = None
+        self.ref_output = None
+        self.check_correctness = check_correctness
+        self.cooldown_interval = cooldown_interval
+
+        self.executor = LocalExecutor()
+
+    def set_task(self, task):
+        self.task = task
+        self.cur_number = self.number
+
+        if check_remote(task.target, self.key, self.host, self.port):
+            logger.info("Get devices for measurement successfully!")
+        else:
+            raise RuntimeError("Cannot get remote devices from the tracker. "
+                               "Please check the status of tracker by "
+                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
+                               "and make sure you have free devices on the queue status.")
 
-def create_measure_batch(task, option):
-    """Get a standard measure_batch function.
+        if self.check_correctness:
+            # use llvm cpu to generate a reference input/output
+            # this option works for tuning topi, but might not work for you custom op
+            with _target.create("llvm"):
+                s, arg_bufs = task.instantiate(task.config_space.get(0))
+            self.ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
+                              for x in arg_bufs]
+            func = build(s, arg_bufs, "llvm")
+            tvm_buf = [nd.array(x) for x in self.ref_input]
+            func(*tvm_buf)
+            self.ref_output = [x.asnumpy() for x in tvm_buf]
+
+    def get_build_kwargs(self):
+        kwargs = {}
+        if 'cuda' in self.task.target.keys or 'opencl' in self.task.target.keys:
+            remote = request_remote(self.key, self.host, self.port)
+            ctx = remote.context(str(self.task.target), 0)
+            max_dims = ctx.max_thread_dimensions
+            kwargs['check_gpu'] = {
+                'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
+                'max_threads_per_block': ctx.max_threads_per_block,
+                'max_thread_x': max_dims[0],
+                'max_thread_y': max_dims[1],
+                'max_thread_z': max_dims[2],
+            }
+
+            if 'cuda' in self.task.target.keys:
+                kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+
+        return kwargs
+
+    def run(self, measure_inputs, build_results):
+        results = []
+        remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for measure_inp, build_res in zip(measure_inputs[i:i+self.n_parallel],
+                                              build_results[i:i+self.n_parallel]):
+                ret = self.executor.submit(run_through_rpc,
+                                           measure_inp,
+                                           build_res,
+                                           self.cur_number,
+                                           self.repeat,
+                                           self.cooldown_interval,
+                                           remote_args,
+                                           self.ref_input,
+                                           self.ref_output)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+                if isinstance(res, Exception):   # executor error or timeout
+                    results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
+                                                 self.timeout, time.time()))
+                else:
+                    results.append(res)
+
+        # If some runs were too fast, do remeasure for them
+        # to meet the requirement of `min_repeat_ms`
+        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
+        pre_number = next_number = self.cur_number
+        min_repeat_duration = self.min_repeat_ms / 1000.0
+        for i, res in enumerate(results):
+            if res.error_no == MeasureErrorNo.NO_ERROR:
+                if np.mean(res.costs) * pre_number <= min_repeat_duration:
+                    next_number = max(next_number,
+                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
+                    remeasure[i] = True
+
+        if pre_number != next_number:
+            self.cur_number = next_number
+            msg = "increasing number to %d" % self.cur_number
+            logger.info(msg)
+
+            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
+            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
+            re_res = self.run(re_measure_inputs, re_build_results)
+            ct = 0
+            for i, rerun in enumerate(remeasure):
+                if rerun:
+                    results[i] = re_res[ct]
+                    ct += 1
+
+        return results
+
+class LocalRunner(RPCRunner):
+    """Run generated code on local devices.
 
     Parameters
     ----------
-    task: tvm.autotvm.task.Task
-        The tuning task
-    option: dict
-        The option for measuring generated code.
-        You should use the return value of function :any:`measure_option` for this argument.
-
-    Returns
-    -------
-    measure_batch: callable
-        a callback function to measure a batch of configs
+    timeout: float
+        The timeout of a compilation
+    number : int, optional
+        Number of times to do measurement for tasking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+        each of which is the average of `number` test run.
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
+
+    Note
+    ----
+    This is a "fake" local mode. We start a silent rpc tracker and rpc server
+    for the user. In this way we reuse timeout/isolation mechanism in RPC infrastructure.
     """
-    from ..database import filter_inputs
-
-    measure_func = option['measure_func']
-    number, repeat = option['number'], option['repeat']
-    timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork']
-    build_func = option['build_func']
-    check_correctness = option['check_correctness']
-    replay_db = option['replay_db']
+    def __init__(self,
+                 timeout=10,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(LocalRunner, self).__init__('', None, None, 0,
+                                          timeout=timeout, n_parallel=1,
+                                          number=number, repeat=repeat,
+                                          min_repeat_ms=min_repeat_ms,
+                                          cooldown_interval=cooldown_interval,
+                                          check_correctness=check_correctness)
+        self.tracker = None
+        self.server = None
+
+    def set_task(self, task):
+        self.task = task
 
-    executor = LocalExecutor(timeout=timeout, do_fork=do_fork)
-
-    # convert convenient string to function object
-    attach_objects = None
-    if measure_func == 'local':
-        # start temporary rpc tracker and rpc server for the user
         from ...rpc.tracker import Tracker
         from ...rpc.server import Server
 
@@ -133,360 +343,215 @@ def create_measure_batch(task, option):
                         key=device_key,
                         use_popen=True, silent=True,
                         tracker_addr=(tracker.host, tracker.port))
+        self.key = device_key
+        self.host = tracker.host
+        self.port = tracker.port
 
-        measure_func = rpc(device_key, tracker.host, tracker.port)
-        attach_objects = (server, tracker)
+        super(LocalRunner, self).set_task(task)
+        return server, tracker
 
-    build_kwargs = {}
-    if build_func == 'default':
-        build_func = default_build_func
-    if build_func == 'ndk':
-        build_func = default_build_func
-        build_kwargs['use_ndk'] = True
 
-    # check the availability of remote devices
-    if hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        if check_remote(task.target, rpc_info['key'], (rpc_info['host'], rpc_info['port'])):
-            logger.info("Get devices for measurement successfully!")
-        else:
-            raise RuntimeError("Cannot get remote devices from the tracker. "
-                               "Please check the status of tracker by "
-                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
-                               "and make sure you have free devices on the queue status.")
+def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None):
+    """Common part for building a configuration"""
+    target, task, config = measure_input
 
-    # add device info of cuda and opencl target
-    if ('cuda' in task.target.keys or 'opencl' in task.target.keys) \
-            and hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        add_gpu_target_info(task.target, rpc_info["key"], (rpc_info["host"], rpc_info["port"]),
-                            build_kwargs)
-
-    if check_correctness:
-        # use llvm cpu to generate a reference input/output
-        # this option works for tuning topi, but might not work for you custom op
-        with _target.create("llvm"):
-            s, arg_bufs = task.instantiate(task.config_space.get(0))
-        ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
-                     for x in arg_bufs]
-        func = build(s, arg_bufs, "llvm")
-        tvm_buf = [nd.array(x) for x in ref_input]
-        func(*tvm_buf)
-        ref_output = [x.asnumpy() for x in tvm_buf]
-    else:
-        ref_input = ref_output = None
-
-    def measure_batch(measure_inputs):
-        """measure the time cost for a batch of configs in real machines"""
-        if replay_db is not None:
-            partial_results, measure_inputs = \
-                filter_inputs(replay_db, measure_inputs, retry=False)
-
-        # launch measure jobs in parallel
-        pack_size = getattr(measure_func, "pack_size", 1)  # measure `pack_size` inputs in one job
-        futures = []
-        for i in range(0, len(measure_inputs), pack_size):
-            input_pack = measure_inputs[i:i + pack_size]
-            ret = executor.submit(
-                measure_func,
-                input_pack,
-                build_func,
-                build_kwargs,
-                number,
-                repeat,
-                ref_input,
-                ref_output)
-            futures.append(ret)
-
-        # transform results
-        results = []
-        for future in futures:
-            result = future.get()
-            if isinstance(result, Exception):
-                tstamp = time.time()
-                results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
-                                              timeout, tstamp)] * pack_size)
-            else:
-                results.extend(result)
-
-        if replay_db is not None:
-            result_idx = 0
-            for i in range(len(partial_results)):
-                if partial_results[i] is None:
-                    partial_results[i] = results[result_idx]
-                    result_idx += 1
-            return partial_results
-        return results
+    with target:
+        s, args = task.instantiate(config)
+
+        # check invalidity of template and code hash consistency
+        if not config.valid():
+            raise InstantiationError(config.errors)
+
+        opts = build_option or {}
+        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
+            opts["add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]
+        if cuda_arch:
+            set_cuda_target_arch(cuda_arch)
 
-    measure_batch.n_parallel = n_parallel
-    # attach server and tracker object to avoid them of being garbage-collected
-    measure_batch.attach_objects = attach_objects
-    return measure_batch
+        with build_config(**opts):
+            func = build(s, args, target_host=task.target_host)
+    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
 
 
-def rpc(key,
-        host=None,
-        port=None,
-        priority=1,
-        session_timeout=60,
-        pack_size=1):
+def default_build_func(measure_input, tmp_dir, **kwargs):
     """
-    Create a standard measure_func which uses RPC Tracker for measurement.
-    This measure_func will request a device from the RPC Tracker and
-    upload the built binary library to that device for measurement.
+    Default build func. This can work for cuda, opencl, llvm backend
 
     Parameters
     ----------
-    key: str
-        The registered key of the device in tracker. The tuner will request devices for
-        measurement by this key.
-    host: str, optional
-        The hostname of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_HOST"
-    port: int, optional
-        The port of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_PORT"
-    priority: int, optional
-        Priority of this task, used by scheduler in tracker
-    session_timeout: int, optional
-        Timeout of rpc session
-    pack_size: int, optional
-        The number of configs measure in one RPC session.
-        Usually this can be set to 1. If your device has high overhead to establish a
-        rpc connection, set this higher.
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
+    """
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
+    """
+    Build function for android device using ndk.
+
+    Parameters
+    ----------
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
     """
-    def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        """Do measurement for a list of inputs inside a same RPC session.
-
-        Parameters
-        ----------
-        input_pack: List of MeasureInput
-            The inputs of measurement
-        build_func: callable
-            Function for building the code. see :any:`default_build_func` for example
-        build_kwargs: dict
-            Extra arguments for build_func
-        number : int, optional
-            Number of times to do the measurement for average
-        repeat : int, optional
-            Number of times to repeat the measurement.
-            In total, the generated code will be run (1 + number x repeat) times,
-            where the first one is warm up. The returned result contains `repeat` costs,
-            each of which is the average of `number` test run.
-        ref_input: List of numpy array
-            Reference input for correctness check
-        ref_output: List of numpy array
-            Reference output for correctness check
-
-        Returns
-        -------
-        results: List of MeasureResult
-            The results for input_pack
-        """
-        remote_args = (key, (host, port), priority, session_timeout)
-
-        res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                              ref_input, ref_output,
-                              remote_args)
-        return res
-
-    fmeasure.pack_size = pack_size
-    fmeasure.rpc_info = {"key": key, "host": host, "port": port}
-    return fmeasure
-
-
-def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                    ref_input=None, ref_output=None, remote_args=None):
-    """Measure the time cost for a pack of inputs.
-
-    (Note: A pack is a list of inputs which will be measured inside a same RPC session)
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.so" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename, ndk.create_shared)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def run_through_rpc(measure_input, build_result,
+                    number, repeat, cooldown_interval,
+                    remote_args, ref_input=None, ref_output=None):
+    """Run a generated library through rpc
 
     Parameters
     ----------
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    build_func : function takes MeasureInput returns tuple of (time_func, ctx, args)
-        The build function used to build each input.
-    build_kwargs: Dict
-        The extra keyword arguments to build_func
+    measure_input: MeasureInput
+        The raw measure input
+    build_result: BuildResult
+        The result returned from Builder. This contains the path to the generated library.
     number : int, optional
-        Number of times to do the measurement for average
+        Number of times to do measurement for tasking average
     repeat : int, optional
         Number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
         where the first one is warm up. The returned result contains `repeat` costs,
         each of which is the average of `number` test run.
-    ref_input: Array of np.ndarray, optional
-        Reference input for checking correctness
-    ref_output: Array of np.ndarray, optional
-        Reference output for checking correctness
-    remote_args: Tuple, optional
-        The arguments to request_remote. If is not None, will use remote rpc devices.
-
-    Returns
-    -------
-    res_pack : Array of MeasureResult
-        The list of results of measurement.
+    cooldown_interval: float
+        The cool down interval between two measurements
+    remote_args: Tuple
+        The argument for request_remote
+    ref_input: List of np.ndarray
+        The reference input used for checking correctness
+    ref_output: List of np.ndarray
+        The reference output used for checking correctness
     """
-    res_pack = []
-    tmp_dir = util.tempdir() if remote_args else None
-    assert len(input_pack) == 1, "Only supports input_pack == 1 for now"
-
-    for inp in input_pack:
-        tic = time.time()
-
-        # build function
-        try:
-            func, arg_bufs, filename = build_func(inp, tmp_dir, **build_kwargs)
-        except TVMError as exc:
-            tstamp = time.time()
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            if "InstantiationError" in msg:
-                try:
-                    msg = msg.split('\n')[-2].split(": ")[1]
-                except Exception:  # pylint: disable=broad-except
-                    pass
-                res_pack.append(MeasureResult((InstantiationError(msg),),
-                                              MeasureErrorNo.INSTANTIATION_ERROR,
-                                              tstamp - tic, tstamp))
-            else:
-                res_pack.append(MeasureResult((RuntimeError(msg),),
-                                              MeasureErrorNo.COMPILE_HOST,
-                                              tstamp - tic, tstamp))
-            continue
-        except InstantiationError as e:
-            tstamp = time.time()
-            res_pack.append(MeasureResult((InstantiationError(str(e)),),
-                                          MeasureErrorNo.INSTANTIATION_ERROR,
-                                          tstamp - tic, tstamp))
-            continue
-
-        # measure time
-        errno = MeasureErrorNo.NO_ERROR
-        try:
-            # upload built module
-            if remote_args:
-                remote = request_remote(*remote_args)
-                remote.upload(tmp_dir.relpath(filename))
-                func = remote.load_module(filename)
-                ctx = remote.context(str(inp.target), 0)
-                time_f = func.time_evaluator(
-                    func.entry_name, ctx, number=number, repeat=repeat)
-            else:
-                ctx = context(str(inp.target), 0)
-                time_f = func.time_evaluator(
-                    func.entry_name, ctx, number=number, repeat=repeat)
-
-            # set input
-            if ref_input:
-                args = [nd.array(x, ctx=ctx) for x in ref_input]
-            else:
-                args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
-                        for x in arg_bufs]
-
-            costs = time_f(*args).results
-            if len(costs) > 2:  # remove largest and smallest value to reduce variance
-                costs = list(costs)
-                costs.sort()
-                costs = tuple(costs[1:-1])
-
-            # check correctness of output
-            if ref_output:
-                for expected, real in zip(ref_output, args):
-                    if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
-                        logger.warning("Wrong Answer!")
-                        errno = MeasureErrorNo.WRONG_ANSWER
-        except TVMError as exc:
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            if "CUDA Source" in msg:
-                msg = msg[:msg.index("CUDA Source")]
-            costs = (RuntimeError(msg),)
-            errno = MeasureErrorNo.RUNTIME_DEVICE
-        tstamp = time.time()
-        res_pack.append(MeasureResult(costs, errno, tstamp - tic, tstamp))
-    return res_pack
-
-
-def default_build_func(inp, tmp_dir=None, **kwargs):
-    """Build function module. Exception will be raised when any error occurs
+    if isinstance(build_result, MeasureResult):
+        return build_result
+
+    tic = time.time()
+    errno = MeasureErrorNo.NO_ERROR
+    try:
+        # upload built module
+        remote = request_remote(*remote_args)
+        remote.upload(build_result.filename)
+        func = remote.load_module(os.path.split(build_result.filename)[1])
+        ctx = remote.context(str(measure_input.target), 0)
+        time_f = func.time_evaluator(
+            func.entry_name, ctx, number=number, repeat=repeat)
+
+        # set input
+        if ref_input:
+            args = [nd.array(x, ctx=ctx) for x in ref_input]
+        else:
+            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
+
+        costs = time_f(*args).results
+        if len(costs) > 2:  # remove largest and smallest value to reduce variance
+            costs = list(costs)
+            costs.sort()
+            costs = tuple(costs[1:-1])
+
+        # check correctness of output
+        if ref_output:
+            for expected, real in zip(ref_output, args):
+                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
+                    logger.warning("Wrong Answer!")
+                    errno = MeasureErrorNo.WRONG_ANSWER
+    except TVMError as exc:
+        msg = str(exc)
+        if "Stack trace returned" in msg:
+            msg = msg[:msg.index("Stack trace returned")]
+        if "CUDA Source" in msg:
+            msg = msg[:msg.index("CUDA Source")]
+        costs = (RuntimeError(msg[:1024]),)
+        errno = MeasureErrorNo.RUNTIME_DEVICE
+    tstamp = time.time()
+    time.sleep(cooldown_interval)
+    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
+
+
+def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
+    """Request a remote session
 
     Parameters
     ----------
-    inp: MeasureInput
-       The input of this measurement
-    tmp_dir: tvm.contrib.util.TempDirectory, optional
-       The temporary directory for exporting built binary library.
-       If is not None (in RPC mode), the library in this directory will be uploaded to
-       remote devices.
-    kwargs: Dict, optional
-        Other extra arguments
+    device_key: string
+        The device key of registered device in tracker
+    host: host, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger is more prior
+    timeout: float, optional
+        The timeout of this session (units: second)
 
     Returns
-    -------
-    func: Function
-        TVM built function. Typically this is the return value of tvm.build.
-    args: Array of Buffer or Tensor
-        The argument list for the function. Typically this is the second argument of tvm.build.
-    filename: str
-        The filename of the output build library
+    ------
+    session: RPCSession
     """
-    # build function
-    with inp.target:
-        s, args = inp.task.instantiate(inp.config)
+    # connect to the tracker
+    host = host or os.environ['TVM_TRACKER_HOST']
+    port = port or int(os.environ['TVM_TRACKER_PORT'])
 
-        # check invalidity of template and code hash consistency
-        if not inp.config.valid():
-            raise InstantiationError(inp.config.errors)
-        code_hash = getattr(s, 'code_hash', None)
-        if inp.config.code_hash != code_hash:
-            raise HashMismatchError('got {0:s}, expected {1:s}'
-                                    .format(str(inp.config.code_hash), str(code_hash)))
-
-        opts = {}
-        if "check_gpu" in kwargs:  # Add verify pass to filter out invalid configs in advance.
-            opts["add_lower_pass"] = [(2, gpu_verify_pass(**kwargs['check_gpu']))]
-        if 'cuda_arch' in kwargs:
-            set_cuda_target_arch(kwargs['cuda_arch'])
+    tracker = _rpc.connect_tracker(host, port)
+    remote = tracker.request(device_key, priority=priority,
+                             session_timeout=timeout)
+    return remote
 
-        with build_config(**opts):
-            func = build(s, args, target_host=inp.task.target_host)
 
-    # export library to temp directory
-    if tmp_dir:
-        if kwargs.get('use_ndk', False):  # for Android NDK
-            filename = "tmp_func_%0x.so" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
-        else:
-            filename = "tmp_func_%0x.tar" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename))
-    else:
-        filename = None
-
-    return func, args, filename
-
-
-def add_gpu_target_info(target, device_key, rpc_tracker_addr, kwargs):
-    """Add device info for gpu target.
-    The info will be used to check the validity of generated code."""
-    remote = request_remote(device_key, rpc_tracker_addr)
-    ctx = remote.context(str(target), 0)
-    max_dims = ctx.max_thread_dimensions
-    kwargs['check_gpu'] = {
-        'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
-        'max_threads_per_block': ctx.max_threads_per_block,
-        'max_thread_x': max_dims[0],
-        'max_thread_y': max_dims[1],
-        'max_thread_z': max_dims[2],
-    }
-
-    if 'cuda' in target.keys:
-        kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+def check_remote(target, device_key, host=None, port=None, priority=2, timeout=10):
+    """
+    Check the availability of a remote device
 
-def set_cuda_target_arch(arch):
-    """set target architecture of nvcc compiler"""
-    AutotvmGlobalScope.current.cuda_target_arch = arch
+    Parameters
+    ----------
+    target: Target
+        The wanted compilation target
+    device_key: string
+        device key of registered device in tracker
+    host: host, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger is more prior
+    timeout: float, optional
+        The timeout of this check (units: seconds).
+
+    Returns
+    -------
+    available: bool
+        True if can find available device
+    """
+    def _check():
+        remote = request_remote(device_key, host, port, priority)
+        remote.context(str(target))
+    t = threading.Thread(target=_check,)
+    t.start()
+    t.join(timeout)
+    return not t.is_alive()
 
 
 @register_func
@@ -496,6 +561,17 @@ def tvm_callback_cuda_compile(code):
     return ptx
 
 
+def set_cuda_target_arch(arch):
+    """set target architecture of nvcc compiler
+
+    Parameters
+    ----------
+    arch: str
+        The argument of nvcc -arch. (e.g. "sm_51", "sm_62")
+    """
+    AutotvmGlobalScope.current.cuda_target_arch = arch
+
+
 def gpu_verify_pass(**kwargs):
     """Verify the validity of a gpu kernel.
     This pass will check memory usage and number of threads per block.
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index b9d900e49577..1afaca73ebb6 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -22,7 +22,7 @@ class GATuner(Tuner):
     mutation_prob: float
         probability of mutation of a knob in a gene
     """
-    def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
+    def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
         super(GATuner, self).__init__(task)
 
         # algorithm configurations
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 1947c6dde4e0..77c7e919593b 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -87,7 +87,7 @@ def find_maximums(self, model, num, exclusive):
 
             new_scores = model.predict(new_points)
 
-            ac_prob = np.exp((new_scores - scores) / (t + 1e-2))
+            ac_prob = np.exp(np.minimum((new_scores - scores) / (t + 1e-5), 1))
             ac_index = np.random.random(len(ac_prob)) < ac_prob
 
             points[ac_index] = new_points[ac_index]
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 87da86a4654f..8e1b458a6d2f 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -103,34 +103,7 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None):
                                target=target, target_host=target_host)
     return task, target
 
-
-def test_task_tuner_without_measurement():
-    """test task and tuner without measurement"""
-    task, target = get_sample_task()
-
-    def custom_measure(input_pack, build_func, build_args, number, repeat,
-                       ref_input, ref_output):
-        from tvm.autotvm import MeasureResult
-
-        results = []
-        for inp in input_pack:
-            tic = time.time()
-            # do nothing
-            time.sleep(0.001)
-            results.append(MeasureResult([time.time() - tic], 0,
-                                         time.time() - tic, time.time()))
-        return results
-    measure_option = autotvm.measure_option(custom_measure)
-
-    logging.info("%s", task.config_space)
-
-    # new tuner and recorder
-    for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
-        tuner = tuner_class(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
-        assert tuner.best_flops > 1
-
-def test_tuning_with_measure():
+def test_tuning():
     def check(target, target_host):
         ctx = tvm.context(target, 0)
         if not ctx.exist:
@@ -141,12 +114,12 @@ def check(target, target_host):
         task, target = get_sample_task(target, target_host)
         logging.info("%s", task.config_space)
 
-        measure_option = autotvm.measure_option('local',
-                                                timeout=4,
-                                                number=2)
+        measure_option = autotvm.measure_option(
+            autotvm.LocalBuilder(),
+            autotvm.LocalRunner())
 
         tuner = RandomTuner(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
+        tuner.tune(n_trial=20, measure_option=measure_option)
 
     check("cuda", None)
     check("opencl", None)
@@ -155,6 +128,4 @@ def check(target, target_host):
     # only print log when invoked from main
     logging.basicConfig(level=logging.DEBUG)
 
-    test_task_tuner_without_measurement()
-    test_tuning_with_measure()
-
+    test_tuning()
diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py
index 3a6883f69489..ed39c3846c8c 100644
--- a/tests/python/unittest/test_autotvm_common.py
+++ b/tests/python/unittest/test_autotvm_common.py
@@ -32,6 +32,25 @@ def matmul(N, L, M, dtype):
 
     return s, [A, B, C]
 
+@autotvm.template
+def bad_matmul(N, L, M, dtype):
+    if 'bad_device' in tvm.target.current_target().keys:
+        A = tvm.placeholder((N, L), name='A', dtype=dtype)
+        B = tvm.placeholder((L, M), name='B', dtype=dtype)
+
+        k = tvm.reduce_axis((0, L-1), name='k')
+        C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
+        s = tvm.create_schedule(C.op)
+
+        # schedule
+        y, x = s[C].op.axis
+        cfg = autotvm.get_config()
+        cfg.define_split("tile_y", y, num_outputs=2)
+        cfg.define_split("tile_x", x, num_outputs=2)
+        return s, [A, B, C]
+
+    return matmul(N, L, M, dtype)
+
 def get_sample_task(n=128):
     """return a sample task for testing"""
     target = tvm.target.create("llvm")
diff --git a/tests/python/unittest/test_autotvm_database.py b/tests/python/unittest/test_autotvm_database.py
index af4704d95e51..aa956f61bbcf 100644
--- a/tests/python/unittest/test_autotvm_database.py
+++ b/tests/python/unittest/test_autotvm_database.py
@@ -1,17 +1,11 @@
 """Test database"""
 import copy
 import logging
-import time
 
-import numpy as np
-import tvm
-
-from tvm import autotvm
 from tvm.autotvm import database
-from tvm.autotvm.measure.measure_methods import HashMismatchError
-from tvm.autotvm.record import encode, MeasureInput, MeasureResult
+from tvm.autotvm.record import encode, MeasureResult
 
-from test_autotvm_common import get_sample_task, get_sample_records
+from test_autotvm_common import get_sample_records
 
 def test_save_load():
     logging.info("test basic db load/save ...")
@@ -35,66 +29,6 @@ def test_save_load():
 
 TRIAL_LIMIT = 2
 
-def test_db_filter():
-    logging.info("test db filter ...")
-
-    # Pick a GPU target because there are more likely to be failures/invalid configs
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    batch_size = 2
-
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-
-    del measure_batch
-
-    db = database.DummyDatabase()
-    db.flush()
-
-    # First setting, memoize one input at a time, check that each is saved and replayed
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    for i in range(len(all_inputs)+1):
-        db.flush()
-        for j in range(i):
-            db.save(all_inputs[j], all_results[j])
-
-        for k in range(len(batches)):
-            batch = batches[k]
-            batch_result = measure_batch(batch)
-            for l in range(batch_size):
-                all_idx = k*batch_size + l
-                assert batch_result[l] is not None
-                if all_idx < i:
-                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
-                else:
-                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MISMATCH, GOT MATCH"
-
-    del measure_batch
-
 def test_db_hash():
     logging.info("test db hash check ...")
     inp1, res1 = get_sample_records(1)[0]
@@ -149,89 +83,8 @@ def test_db_latest_all():
     assert encode(inp1, load4[1]) == encode(inp1, res2)
     assert encode(inp1, load4[2]) == encode(inp1, res3)
 
-def test_db_save_replay():
-    logging.info("test db save (from measure_batch) and replay ...")
-    _db = database.DummyDatabase()
-    _db.flush()
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local',
-                                            do_fork=False,
-                                            timeout=2,
-                                            replay_db=_db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    batch_size = 2
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-    callback = autotvm.callback.log_to_database(_db)
-    callback(None, all_inputs, all_results)
-
-    assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
-        "%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)
-
-    all_results_2 = measure_batch(all_inputs)
-    all_results_3 = measure_batch(all_inputs)
-
-    for i in range(len(all_results)):
-        encr1 = encode(all_inputs[i], all_results[i])
-        encr2 = encode(all_inputs[i], all_results_2[i])
-        encr3 = encode(all_inputs[i], all_results_3[i])
-        assert encr1 == encr2, "EXPECTED MATCH WITH SAVE REPLAY (first replay), got MISMATCH"
-        assert encr2 == encr3, "EXPECTED MATCH WITH SAVE REPLAY (second replay), got MISMATCH"
-
-    del measure_batch
-
-def test_check_hashmismatch():
-    logging.info("test hash mismatch check")
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local', do_fork=False)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    inputs = list()
-    cfg = task.config_space.get(np.random.randint(len(task.config_space)))
-    # notvalidh is not a valid CRC32 hash (not hex)
-    cfg.code_hash = 'notvalidh'
-    inputs.append((MeasureInput(target, task, cfg)))
-
-    try:
-        results = measure_batch(inputs)
-        assert False, "HashMismatchError should be raised"
-    except HashMismatchError:
-        pass
-
-    del measure_batch
-
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     test_save_load()
-    test_db_filter()
     test_db_hash()
     test_db_latest_all()
-    test_db_save_replay()
-    test_check_hashmismatch()
diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py
new file mode 100644
index 000000000000..e29cc2c51658
--- /dev/null
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -0,0 +1,97 @@
+"""Test builder and runner"""
+import logging
+import time
+
+import numpy as np
+
+import tvm
+from tvm import autotvm
+from test_autotvm_common import get_sample_task, bad_matmul
+from tvm.autotvm.measure.measure import Runner, MeasureResult, MeasureErrorNo
+
+def test_task_tuner_without_measurement():
+    """test task and tuner without measurement"""
+    task, target = get_sample_task()
+
+    class DummyRunner(Runner):
+        def __init__(self):
+            super(DummyRunner, self).__init__(1, 1)
+
+        def run(self, measure_inputs, build_results):
+            return [MeasureResult((np.random.random(),), 0, 0.2, time.time())
+                    for _ in range(len(measure_inputs))]
+
+        def get_build_kwargs(self):
+            return {}
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=DummyRunner()
+    )
+
+    logging.info("%s", task.config_space)
+
+    for tuner_class in [autotvm.tuner.RandomTuner,
+                        autotvm.tuner.GridSearchTuner,
+                        autotvm.tuner.GATuner,
+                        autotvm.tuner.XGBTuner]:
+        tuner = tuner_class(task)
+        tuner.tune(n_trial=10, measure_option=measure_option)
+        assert tuner.best_flops > 1
+
+def test_check_correctness():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(check_correctness=True)
+    )
+
+    def _callback_correct(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == 0
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_correct])
+
+    # a bad template
+    n = 128
+    target = tvm.target.create("llvm -device=bad_device")
+    task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target)
+
+    def _callback_wrong(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == MeasureErrorNo.WRONG_ANSWER
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_wrong])
+
+
+def test_min_repeat_ms():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
+    )
+
+    def _callback(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            if res.error_no != 0:
+                continue
+
+            assert 1000 * np.mean(res.costs) * \
+                   measure_option['runner'].cur_number >= 100
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=5, measure_option=measure_option,
+               callbacks=[_callback])
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+
+    test_task_tuner_without_measurement()
+    test_check_correctness()
+    test_min_repeat_ms()
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index 61ef97d0a2bf..4cce2735c4a2 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -137,12 +137,15 @@ def block_size_filter(entity):
     print(task.config_space)
 
     measure_option = autotvm.measure_option(
-        measure_func='local', number=10, n_parallel=8, timeout=20)
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+    )
+
     log_name = 'gemm_int8.log'
     if DO_TUNING:
         tuner = autotvm.tuner.XGBTuner(task)
         tuner.tune(n_trial=1000, measure_option=measure_option,
-               callbacks=[autotvm.callback.log_to_file(log_name)])
+                   callbacks=[autotvm.callback.log_to_file(log_name)])
 
         dispatch_context = autotvm.apply_history_best(log_name)
         best_config = dispatch_context.query(task.target, task.workload)
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 3ff26a05064d..3cd63d03dfd9 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -164,12 +164,12 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
                            target='cuda')
 print(task.config_space)
 
-# use local gpu, measure 5 times for every config to reduce variance
-# run 8 parallel threads for compilation
-measure_option = autotvm.measure_option('local',
-                                        number=5,
-                                        n_parallel=8,
-                                        timeout=20)
+# use local gpu, measure 10 times for every config to reduce variance
+# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+)
 
 # begin tuning, log records to file `conv2d.log`
 tuner = autotvm.tuner.XGBTuner(task)
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index a080681f4ca3..8ab7bb2f176c 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -65,15 +65,20 @@ def get_network(name, batch_size):
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
-    if name =='resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
-    elif name =='mobilenet':
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
         net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name =='squeezenet v1.1':
+    elif name == 'squeezenet_v1.1':
         net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
-    elif name =='vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
-    elif name =='custom':
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
         # an example for custom network
         from nnvm.testing import utils
         net = nnvm.sym.Variable('data')
@@ -92,6 +97,7 @@ def get_network(name, batch_size):
 
     return net, params, input_shape, output_shape
 
+
 #################################################################
 # Start RPC Tracker
 # -----------------
@@ -158,6 +164,8 @@ def get_network(name, batch_size):
 #    rk3399       2      2     0
 #    rpi3b        11     11    0
 #    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
 
 ###########################################
 # Set Tuning Options
@@ -184,34 +192,30 @@ def get_network(name, batch_size):
 dtype = 'float32'
 
 tuning_option = {
-   'log_filename': log_file,
-
-   'tuner': 'xgb',
-   'n_trial': 1000,
-   'early_stopping': 250,
-
-   'measure_option': autotvm.measure_option(
-       autotvm.measure.rpc(device_key, host='localhost', port=9190),
-       number=4,
-       n_parallel=1,
-       timeout=10,
-       build_func='ndk' if use_android else 'default',
-   ),
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 400,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=5,
+            timeout=4,
+        ),
+    ),
 }
 
 ####################################################################
 #
 # .. note:: How to set tuning options
 #
-#   In general, the default value provided here works well. It is the same
-#   value that we used to generate pre-tuned parameters.
-#   If you have multiple devices, you can set :code:`n_parallel` to
-#   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
-#   boards to the tracker).
+#   In general, the default value provided here works well.
 #   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
 #   which makes the tuning run longer.
-#   If your device is very slow or a single conv2d operator in your network has large FLOPs,
-#   consider setting timeout larger.
 #
 
 ###################################################################
@@ -219,7 +223,7 @@ def get_network(name, batch_size):
 # ------------
 # Now we can extract tuning tasks from the network and begin tuning.
 # Here we provide a simple utility function to tune a list of tasks.
-# This function is just an initial implementation which tune them in sequential order.
+# This function is just an initial implementation which tunes them in sequential order.
 # Later we will bring more sophisticated tuner scheduler.
 
 # You can skip the implementation of this function for this tutorial.
@@ -236,7 +240,9 @@ def tune_tasks(tasks,
             try:  # try winograd template
                 tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                           tasks[i].target, tasks[i].target_host, 'winograd')
-                tasks.append(tsk)
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
             except Exception:
                 pass
 
@@ -245,8 +251,8 @@ def tune_tasks(tasks,
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
-    for i, tsk in enumerate(tasks):
-        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # create tuner
         if tuner == 'xgb' or tuner == 'xgb-rank':
@@ -280,7 +286,7 @@ def tune_tasks(tasks,
 ########################################################################
 # Finally we launch tuning jobs and evaluate the end-to-end performance.
 
-def tune_and_evaluate():
+def tune_and_evaluate(tuning_opt):
     # extract workloads from nnvm graph
     print("Extract tasks...")
     net, params, input_shape, out_shape = get_network(network, batch_size=1)
@@ -290,19 +296,18 @@ def tune_and_evaluate():
 
     # run tuning tasks
     print("Tuning...")
-    tune_tasks(tasks, **tuning_option)
+    tune_tasks(tasks, **tuning_opt)
 
     # compile kernels with history best records
     with autotvm.apply_history_best(log_file):
         print("Compile...")
         with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
             graph, lib, params = nnvm.compiler.build(
-                net, target=target,
-                shape={'data': input_shape}, params=params, dtype=dtype)
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
 
         # export library
         tmp = tempdir()
-        if tuning_option['measure_option']['build_func'] == 'ndk': # for android
+        if use_android:
             from tvm.contrib import ndk
             filename = "net.so"
             lib.export_library(tmp.relpath(filename), ndk.create_shared)
@@ -312,8 +317,7 @@ def tune_and_evaluate():
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key,
-                                                tracker_addr=('localhost', 9190),
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                 timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
@@ -328,47 +332,44 @@ def tune_and_evaluate():
 
         # evaluate
         print("Evaluate inference time cost...")
-        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
-        prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
+        ftimer = module.module.time_evaluator("run", ctx, number=8, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))
 
 # We do not run the tuning in our webpage server since it takes too long.
 # Uncomment the following line to run by yourself.
-# tune_and_evaluate()
+
+# tune_and_evaluate(tuning_option)
 
 ######################################################################
 # Sample Output
 # -------------
-# The tuning needs to train xgboost models and use them for prediction.
+# The tuning needs to compile many programs and extract feature from them.
 # So a high performance CPU is recommended.
-# It takes about 2 hours on a 32T AMD Ryzen CPU.
-# One sample output is
+# One sample output is listed below.
+# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
 #
 # .. code-block:: bash
 #
 #    Extract tasks...
 #    Tuning...
-#    [Task  1/16]  Current/Best:   18.85/  19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
-#    [Task  2/16]  Current/Best:   16.10/  23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
-#    [Task  3/16]  Current/Best:    5.49/  13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
-#    [Task  4/16]  Current/Best:   10.07/  20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
-#    [Task  5/16]  Current/Best:   11.50/  15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
-#    [Task  6/16]  Current/Best:   10.76/  23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
-#    [Task  7/16]  Current/Best:   12.71/  22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
-#    [Task  8/16]  Current/Best:    8.60/  17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
-#    [Task  9/16]  Current/Best:   15.37/  23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
-#    [Task 10/16]  Current/Best:    6.62/  23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
-#    [Task 11/16]  Current/Best:    1.85/  21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
-#    [Task 12/16]  Current/Best:   15.41/  24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
-#    [Task 13/16]  Current/Best:   17.96/  25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
-#    [Task 14/16]  Current/Best:   14.81/  31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
-#    [Task 15/16]  Current/Best:   24.39/  40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
-#    [Task 16/16]  Current/Best:    9.42/  49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
+#    [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+#    [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+#    [Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+#    [Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+#    [Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+#    [Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+#    [Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+#    [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#    [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#    [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#    [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#    [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
 #    Compile...
 #    Upload...
 #    Evaluate inference time cost...
-#    Mean inference time (std dev): 157.29 ms (1.74 ms)
+#    Mean inference time (std dev): 162.59 ms (0.06 ms)
 
 ######################################################################
 #
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index 8d4aab0b2c27..5b3ddaaf644a 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -271,9 +271,12 @@ def matmul(N, L, M, dtype):
 logging.getLogger('autotvm').setLevel(logging.DEBUG)
 logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
 
-# use local cpu, measure 5 times for every config to reduce variance
-measure_option = autotvm.measure_option('local',
-                                        number=5)
+# There are two steps for measuring a config: build and run.
+# By default, we use all cpu cores to compile program. Then measure them sequentially.
+# We measure 5 times and take average to reduce variance.
+measure_option = autotvm.measure_option(
+    builder='local',
+    runner=autotvm.LocalRunner(number=5))
 
 # begin tuning, log records to file `matmul.log`
 tuner = autotvm.tuner.RandomTuner(task)

From 28f418c33208cd822ac9b1f58187185ddbb9931a Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 30 Aug 2018 02:29:53 +0900
Subject: [PATCH 066/529] [TOPI] add nn schedulers for HLS backends (#1663)

* [TOPI] add nn schedulers for HLS backends

* fix pylint

* fix topi transform test
---
 nnvm/python/nnvm/testing/config.py       |   4 +-
 src/codegen/codegen_aocl.cc              |   2 +
 topi/python/topi/hls/__init__.py         |   1 +
 topi/python/topi/hls/nn.py               | 407 +++++++++++++++++++++++
 topi/tests/python/common.py              |   2 +-
 topi/tests/python/test_topi_pooling.py   |   6 +-
 topi/tests/python/test_topi_reduce.py    |   4 +-
 topi/tests/python/test_topi_relu.py      |   4 +-
 topi/tests/python/test_topi_softmax.py   |   4 +-
 topi/tests/python/test_topi_transform.py |  20 +-
 10 files changed, 437 insertions(+), 17 deletions(-)
 create mode 100644 topi/python/topi/hls/nn.py

diff --git a/nnvm/python/nnvm/testing/config.py b/nnvm/python/nnvm/testing/config.py
index 0eab3e6b3389..bf22ea7e3887 100644
--- a/nnvm/python/nnvm/testing/config.py
+++ b/nnvm/python/nnvm/testing/config.py
@@ -10,5 +10,5 @@ def ctx_list():
     device_list = (device_list.split(",") if device_list
                    else ["llvm", "cuda"])
     device_list = set(device_list)
-    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
-    return [x for x in res if x[1].exist and x[0] in device_list]
+    res = [(device, tvm.context(device, 0)) for device in device_list]
+    return [x for x in res if x[1].exist]
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 6ae89fecf6bf..d9167a7aadcd 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -33,6 +33,8 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str,
 
   // Compile the .cl file.
   std::string cmd = "aoc aocl.cl";
+  // AOCL supports fp64.
+  cmd += " -Dcl_khr_fp64";
   Target target = Target::create(target_str);
   if (target->device_name != "") {
     cmd += " -board=" + target->device_name;
diff --git a/topi/python/topi/hls/__init__.py b/topi/python/topi/hls/__init__.py
index 69b80514ff56..65f091fc9916 100644
--- a/topi/python/topi/hls/__init__.py
+++ b/topi/python/topi/hls/__init__.py
@@ -3,3 +3,4 @@
 from __future__ import absolute_import as _abs
 
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
+from .nn import *
diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py
new file mode 100644
index 000000000000..8c986d7a5663
--- /dev/null
+++ b/topi/python/topi/hls/nn.py
@@ -0,0 +1,407 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""HLS nn operators"""
+from __future__ import absolute_import as _abs
+import tvm
+from .. import tag
+from .. import generic
+
+
+def _schedule_conv2d(outs):
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_injective(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule conv2d
+        elif OP.tag.find("conv2d") >= 0:
+            Conv2d = OP.output(0)
+            if not Conv2d.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Conv2d].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_conv2d_nchw.register(["hls"])
+def schedule_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_nhwc.register(["hls"])
+def schedule_conv2d_nhwc(outs):
+    """Schedule for conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_NCHWc.register(["hls"])
+def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
+                          padding, layout, out_layout, outs):
+    """Schedule for conv2d_NCHW[x]c
+
+    Parameters
+    ----------
+    num_filter : int
+        The number of filter, i.e., the output channel.
+
+    kernel_size : tuple of int
+        (kernel_height, kernel_width)
+
+    strides : tuple of int
+        (stride_of_height, stride_of_width)
+
+    padding : tuple of int
+        (pad_of_height, pad_of_width)
+
+    layout : str
+        Input data layout
+
+    out_layout : str
+        Output data layout
+
+    outs : Array of Tensor
+        The computation graph description of conv2d_NCHWc
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch : Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_transpose_nchw.register(["hls"])
+def schedule_conv2d_transpose_nchw(outs):
+    """Schedule for conv2d_transpose_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of conv2d_transpose_nchw
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nchw.register(["hls"])
+def schedule_depthwise_conv2d_nchw(outs):
+    """Schedule for depthwise_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nhwc.register(["hls"])
+def schedule_depthwise_conv2d_nhwc(outs):
+    """Schedule for depthwise_conv2d_nhwc
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+@generic.schedule_bitserial_conv2d_nchw.register(["hls"])
+def schedule_bitserial_conv2d_nchw(outs):
+    """Schedule for bitserial_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_bitserial_conv2d_nhwc.register(["hls"])
+def schedule_bitserial_conv2d_nhwc(outs):
+    """Schedule for bitserial_conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_reduce.register(["hls"])
+def schedule_reduce(outs):
+    """Schedule for reduction
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of reduce
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        elif OP.tag in ["comm_reduce", "comm_reduce_idx"]:
+            if OP.tag == "comm_reduce":
+                Reduce = OP.output(0)
+            else:
+                Reduce = OP.input_tensors[0]
+            if not Reduce.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Reduce].compute_at(s[Out], s[Out].op.axis[0])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    fused = s[outs[0]].fuse()
+    px, x = s[outs[0]].split(fused, nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_softmax.register(["hls"])
+def schedule_softmax(outs):
+    """Schedule for softmax
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of softmax
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    softmax = outs[0]
+    max_elem = softmax.op.input_tensors[1]
+    expsum = softmax.op.input_tensors[2]
+
+    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
+    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])
+
+    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
+    s[softmax].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_dense.register(["hls"])
+def schedule_dense(outs):
+    """Schedule for dense
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of dense
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule dense
+        elif OP.tag == 'dense':
+            Dense = OP.output(0)
+            if not Dense.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Dense].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_pool.register(["hls"])
+def schedule_pool(outs, layout):
+    """Schedule for pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule pool
+        elif OP.tag.startswith('pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_global_pool.register(["hls"])
+def schedule_global_pool(outs):
+    """Schedule for global pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of global pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule global_pool
+        elif OP.tag.startswith('global_pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
index d992be9292fc..820cb561c0c3 100644
--- a/topi/tests/python/common.py
+++ b/topi/tests/python/common.py
@@ -9,4 +9,4 @@ def get_all_backend():
         A list of all supported targets
     """
     return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
-            'llvm -device=arm_cpu']
+            'llvm -device=arm_cpu', 'aocl_sw_emu']
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index b87795743c4c..578adf60094a 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -5,6 +5,8 @@
 import math
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True):
     iw = ih
     kw = kh
@@ -64,7 +66,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_pool():
@@ -109,7 +111,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_global_pool():
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 331498deb10c..0be652948060 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -4,6 +4,8 @@
 import tvm
 import topi
 
+from common import get_all_backend
+
 def _my_npy_argmax(arr, axis, keepdims):
     if not keepdims:
         return arr.argmax(axis=axis)
@@ -90,7 +92,7 @@ def check_device(device):
                 np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
             np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
-    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 7c75a9b08975..3e38e707a6da 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -5,6 +5,8 @@
 import topi
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_relu(m, n):
     A = tvm.placeholder((m, n), name='A')
     B = topi.nn.relu(A)
@@ -27,7 +29,7 @@ def check_device(device):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py
index f12070695220..cad30fa00e5b 100644
--- a/topi/tests/python/test_topi_softmax.py
+++ b/topi/tests/python/test_topi_softmax.py
@@ -7,6 +7,8 @@
 import logging
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_softmax(m, n, dtype="float32"):
     A = tvm.placeholder((m, n), dtype=dtype, name='A')
     B = topi.nn.softmax(A)
@@ -63,7 +65,7 @@ def check_device(device):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 4788d758cf45..123df331e174 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -3,6 +3,8 @@
 import tvm
 import topi
 
+from common import get_all_backend
+
 def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
     A = tvm.placeholder(shape=in_shape, name="A")
     B = topi.expand_dims(A, axis, num_newaxis)
@@ -22,7 +24,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -45,7 +47,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -68,7 +70,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -96,7 +98,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 def verify_concatenate(shapes, axis):
@@ -121,7 +123,7 @@ def check_device(device):
         foo(*(data_nds + [out_nd]))
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -146,7 +148,7 @@ def check_device(device):
         for out_nd, out_npy in zip(out_nds, out_npys):
             np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -204,7 +206,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "cuda", "opencl", "sdaccel"]:
+    for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def verify_take(src_shape, indices_src, axis=None):
@@ -243,7 +245,7 @@ def check_device(device):
         foo(data_nd, indices_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def verify_strided_slice(in_shape, begin, end, stride=None):
@@ -270,7 +272,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def test_strided_slice():

From 4e6740a43353e158bf6e131f1ac003e9852de834 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 30 Aug 2018 00:47:44 -0500
Subject: [PATCH 067/529] Fix incorrect doc in conv2d_nhwc_python (#1677)

---
 topi/python/topi/testing/conv2d_nhwc_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index 461d7a6a0e06..a872bddab09b 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -13,7 +13,7 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
         4-D with shape [batch, in_height, in_width, in_channel]
 
     w_np : numpy.ndarray
-        4-D with shape [num_filter, filter_height, filter_width, in_channel]
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
 
     stride : int or a list/tuple of two ints
         Stride size, or [stride_height, stride_width]

From 8d3d4c4205823e10948a29025ff863b2e0472d49 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 29 Aug 2018 23:09:33 -0700
Subject: [PATCH 068/529] [PASS] Enhance gpu verify pass (#1660)

---
 src/pass/verify_gpu_code.cc                   | 16 ++++++++++++-
 .../unittest/test_pass_verify_gpu_code.py     | 24 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/pass/verify_gpu_code.cc b/src/pass/verify_gpu_code.cc
index 363b7c4cf7cc..70908eb43d6b 100644
--- a/src/pass/verify_gpu_code.cc
+++ b/src/pass/verify_gpu_code.cc
@@ -86,17 +86,29 @@ class GPUCodeVerifier : public IRVisitor {
       // record the number of threads in a block
       std::string name = var.get()->name_hint;
       if (name == "threadIdx.x" || name == "threadIdx.y" || name == "threadIdx.z") {
+        size_t length = static_cast<size_t>(extent->value);
         if (!visited_threads_.count(name)) {
           visited_threads_.insert(name);
-          size_t length = static_cast<size_t>(extent->value);
           thread_per_block_ *= length;
 
           if (name == "threadIdx.x") {
             valid_ &= length <= max_thread_x_;
+            thread_x_extent_ = length;
           } else if (name == "threadIdx.y") {
             valid_ &= length <= max_thread_y_;
+            thread_y_extent_ = length;
           } else if (name == "threadIdx.z") {
             valid_ &= length <= max_thread_z_;
+            thread_z_extent_ = length;
+          }
+        } else {
+          // the thread should be bound to axes with the same length
+          if (name == "threadIdx.x") {
+            valid_ &= length == thread_x_extent_;
+          } else if (name == "threadIdx.y") {
+            valid_ &= length == thread_y_extent_;
+          } else if (name == "threadIdx.z") {
+            valid_ &= length == thread_z_extent_;
           }
         }
       }
@@ -111,6 +123,8 @@ class GPUCodeVerifier : public IRVisitor {
   std::unordered_set<const tvm::Variable *> visited_shared_buffers_;
   std::unordered_set<std::string> visited_threads_;
 
+  size_t thread_x_extent_, thread_y_extent_, thread_z_extent_;
+
   size_t local_memory_per_block_;
   size_t shared_memory_per_block_;
   size_t thread_per_block_;
diff --git a/tests/python/unittest/test_pass_verify_gpu_code.py b/tests/python/unittest/test_pass_verify_gpu_code.py
index 6fc0387cf144..e3884a727852 100644
--- a/tests/python/unittest/test_pass_verify_gpu_code.py
+++ b/tests/python/unittest/test_pass_verify_gpu_code.py
@@ -162,8 +162,32 @@ def test_multiple_kernels():
             tvm.build(s, [A, C], target)
         assert valid[0]
 
+def test_wrong_bind():
+    N = 1024
+
+    A = tvm.placeholder((N, N-1), name='A')
+    B = tvm.compute((N, N-1), lambda i, j: A[i, j])
+
+    s = tvm.create_schedule([B.op])
+
+    # bind a thread axis to two loop axes with different lengths
+    s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x"))
+    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))
+
+    for target in ['opencl', 'cuda']:
+        if not tvm.context(target).exist:
+            continue
+
+        valid = [None]
+        with tvm.build_config(**{"add_lower_pass": [
+                (2, get_verify_pass(valid, max_threads_per_block=N*N))]}):
+            tvm.build(s, [A, B], target)
+        assert not valid[0]
+
+
 if __name__ == "__main__":
     test_local_memory()
     test_shared_memory()
     test_num_thread()
     test_multiple_kernels()
+    test_wrong_bind()

From 38834cd98803edd0f65b30d52665e1464e681d75 Mon Sep 17 00:00:00 2001
From: Wu Zhao <FrozenGene@users.noreply.github.com>
Date: Sat, 1 Sep 2018 01:35:59 +0800
Subject: [PATCH 069/529] Support FoldScaleAxis for depthwise convolution
 (#1664)

---
 nnvm/src/compiler/fold_scale_axis.cc         | 74 +++++++++++++++++++-
 nnvm/tests/python/compiler/test_fold_axis.py | 51 +++++++++++++-
 2 files changed, 122 insertions(+), 3 deletions(-)

diff --git a/nnvm/src/compiler/fold_scale_axis.cc b/nnvm/src/compiler/fold_scale_axis.cc
index 639aba602589..35e024efdc6a 100644
--- a/nnvm/src/compiler/fold_scale_axis.cc
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -493,8 +493,80 @@ bool Conv2DScaleAxisForward(
   if ((*in_info)[0].kind != kPending) return false;
   // only optimize for nchw for now
   if (param.kernel_layout == "OIHW" && (*in_info)[0].axis == 1) {
+    // Check whether it is depthwise conv2d
+    if (param.use_bias) {
+      CHECK_EQ(in_shape.size(), 3U) << "Input:[data, weight, bias]";
+    } else {
+      CHECK_EQ(in_shape.size(), 2U) << "Input:[data, weight]";
+    }
+
+    auto dshape = in_shape.at(0);
+    CHECK_EQ(dshape.ndim(), 4U) << "Input data shape should be 4D";
+
+    // TODO(FrozenGene): Currently, we don't support conv2d's groups != in channels.
+    if (param.groups > 1 && dshape[1] != param.groups) {
+      LOG(WARNING) << "FoldScaleAxis optimization doesn't support conv2d "
+                   << "with groups != in channels. We will skip FoldScaleAxis "
+                   << "optimization for this op.";
+      return false;
+    }
+
+
+    // input channel equals to groups, which means depthwise conv2d
+    bool is_depthwise_conv2d = (dshape[1] == param.groups);
+
+    // if it is depthwise convolution, the weight fold axis should along to axis 0.
+    // For example:
+    // data shape [1,54,63,127] weights shape [54,1,3,3], scale shape [54]
+    // depthwise convolution's weights shape means we have divided the data shape's channel
+    // to groups parties. Here, we divide 54 channels into 54 parties. Every part size is 1.
+    // weights shape's first dimision means how many parties we have divided (mapping to
+    // input shape's channel). So, in the depthwise convolution, we shouldn't do like
+    // traditional convolution(i.e. OIHW)
+
+    // Backgroud of this algorithm:
+
+    // Original Graph:
+    //    Graph(%x,
+    //          %in_scale,
+    //          %weight,
+    //          %bias,
+    //          %out_scale) {
+    //      %1 = __add_scalar__(%x, scalar='1')
+    //      %3 = expand_dims(%in_scale, num_newaxis='2', axis='1')
+    //      %4 = broadcast_mul(%1, %3)
+    //      %7 = conv2d(%4, %weight, %bias, padding='(1, 1)', kernel_size='(3, 3)', channels='2')
+    //      %8 = relu(%7)
+    //      %10 = expand_dims(%out_scale, num_newaxis='2', axis='1')
+    //      %11 = broadcast_mul(%8, %10)
+    //      ret %11
+    //    }
+
+    // Optimized Graph:
+    //    Graph(%x,
+    //          %weight,
+    //          %out_scale,
+    //          %in_scale,
+    //          %bias) {
+    //      %1 = __add_scalar__(%x, scalar='1')
+    //      %4 = expand_dims(%out_scale, num_newaxis='3', axis='1')
+    //      %5 = broadcast_mul(%weight, %4)
+    //      %7 = expand_dims(%in_scale, num_newaxis='2', axis='1')
+    //      %8 = broadcast_mul(%5, %7)
+    //      %10 = broadcast_mul(%bias, %out_scale)
+    //      %11 = conv2d(%1, %8, %10, padding='(1, 1)', kernel_size='(3, 3)', channels='2')
+    //      %12 = relu(%11)
+    //      ret %12
+    //    }
+
+    // Conv2DScaleAxisForward will need in_scale. Conv2DScaleAxisBackward will need out_scale.
+    // in_scale will apply into input data's channel (in_channel). out_scale will apply in
+    // conv2d's result, which will apply in weight's output channel.
+    // So, default Conv2DScaleAxisForward will fold axis 1 (weights' input channel).
+    // Conv2DScaleAxisBackward will fold axis 0 (weights' output channel).
+    // But depthwise convolution is another story as said previously.
     (*in_info)[1].kind = kMulConsumer;
-    (*in_info)[1].axis = 1;
+    (*in_info)[1].axis = is_depthwise_conv2d ? 0 : 1;
     (*in_info)[1].source = (*in_info)[0].source;
     return true;
   } else {
diff --git a/nnvm/tests/python/compiler/test_fold_axis.py b/nnvm/tests/python/compiler/test_fold_axis.py
index bbd50193b4b0..ab90cd723989 100644
--- a/nnvm/tests/python/compiler/test_fold_axis.py
+++ b/nnvm/tests/python/compiler/test_fold_axis.py
@@ -6,6 +6,7 @@
 from nnvm.compiler import graph_util, graph_attr
 
 def test_fold_axis_conv():
+    # Before simplify
     def before(x, conv_weight, conv_bias, in_scale, out_scale, channels):
         x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2)
         y = sym.conv2d(x, conv_weight, conv_bias,
@@ -31,7 +32,6 @@ def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels):
         y = sym.relu(y)
         return y
 
-    # Before simplify
     def check(shape, channels):
         x = sym.Variable("x") + 1
         weight = sym.Variable("weight")
@@ -50,8 +50,55 @@ def check(shape, channels):
 
     check((2, 4, 10, 10), 2)
 
+def test_fold_axis_depthwise_conv():
+    # Before simplify
+    def before(x, conv_weight, conv_bias, in_scale, out_scale, channels):
+        x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2)
+        y = sym.conv2d(x, conv_weight, conv_bias,
+                       channels=channels,
+                       kernel_size=(3, 3),
+                       padding=(1, 1),
+                       groups=54,
+                       name="depthiwise_conv")
+        y = sym.relu(y)
+        y = y * sym.expand_dims(out_scale, axis=1, num_newaxis=2)
+        return y
+
+    def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels):
+        conv_weight = conv_weight * sym.expand_dims(out_scale, axis=1, num_newaxis=3)
+        conv_weight = conv_weight * sym.expand_dims(in_scale, axis=1, num_newaxis=3)
+        conv_bias = conv_bias * out_scale
+        y = sym.conv2d(x,
+                       conv_weight,
+                       conv_bias,
+                       channels=channels,
+                       kernel_size=(3, 3),
+                       padding=(1, 1),
+                       groups=54,
+                       name="depthiwise_conv")
+        y = sym.relu(y)
+        return y
+
+    def check(shape, channels):
+        x = sym.Variable("x") + 1
+        weight = sym.Variable("weight")
+        bias = sym.Variable("bias")
+        in_scale = sym.Variable("in_scale")
+        out_scale = sym.Variable("out_scale")
+        y1 = before(x, weight, bias, in_scale, out_scale, channels)
+        y2 = expected(x, weight, bias, in_scale, out_scale, channels)
+        ishape = {"x": shape, "out_scale": (channels,), "in_scale": (shape[1],)}
+        g1 = nnvm.graph.create(y1)
+        g2 = nnvm.graph.create(y2)
+        graph_attr.set_shape_inputs(g1, ishape)
+        g1 = g1.apply("InferShape").apply("FoldScaleAxis")
+        # assert graph equals as expected
+        graph_util.check_graph_equal(g1, g2)
+
+    check((1, 54, 63, 127), 54)
 
 def test_fold_fail():
+    # Before simplify
     def before(x, scale, channels):
         y = sym.conv2d(x,
                        channels=channels,
@@ -61,7 +108,6 @@ def before(x, scale, channels):
         y = y * sym.expand_dims(scale, axis=1, num_newaxis=1)
         return y
 
-    # Before simplify
     def check(shape, channels):
         x = sym.Variable("x")
         bias = sym.Variable("bias")
@@ -108,3 +154,4 @@ def run_prune(graph, params, opt_level):
     test_fold_resnet()
     test_fold_axis_conv()
     test_fold_fail()
+    test_fold_axis_depthwise_conv()

From 1523d024bce6748d625218e1f0ab93a1c48fae4b Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Sat, 1 Sep 2018 03:35:05 +0800
Subject: [PATCH 070/529] [FRONTEND][TENSORFLOW] Add Transpose support. (#1665)

* [FRONTEND][TENSORFLOW] Add Transpose support.

* [FRONTEND][TENSORFLOW] Get parameter from inputs and fix document style.

* [FRONTEND][TENSORFLOW] Handle the case that perm is not specified.

* [FRONTEND][TENSORFLOW] Convert Rank and Range to param.

* [FRONTEND][TENSORFLOW] Fix a pylint issue.

* [FRONTEND][TENSORFLOW] Implement Rank and Range as normal op.
---
 nnvm/python/nnvm/frontend/tensorflow.py       | 32 +++++++++++++++++++
 .../frontend/tensorflow/test_forward.py       | 23 +++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 6be5333ccee6..d9406601ded4 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -650,6 +650,35 @@ def _impl(inputs, attr, params):
             ignores=['Tpaddings'],)(new_inputs, attr)
     return _impl
 
+def _transpose():
+    def _impl(inputs, attr, params):
+        # If perm is not specified, axes is left empty,
+        # otherwise its value is get from params
+        param_name = inputs[1].list_output_names()[0]
+        axes = params.get(param_name, tvm.nd.array([])).asnumpy()
+        return _sym.transpose(inputs[0], axes=tuple(axes))
+    return _impl
+
+def _rank():
+    def _impl(inputs, attr, params):
+        input_shapes = attr['_input_shapes'][inputs[0]]
+        assert len(inputs) == 1
+
+        name = attr["_node_name"]
+        params[name] = tvm.nd.array([len(input_shapes[0])])
+        return _sym.Variable(name=name, shape=params[name].shape)
+    return _impl
+
+def _range():
+    def _impl(inputs, attr, params):
+        start = params.pop(inputs[0].list_output_names()[0]).asnumpy()[0]
+        limit = params.pop(inputs[1].list_output_names()[0]).asnumpy()[0]
+        delta = params.pop(inputs[2].list_output_names()[0]).asnumpy()[0]
+
+        name = attr["_node_name"]
+        params[name] = tvm.nd.array([start, limit, delta])
+        return _sym.Variable(name=name, shape=params[name].shape)
+    return _impl
 
 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -700,6 +729,9 @@ def _impl(inputs, attr, params):
     'LRN'                               : _lrn(),
     'Pad'                               : _pad('Pad'),
     'PadV2'                             : _pad('PadV2'),
+    'Range'                             : _range(),
+    'Rank'                              : _rank(),
+    'Transpose'                         : _transpose(),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index e0e18d1bdb06..b0fb02cf04f5 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -853,11 +853,34 @@ def _test_l2_normalize(ishape, eps, axis):
 def test_forward_l2_normalize():
     _test_l2_normalize((1, 3, 20, 20), 0.001, (0,))
 
+#######################################################################
+# transpose
+# ---------
+def _test_forward_transpose(ishape, axes=None):
+    input = np.random.uniform(size=ishape).astype(np.float32)
+
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=input.shape, dtype=input.dtype, name="transpose_data")
+
+        if axes is None:
+            tf.transpose(in1)
+        else:
+            tf.transpose(in1, perm=axes)
+
+        compare_tf_with_tvm(input, 'transpose_data:0', 'transpose:0')
+
+def test_forward_transpose():
+    _test_forward_transpose((2, 3, 4))
+    _test_forward_transpose((7, 8, 8, 10))
+    _test_forward_transpose((2, 3, 4), (1, 2, 0))
+    _test_forward_transpose((2, 3, 4), (0, 1, 2))
+    _test_forward_transpose((2, 3, 4, 5), (3, 0, 1, 2))
 
 #######################################################################
 # Main
 # ----
 if __name__ == '__main__':
+    test_forward_transpose()
     test_forward_convolution()
     test_forward_pooling()
     test_forward_reshape()

From f9a965ac251ff65b23364ecc2882e6050ee906b1 Mon Sep 17 00:00:00 2001
From: Tatsuya Nishiyama <nishiyama.tatsuya0@gmail.com>
Date: Sat, 1 Sep 2018 14:02:55 +0900
Subject: [PATCH 071/529] [CUDA][TVM] fix constructing invalid command line
 string for nvcc (#1674)

---
 python/tvm/contrib/nvcc.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 1b7bb840127d..a87c942a7247 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -28,7 +28,7 @@ def compile_cuda(code,
     arch : str
         The architecture
 
-    options : str
+    options : str or list of str
         The additional options
 
     path_target : str, optional
@@ -59,10 +59,16 @@ def compile_cuda(code,
     cmd = ["nvcc"]
     cmd += ["--%s" % target, "-O3"]
     cmd += ["-arch", arch]
-    cmd += ["-o", file_target]
 
     if options:
-        cmd += options
+        if isinstance(options, str):
+            cmd += [options]
+        elif isinstance(options, list):
+            cmd += options
+        else:
+            raise ValueError("options must be str or list of str")
+
+    cmd += ["-o", file_target]
     cmd += [temp_code]
 
     proc = subprocess.Popen(

From 118d3da99cca6aa6c00a4092e6f4e33e22d0ae18 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 4 Sep 2018 00:24:49 +0530
Subject: [PATCH 072/529] CI Failure Fix (#1682)

The recent changes in tutorial is with PR # https://github.com/dmlc/tvm/pull/1501 broken the link for downloading the weights file, leading to this CI failure.
---
 tutorials/nnvm/from_darknet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index c6b70cf59413..2bd7f4a1748a 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -39,7 +39,7 @@
 WEIGHTS_NAME = MODEL_NAME + '.weights'
 REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
 CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
-WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true'
+WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME
 
 download(CFG_URL, CFG_NAME)
 download(WEIGHTS_URL, WEIGHTS_NAME)

From eeb0b78d81262e57a77bd0e55c21eb99ee6246f4 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Tue, 4 Sep 2018 13:50:27 -0700
Subject: [PATCH 073/529] Rename axis parameter in onnx squeeze (#1683)

* Rename axis parameter in onnx squeeze

* Add test
---
 nnvm/python/nnvm/frontend/onnx.py               | 2 +-
 nnvm/tests/python/frontend/onnx/test_forward.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 5127dfd299bd..ed885dfcd874 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -684,7 +684,7 @@ def _get_convert_map(opset):
         'Slice': Slice.get_converter(opset),
         'Transpose': AttrCvt('transpose', {'perm': 'axes'}),
         'Gather': Gather.get_converter(opset),
-        'Squeeze': Renamer('squeeze'),
+        'Squeeze': AttrCvt('squeeze', {'axes': 'axis'}),
         'Unsqueeze': Unsqueeze.get_converter(opset),
         'Pad': Pad.get_converter(opset),
         'Shape': Shape.get_converter(opset),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 49cf58fa1aa5..5e199b4526b0 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -178,7 +178,7 @@ def test_power():
 def test_squeeze():
     in_shape = (1, 3, 1, 3, 1, 1)
     out_shape = (3, 3)
-    y = helper.make_node("Squeeze", ['in'], ['out'])
+    y = helper.make_node("Squeeze", ['in'], ['out'], axes=[0, 2, 4, 5])
 
     graph = helper.make_graph([y],
                               'squeeze_test',

From 2602351928a0767be1827563b66c481449738f91 Mon Sep 17 00:00:00 2001
From: "Tang, Cheng" <souptc@gmail.com>
Date: Tue, 4 Sep 2018 22:35:41 -0700
Subject: [PATCH 074/529] expose SaveToFile symbol on windows (#1685)

---
 include/tvm/runtime/module.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 3a98820b76f3..675dd8728675 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -103,8 +103,8 @@ class ModuleNode {
    * \param file_name The file to be saved to.
    * \param format The format of the file.
    */
-  virtual void SaveToFile(const std::string& file_name,
-                          const std::string& format);
+  TVM_DLL virtual void SaveToFile(const std::string& file_name,
+                                  const std::string& format);
   /*!
    * \brief Save the module to binary stream.
    * \param stream The binary stream to save to.

From f788ff51c302766a8eb956a7a516bde1d3224691 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Tue, 4 Sep 2018 22:45:17 -0700
Subject: [PATCH 075/529] [Tutorial] tutorial to writing a costumized pass
 (#1671)

---
 docs/conf.py                           |   1 +
 python/tvm/build_module.py             |   2 +-
 tutorials/dev/README.txt               |   3 +
 tutorials/dev/low_level_custom_pass.py | 153 +++++++++++++++++++++++++
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 tutorials/dev/README.txt
 create mode 100644 tutorials/dev/low_level_custom_pass.py

diff --git a/docs/conf.py b/docs/conf.py
index 989d26f87d3e..e3f7f6a82c24 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -192,6 +192,7 @@ def run_doxygen(folder):
     ['../tutorials/language',
      '../tutorials/optimize',
      '../tutorials/autotvm',
+     '../tutorials/dev',
      '../tutorials/vta',
      '../tutorials/topi',
      '../tutorials/deployment',
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 777654af6619..70935cde1816 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -368,7 +368,7 @@ def lower(sch,
         cfg.unroll_explicit)
     for f in lower_phase2:
         stmt = f(stmt)
-    # Phase 2
+    # Phase 3
     stmt = ir_pass.Simplify(stmt)
     stmt = ir_pass.LowerStorageAccessInfo(stmt)
     stmt = ir_pass.RemoveNoOp(stmt)
diff --git a/tutorials/dev/README.txt b/tutorials/dev/README.txt
new file mode 100644
index 000000000000..a358280640de
--- /dev/null
+++ b/tutorials/dev/README.txt
@@ -0,0 +1,3 @@
+Developer Tutorials
+-------------------
+
diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py
new file mode 100644
index 000000000000..617093d4a595
--- /dev/null
+++ b/tutorials/dev/low_level_custom_pass.py
@@ -0,0 +1,153 @@
+"""
+Writing a Customized Pass
+=========================
+**Author**: `Jian Weng <https://were.github.io>`_
+
+TVM is a framework that abstracts away the heterogenity of machine learning accelerators.
+Sometimes users may want customize some analysis and IR transformations
+to adapt TVM to their own specialized hardware. This tutorial helps users write
+a customized pass in TVM.
+ Prerequisites
+-------------
+Before reading this tutorial, we assume readers have already known these topics well:
+- Writing an algorithm in TVM and schedule it. Otherwise, see example tutorials like
+  `Optimize GeMM on CPU <https://docs.tvm.ai/tutorials/optimize/opt_gemm.html>_`.
+- The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what
+  attributes of IR nodes are defined.
+- Visitor design pattern. Otherwise, check the
+  `Python AST module <https://docs.python.org/3/library/ast.html>_` to see how an AST
+  visitor is implemented.
+- How a HalideIR/Schedule is lowered to either a LoweredFunc class or a LLVM module. Otherwise,
+  take a look at ``python/tvm/build_module.py`` to get some basics.
+"""
+
+from __future__ import absolute_import, print_function
+import tvm
+import numpy as np
+
+######################################################################
+# We first write a very simple vector add and build it with the default schedule. Then, we use
+# our customized lowering pass to manipulate the IR directly instead of using schedule premitives.  
+#
+
+n = tvm.const(128)
+a = tvm.placeholder((n, ), name="a")
+b = tvm.placeholder((n, ), name="b")
+c = tvm.compute((n, ), lambda i: a[i] + b[i], name='c')
+
+sch = tvm.create_schedule(c.op)
+ir  = tvm.lower(sch, [a, b, c], simple_mode=True)
+print(ir)
+
+######################################################################
+# Writing a Pass
+# --------------
+# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
+# Thus, we define this vectorize function and implement it step by step.
+#
+
+######################################################################
+# TVM already provides two class for users to both analyze and transform IR.
+#
+# IR Visitor
+# ~~~~~~~~~~
+# We can use ``tvm.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR.
+# ``func`` is a function callback. This function will be called before exiting the current IR node,
+# i.e. post-order visit. Then we leverage side effects to store the result of IR visit, because the
+# return value of ``func`` will be ignored.
+#
+# .. note::
+#
+#     You MUST use some array to store the result of IR visit. Even the value is a single variable.
+#     This is mainly due to the constraints in the Python-C runtime. The variable values will be
+#     refreshed every recursion but the array values will be preserved.
+#
+
+loops = []
+def find_width8(op):
+    """ Find all the 'For' nodes whose extent can be divided by 8. """
+    if isinstance(op, tvm.stmt.For):
+        if isinstance(op.extent, tvm.expr.IntImm):
+            if op.extent.value % 8 == 0:
+                loops.append(op)
+
+#####################################################################
+# IR Transformation
+# ~~~~~~~~~~~~~~~~~
+# The transformation interface is slightly different from the visitor interface. There is only a
+# post-order callback in the visitor, but transformation visitor supports both a pre-order and a
+# post-order callback. If you want to keep the origin IR node, just return None. If you want to
+# change the current node to some node, use TVM IR maker interface to build it and return
+# this value.
+# 
+# .. note::
+#
+#     If the pre-order function is called and returns a value which is not None, the post-order
+#     function will be skipped.
+#
+
+def vectorize8(op):
+    """ Split can vectorize the loops found in `find_width8`. """
+    if op in loops:
+        extent = op.extent.value
+        name = op.loop_var.name
+        lo, li = tvm.var(name + '.outer'), tvm.var(name + '.inner')
+        body = tvm.ir_pass.Substitute(op.body, {op.loop_var: lo * 8 + li})
+        body = tvm.make.For(li, 0, 8, tvm.stmt.For.Vectorized, 0, body)
+        body = tvm.make.For(lo, 0, extent // 8, tvm.stmt.For.Serial, 0, body)
+        return body
+    return None
+
+def vectorize(stmt):
+    global loops
+
+    tvm.ir_pass.PostOrderVisit(stmt, find_width8)
+
+    if not loops:
+        return stmt
+
+    # The last list arugment indicates what kinds of nodes will be transformed.
+    # Thus, in this case only `For` nodes will call `vectorize8`
+    stmt = tvm.ir_pass.IRTransform(stmt, None, vectorize8, ['For'])
+    
+    return stmt
+
+#####################################################################
+# Glue to Lowering
+# ----------------
+# So far, we are done with writing this IR transformation pass. What we need to do next is to glue
+# this pass to TVM's lower pass. We can first call this function directly as a sanity check.
+#
+
+print(vectorize(ir))
+
+#####################################################################
+# In TVM, there is a property called ``BuildConfig``. You can use this property to customize your
+# own lowering options. In this case, we inject the pass written above into the TVM standard lowering
+# pass by feeding **a list of tuple** as argument to ``add_lower_pass``. "Tuple" indicates different
+# phases of lowering. In TVM, there are four phases of lowering and user-customized ones will be
+# called after each phase is done.
+#
+# .. note::
+#     Here are the essential transformations done by each phase:
+#       - Phase 0 generates the raw IR and loop levels.
+#       - Phase 1 flattens the array storage.
+#       - Phase 2 transforms loops, like unroll, vectorization and thread-binding.
+#       - Phase 3 does some cleanup work.
+#
+# Thus, a good place to put this transformation pass is just after Phase 1.
+#
+
+with tvm.build_config(add_lower_pass=[(1, vectorize)]) as cfg:
+    print(tvm.lower(sch, [a, b, c], simple_mode=True))
+
+#####################################################################
+# Quick View
+# ----------
+# This tutorial gives a quick view of writing a customized IR transformation pass:
+# - Use ``tvm.ir_pass.PostOrderVisit`` to gather information on each IR nodes.
+# - Use ``tvm.ir_pass.IRTransform`` to transform IR nodes.
+# - Wrap up two above to write an IR-transformation function.
+# - Use ``tvm.build_config`` to put this function to TVM lowering pass
+#
+

From c3ab85e8d2292f6e66fcc5701cb0bbc511aed7e3 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Thu, 6 Sep 2018 10:28:01 -0700
Subject: [PATCH 076/529] Allow log_softmax on explicit trailing dim (#1684)

---
 nnvm/src/top/nn/nn.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index 0b5a11fdd096..09dfbb211f00 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -410,7 +410,8 @@ NNVM_REGISTER_OP(log_softmax)
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-    CHECK_EQ(param.axis, -1) << "Currently only axis=-1 is supported";
+    CHECK(param.axis == -1 || param.axis == static_cast<int32_t>(inputs[0].ndim()) - 1)
+        << "log_softmax currently only works on last dimension";
     return Array<Tensor>{ topi::nn::log_softmax(inputs[0]) };
   })
 .set_attr<FGradient>(

From 3a6f0df38b42e74c119ed247c0d818a0096d2262 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfu.chen@icloud.com>
Date: Fri, 7 Sep 2018 01:29:47 +0800
Subject: [PATCH 077/529] [Sparse] add sparse tensor computation support
 (#1289)

---
 python/tvm/autotvm/task/dispatcher.py |   2 +-
 python/tvm/contrib/sparse.py          | 163 ++++++++++++++++++++
 tests/python/contrib/test_sparse.py   | 100 +++++++++++++
 topi/python/topi/__init__.py          |   1 +
 topi/python/topi/sparse/__init__.py   |   7 +
 topi/python/topi/sparse/csrmm.py      |  94 ++++++++++++
 topi/python/topi/sparse/csrmv.py      |  90 +++++++++++
 topi/python/topi/sparse/dense.py      | 173 ++++++++++++++++++++++
 topi/tests/python/test_topi_sparse.py | 205 ++++++++++++++++++++++++++
 9 files changed, 834 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/contrib/sparse.py
 create mode 100644 tests/python/contrib/test_sparse.py
 create mode 100644 topi/python/topi/sparse/__init__.py
 create mode 100644 topi/python/topi/sparse/csrmm.py
 create mode 100644 topi/python/topi/sparse/csrmv.py
 create mode 100644 topi/python/topi/sparse/dense.py
 create mode 100644 topi/tests/python/test_topi_sparse.py

diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index ec1dcc44f141..398e850d871d 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -16,8 +16,8 @@
 
 import logging
 
-from decorator import decorate
 import numpy as np
+from decorator import decorate
 
 from tvm import target as _target
 
diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py
new file mode 100644
index 000000000000..523039912aa9
--- /dev/null
+++ b/python/tvm/contrib/sparse.py
@@ -0,0 +1,163 @@
+"""Tensor and Operation class for computation declaration."""
+# pylint: disable=invalid-name
+from __future__ import absolute_import as _abs
+import numpy as _np
+from .. import expr as _expr
+from .. import api as _api
+from .. import tensor as _tensor
+from .. import ndarray as _nd
+
+float32 = "float32"
+itype = 'int32'
+
+class CSRNDArray(object):
+    """Sparse tensor object in CSR format."""
+    def __init__(self, arg1, ctx=None, shape=None):
+        """Construct a sparse matrix in CSR format.
+
+        Parameters
+        ----------
+        arg1 : numpy.ndarray or a tuple with (data, indices, indptr)
+            The corresponding a dense numpy array,
+            or a tuple for constructing a sparse matrix directly.
+
+        ctx: tvm.TVMContext
+            The corresponding context.
+
+        shape : tuple of int
+            The shape of the array
+        """
+        if isinstance(arg1, tuple):
+            assert len(arg1) == 3
+            self.data, self.indices, self.indptr = arg1
+            self.shape = shape
+        elif isinstance(arg1, _np.ndarray):
+            source_array = arg1
+            ridx, cidx = _np.nonzero(source_array)
+            data = source_array[ridx, cidx]
+            self.data = _nd.array(data, ctx)
+            indices = _np.nonzero(source_array)[1].astype(itype)
+            self.indices = _nd.array(indices, ctx)
+            indptr = [0]+_np.apply_along_axis(_np.count_nonzero, axis=1, arr=source_array).tolist()
+            indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype)
+            self.indptr = _nd.array(indptr, ctx)
+            self.shape = source_array.shape
+        else:
+            raise RuntimeError("Construct CSRNDArray with either a tuple (data, indices, indptr) "
+                               "or a numpy.array, can't handle type %s." % (type(arg1),))
+        self.stype = 'csr'
+        self.dtype = self.data.dtype
+        assert self.shape is not None
+        assert isinstance(self.data, _nd.NDArray)
+        assert isinstance(self.indices, _nd.NDArray)
+        assert str(self.indices.dtype) == 'int32' or \
+            str(self.indices.dtype) == 'int64', str(self.indices.dtype)
+        assert isinstance(self.indptr, _nd.NDArray)
+        assert str(self.indptr.dtype) == 'int32' or \
+            str(self.indptr.dtype) == 'int64', str(self.indptr.dtype)
+
+    def asnumpy(self):
+        """Construct a full matrix and convert it to numpy array."""
+        full = _np.zeros(self.shape, self.dtype)
+        ridx = _np.diff(self.indptr.asnumpy())
+        ridx = _np.hstack((_np.ones((v,), itype)*i for i, v in enumerate(ridx)))
+        full[ridx, self.indices.asnumpy().astype(itype)] = self.data.asnumpy()
+        return full
+
+def array(source_array, ctx=None, shape=None, stype='csr'):
+    """Construct a sparse NDArray from numpy.ndarray"""
+    ret = None
+    if stype == 'csr':
+        ret = CSRNDArray(source_array, shape=shape, ctx=ctx)
+    else:
+        raise NotImplementedError('stype=%s is not supported yet.' % (stype,))
+    return ret
+
+class SparsePlaceholderOp(object):
+    """Placeholder class for sparse tensor representations."""
+    def __init__(self, shape, nonzeros, dtype, name):
+        # pylint: disable=unused-argument
+        """Contructing a bare bone structure for a sparse matrix
+
+        Parameters
+        ----------
+        shape: Tuple of Expr
+            The shape of the tensor
+
+        nonzeros: int
+            The number of non-zero values
+
+        dtype: str, optional
+            The data type of the tensor
+
+        name: str, optional
+            The name hint of the tensor
+        """
+        self.shape = shape
+        self.dtype = dtype
+        self.name = name
+        self.stype = 'unknown'
+
+class CSRPlaceholderOp(SparsePlaceholderOp):
+    """Placeholder class for CSR based sparse tensor representation."""
+    def __init__(self, shape, nonzeros, dtype, name):
+        """Contructing a bare bone structure for a csr_matrix
+
+        Parameters
+        ----------
+        shape: Tuple of Expr
+            The shape of the tensor
+
+        nonzeros: int
+            The number of non-zero values
+
+        dtype: str, optional
+            The data type of the tensor
+
+        name: str, optional
+            The name hint of the tensor
+        """
+        SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name)
+        self.stype = 'csr'
+        self.data = _api.placeholder((nonzeros,), dtype=dtype, name=self.name+'_data')
+        self.indices = _api.placeholder((nonzeros,), dtype=itype, name=self.name+'_indices')
+        self.indptr = _api.placeholder((self.shape[0]+1,), dtype=itype, name=self.name+'_indptr')
+        assert isinstance(self.data, _tensor.Tensor)
+        assert isinstance(self.indices, _tensor.Tensor)
+        assert isinstance(self.indptr, _tensor.Tensor)
+
+def placeholder(shape, nonzeros=None, dtype=None, name="placeholder", stype=None):
+    """Construct an empty sparse tensor object.
+
+    Parameters
+    ----------
+    shape: Tuple of Expr
+        The shape of the tensor
+
+    nonzeros: int
+        The number of non-zero values
+
+    dtype: str, optional
+        The data type of the tensor
+
+    name: str, optional
+        The name hint of the tensor
+
+    stype: str, optional
+        The name storage type of the sparse tensor (e.g. csr, coo, ell)
+
+    Returns
+    -------
+    tensor: SparsePlaceholderOp
+        The created sparse tensor placeholder
+    """
+    shape = (shape,) if isinstance(shape, _expr.Expr) else shape
+    nonzeros = 0 if nonzeros is None else nonzeros
+    dtype = float32 if dtype is None else dtype
+    stype = 'csr' if stype is None else stype
+    ret = None
+    if stype == 'csr':
+        ret = CSRPlaceholderOp(shape=shape, nonzeros=nonzeros, dtype=dtype, name=name)
+    else:
+        raise NotImplementedError('stype=%s is not supported yet.' % (stype,))
+    return ret
diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py
new file mode 100644
index 000000000000..f7a0d1d137a5
--- /dev/null
+++ b/tests/python/contrib/test_sparse.py
@@ -0,0 +1,100 @@
+import tvm
+import tvm.contrib.sparse as tvmsp
+import tvm.ndarray as _nd
+import numpy as np
+from collections import namedtuple
+
+def test_static_tensor():
+    dtype = 'float32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    m = tvm.var('m')
+    n = tvm.var('n')
+    A = tvmsp.placeholder(shape=(m, n), name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    n = 3
+    a = np.maximum(np.random.uniform(size=(n,n)).astype(dtype)-.6, 0.)
+    a = tvmsp.array(a, ctx)
+    A.data = tvm.placeholder(a.data.shape, dtype, name='A_data')
+    Ab = tvm.decl_buffer(a.data.shape, dtype, name='A_data')
+    binds = {A.data: Ab}
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    f = tvm.build(s, [A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((n,n), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data, c.data)
+    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+def test_dynamic_tensor():
+    dtype = 'float32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    _nr, _nc = 3, 5
+    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
+    a = tvmsp.array(a, ctx)
+    assert a.data.dtype == a.dtype
+    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
+    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
+    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
+    binds = {A.data: Ab.data, A.indices: Ab.indices}
+    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data.shape[0], a.data, c.data)
+    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+def test_sparse_array_tuple():
+    dtype, itype = 'float32', 'int32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    _nr, _nc = 3, 5
+    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
+    # convert to sparse array tuple
+    source_array = a
+    ridx, cidx = np.nonzero(source_array)
+    data = source_array[ridx, cidx]
+    a_data = _nd.array(data, ctx)
+    indices = np.nonzero(source_array)[1].astype(itype)
+    a_indices = _nd.array(indices, ctx)
+    indptr = [0]+np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist()
+    indptr = np.cumsum(np.array(indptr, itype)).astype(itype)
+    a_indptr = _nd.array(indptr, ctx)
+    a_init = (a_data, a_indices, a_indptr)
+    # construct tvm sparse array with tuple
+    a = tvmsp.array(a_init, shape=source_array.shape, ctx=ctx)
+    assert a.data.dtype == a.dtype
+    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
+    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
+    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
+    binds = {A.data: Ab.data, A.indices: Ab.indices}
+    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data.shape[0], a.data, c.data)
+    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+if __name__ == "__main__":
+    test_static_tensor()
+    test_dynamic_tensor()
+    test_sparse_array_tuple()
+
diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py
index 3ef59913e07b..2eb460d151ae 100644
--- a/topi/python/topi/__init__.py
+++ b/topi/python/topi/__init__.py
@@ -32,6 +32,7 @@
 from . import rocm
 from . import vision
 from . import image
+from . import sparse
 from . import hls
 # not import testing by default
 # because testing can have extra deps that are not necessary
diff --git a/topi/python/topi/sparse/__init__.py b/topi/python/topi/sparse/__init__.py
new file mode 100644
index 000000000000..bfac967d2f76
--- /dev/null
+++ b/topi/python/topi/sparse/__init__.py
@@ -0,0 +1,7 @@
+# pylint: disable=wildcard-import
+"""Sparse operators"""
+from __future__ import absolute_import as _abs
+
+from .csrmv import csrmv
+from .csrmm import csrmm
+from .dense import dense
diff --git a/topi/python/topi/sparse/csrmm.py b/topi/python/topi/sparse/csrmm.py
new file mode 100644
index 000000000000..f0574bf3df6d
--- /dev/null
+++ b/topi/python/topi/sparse/csrmm.py
@@ -0,0 +1,94 @@
+"""TVM operator compute SpMM in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+from ..util import simplify
+
+def csrmm_default(data, indices, indptr, weight, bias=None):
+    # pylint: disable=invalid-name
+    """The default implementation of csrmm in topi.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, n]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [m]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
+        and len(weight.shape) == 2, "only support 2-dim csrmm"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    M = simplify(indptr.shape[0]-1)
+    _, N = weight.shape
+    def csrmm_default_ir(data, indices, indptr, weight, out):
+        """define ir for csrmm"""
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        M = simplify(indptr.shape[0]-1)
+        _, N = weight.shape
+        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
+            with irb.for_range(0, M, for_type="parallel", name='row') as row:
+                dot = irb.allocate('float32', (1,), name='dot', scope='local')
+                out_ptr[row*N+n] = 0.
+                dot[0] = 0.
+                row_start = indptr_ptr[row]
+                row_end = indptr_ptr[row+1]
+                row_elems = row_end-row_start
+                with irb.for_range(0, row_elems, name='idx') as idx:
+                    elem = row_start+idx
+                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]*N+n]
+                out_ptr[row*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="csrmm", dtype='float32', name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[i], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def csrmm(a, b, c=None):
+    """The `csrmm` routine performs a matrix-matrix operation defined as :math:`C := A*B + C`,
+    where `B` and `C` are dense matrices, `A` is an m-by-k sparse matrix in the CSR format.
+
+    Parameters
+    ----------
+    a : tvm.contrib.sparse.CSRNDArray
+        2-D sparse matrix with shape [m, k]
+
+    b : tvm.Tensor
+        2-D dense matrix with shape [k, n]
+
+    c : tvm.Tensor, optional
+        1-D dense vector with shape [n]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    return csrmm_default(a.data, a.indices, a.indptr, b, c)
diff --git a/topi/python/topi/sparse/csrmv.py b/topi/python/topi/sparse/csrmv.py
new file mode 100644
index 000000000000..7cd101711cca
--- /dev/null
+++ b/topi/python/topi/sparse/csrmv.py
@@ -0,0 +1,90 @@
+"""TVM operator compute SpMV in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+
+def csrmv_default(data, indices, indptr, weight, bias=None):
+    """The default implementation of csrmv in topi.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, 1]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [1]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, 1]
+    """
+    assert len(data.shape) == 1 and len(weight.shape) == 2, \
+        "only support 2-dim csrmv"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    batch = indptr.shape[0]-1
+    def csrmv_default_ir(data, indices, indptr, weight, out):
+        """define ir for csrmv"""
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        num_rows = indptr.shape[0]-1
+        with irb.for_range(0, num_rows, for_type="parallel", name='row') as row:
+            dot = irb.allocate('float32', (1,), name='dot', scope='local')
+            out_ptr[row] = 0.
+            dot[0] = 0.
+            row_start = indptr_ptr[row]
+            row_end = indptr_ptr[row+1]
+            row_elems = row_end-row_start
+            with irb.for_range(0, row_elems, name='elemidx') as elemidx:
+                elem = row_start+elemidx
+                dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]]
+            out_ptr[row] += dot[0]
+        return irb.get()
+    oshape = (batch, 1)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: csrmv_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="csrmv", dtype='float32', name='csrmv')
+    if bias is not None:
+        matmul = tvm.compute((batch, 1), lambda i, j: matmul[i, 0] + bias[i], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def csrmv(a, x, y=None):
+    """The `csrmv` routine performs a matrix-vector operation defined as :math:`y := A*x + y`,
+    where `x` and `y` are vectors, `A` is an m-by-k sparse matrix in the CSR format.
+
+    Parameters
+
+    ----------
+    a : tvm.contrib.sparse.CSRNDArray
+        2-D sparse matrix with shape [m, k]
+
+    x : tvm.Tensor
+        2-D dense matrix with shape [k, 1]
+
+    y : tvm.Tensor, optional
+        1-D dense vector with shape [1]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D dense matrix with shape [m, 1]
+    """
+    return csrmv_default(a.data, a.indices, a.indptr, x, y)
diff --git a/topi/python/topi/sparse/dense.py b/topi/python/topi/sparse/dense.py
new file mode 100644
index 000000000000..01f323bc8ce9
--- /dev/null
+++ b/topi/python/topi/sparse/dense.py
@@ -0,0 +1,173 @@
+"""TVM operator compute Dense in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+from ..util import simplify
+
+def dense_si(data, indices, indptr, weight, bias=None):
+    # pylint: disable=invalid-name
+    """The implementation of dense in topi, assuming sparse input.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [num_nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [num_nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, n]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [m]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
+        and len(weight.shape) == 2, "only support 2-dim dense"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    dtype = data.dtype
+    M = simplify(indptr.shape[0]-1)
+    N, _ = weight.shape
+    def dense_default_ir(data, indices, indptr, weight, out):
+        """Define IR for Dense"""
+        dtype = data.dtype
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        M = simplify(indptr.shape[0]-1)
+        N, K = weight.shape
+        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
+            with irb.for_range(0, M, for_type="parallel", name='m') as m:
+                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
+                out_ptr[m*N+n] = tvm.const(0, dtype)
+                dot[0] = tvm.const(0, dtype)
+                row_start = indptr_ptr[m]
+                row_elems = indptr_ptr[m+1]-row_start
+                with irb.for_range(0, row_elems, name='k') as k:
+                    elem = row_start+k
+                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]+n*K]
+                out_ptr[m*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="dense", dtype=dtype, name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def dense_sw(data, w_data, w_indices, w_indptr, bias=None):
+    # pylint: disable=invalid-name
+    """The implementation of dense in topi, assuming sparse weight.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        2-D with shape [m, k]
+
+    w_data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    w_indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    w_indptr : tvm.Tensor
+        1-D with shape [n+1]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [n]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(w_data.shape) == 1 and len(w_indices.shape) == 1 and len(w_indptr.shape) == 1 \
+        and len(data.shape) == 2, "only support 2-dim dense"
+    assert isinstance(data, tvm.tensor.Tensor), \
+        "data matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(data))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    dtype = data.dtype
+    M, _ = data.shape
+    N = simplify(w_indptr.shape[0]-1)
+    def dense_default_ir(data, w_data, w_indices, w_indptr, out):
+        """Define IR for Dense"""
+        dtype = data.dtype
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        w_data_ptr = irb.buffer_ptr(w_data)
+        w_indices_ptr = irb.buffer_ptr(w_indices)
+        w_indptr_ptr = irb.buffer_ptr(w_indptr)
+        out_ptr = irb.buffer_ptr(out)
+        M, K = data.shape
+        N = simplify(w_indptr.shape[0]-1)
+        with irb.for_range(0, M, for_type="vectorize", name='m') as m:
+            with irb.for_range(0, N, for_type="parallel", name='n') as n:
+                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
+                out_ptr[m*N+n] = tvm.const(0, dtype)
+                dot[0] = tvm.const(0, dtype)
+                row_start = w_indptr_ptr[n]
+                row_elems = w_indptr_ptr[n+1]-row_start
+                with irb.for_range(0, row_elems, name='k') as k:
+                    elem = row_start+k
+                    dot[0] += w_data_ptr[elem] * data_ptr[w_indices_ptr[elem]+m*K]
+                out_ptr[m*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, w_data, w_indices, w_indptr],
+                        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="dense", dtype=dtype, name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def dense(data, weight, bias=None):
+    """Applies a linear transformation: :math:`Y = XW^T + b`.
+    Either data or weight should be tvm.contrib.sparse.CSRNDArray.
+
+    Parameters
+    ----------
+    data : tvm.contrib.sparse.CSRNDArray or tvm.tensor.Tensor
+        2-D with shape [batch, in_dim]
+
+    weight : tvm.tensor.Tensor or tvm.contrib.sparse.CSRNDArray
+        2-D with shape [out_dim, in_dim]
+
+    bias : tvm.tensor.Tensor, optional
+        1-D with shape [out_dim]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [batch, out_dim]
+    """
+    ret = None
+    if isinstance(data, tvm.contrib.sparse.CSRPlaceholderOp) and \
+       isinstance(weight, tvm.tensor.Tensor):
+        ret = dense_si(data.data, data.indices, data.indptr, weight, bias)
+    elif isinstance(data, tvm.tensor.Tensor) and \
+       isinstance(weight, tvm.contrib.sparse.CSRPlaceholderOp):
+        ret = dense_sw(data, weight.data, weight.indices, weight.indptr, bias)
+    else:
+        raise NotImplementedError("implementation for %s as data and %s as weights, "
+                                  "is not supported yet." % (type(data), type(weight), ))
+    return ret
diff --git a/topi/tests/python/test_topi_sparse.py b/topi/tests/python/test_topi_sparse.py
new file mode 100644
index 000000000000..deb3a08ea01b
--- /dev/null
+++ b/topi/tests/python/test_topi_sparse.py
@@ -0,0 +1,205 @@
+"""Test code for sparse operator"""
+import numpy as np
+import tvm
+import topi
+import topi.testing
+from topi.util import get_const_tuple
+import tvm.contrib.sparse as tvmsp
+from collections import namedtuple
+import time
+
+def verify_dynamic_csrmv(batch, in_dim, out_dim, use_bias=True):
+    nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n")
+    dtype = 'float32'
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A')
+    B = tvm.placeholder((in_dim, 1), name='B')
+    C = tvm.placeholder((nr,), name='C')
+    D = topi.sparse.csrmv(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+    dtype = A.dtype
+
+    # get the test data
+    def get_ref_data():
+        a_np = np.maximum(np.random.uniform(size=(batch, in_dim)).astype(dtype)-0.5, 0.)
+        b_np = np.random.uniform(size=(in_dim, 1)).astype(dtype)-0.5
+        c_np = np.random.uniform(size=(batch, )).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
+        else:
+            d_np = np.dot(a_np, b_np)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
+        assert a.shape[0] == a.indptr.shape[0]-1
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), ctx)
+        assert a.data.dtype == A.data.dtype
+        assert a.indices.dtype == A.indices.dtype
+        assert a.indptr.dtype == A.indptr.dtype
+        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmv")
+        f(_nr, a.data, a.indices, a.indptr, b, c, d)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+def verify_dynamic_csrmm(batch, in_dim, out_dim, use_bias=True):
+    nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n")
+    dtype = 'float32'
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A')
+    B = tvm.placeholder((in_dim, out_dim), name='B')
+    C = tvm.placeholder((nr,), name='C')
+    D = topi.sparse.csrmm(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+    dtype = A.dtype
+
+    # get the test data
+    def get_ref_data():
+        a_np = np.maximum(np.random.uniform(size=(batch, in_dim)).astype(dtype)-0.5, 0.)
+        b_np = np.random.uniform(size=(in_dim, out_dim)).astype(dtype)-0.5
+        c_np = np.random.uniform(size=(batch, )).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
+        else:
+            d_np = np.dot(a_np, b_np)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
+        assert a.shape[0] == a.indptr.shape[0]-1
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), ctx)
+        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
+
+        f(_nr, a.data, a.indices, a.indptr, b, c, d)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+def verify_dense_si(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
+    nonzeros = tvm.var('nonzeros')
+    A = tvmsp.placeholder(shape=(batch, in_dim), nonzeros=nonzeros, dtype=dtype, name='A')
+    B = tvm.placeholder((out_dim, in_dim), dtype=dtype, name='B')
+    C = tvm.placeholder((out_dim,), dtype=dtype, name='C')
+    D = topi.sparse.dense(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+
+    # get the test data
+    def get_ref_data():
+        mag = 10.
+        a_np = np.maximum(mag*(np.random.uniform(size=(batch, in_dim)).astype('float32')-0.5), 0.).astype(dtype)
+        b_np = (mag*(np.random.uniform(size=(out_dim, in_dim)).astype('float32')-.5)).astype(dtype)
+        c_np = (mag*(np.random.uniform(size=(out_dim,)).astype('float32')-.5)).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np.T) + c_np
+        else:
+            d_np = np.dot(a_np, b_np.T)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
+        f(a.data, a.indices, a.indptr, b, c, d)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+
+    check_device('llvm')
+
+def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
+    nonzeros = tvm.var('nonzeros')
+    A = tvm.placeholder((batch, in_dim), dtype=dtype, name='A')
+    B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name='B')
+    C = tvm.placeholder((out_dim,), dtype=dtype, name='C')
+    D = topi.sparse.dense(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+
+    # get the test data
+    def get_ref_data():
+        mag = 10.
+        a_np = (mag*(np.random.uniform(size=(batch, in_dim)).astype('float32')-.5)).astype(dtype)
+        b_np = np.maximum(mag*(np.random.uniform(size=(out_dim, in_dim)).astype('float32')-0.5), 0.).astype(dtype)
+        c_np = (mag*(np.random.uniform(size=(out_dim,)).astype('float32')-.5)).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np.T) + c_np
+        else:
+            d_np = np.dot(a_np, b_np.T)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvmsp.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
+        f(a, b.data, b.indices, b.indptr, c, d)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+
+    check_device('llvm')
+
+def test_csrmv():
+    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, use_bias=False)
+    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, use_bias=True)
+
+def test_csrmm():
+    M, K, N = 5, 7, 2
+    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, use_bias=False)
+    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, use_bias=True)
+
+def test_dense_si():
+    M, K, N = 3, 5, 2
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='float32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='float32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int16')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int16')
+
+def test_dense_sw():
+    M, K, N = 3, 5, 2
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='float32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='float32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int16')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int16')
+
+def test_dense():
+    test_dense_si()
+    test_dense_sw()
+
+if __name__ == "__main__":
+    test_csrmv()
+    test_csrmm()
+    test_dense()

From 71f9cba4e12c3da5e17b35a20452e28de4b837a4 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 6 Sep 2018 10:33:37 -0700
Subject: [PATCH 078/529] [AUTOTVM][TOPI] Use tunable templates for GPU
 (CUDA/OpenCL/ROCm/Mali) (#1638)

---
 apps/benchmark/README.md                      |  63 +-
 apps/benchmark/arm_cpu_imagenet_bench.py      |  98 +-
 apps/benchmark/gpu_imagenet_bench.py          | 109 +--
 apps/benchmark/mobile_gpu_imagenet_bench.py   |  90 ++
 apps/benchmark/util.py                        |  17 +-
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/densenet.py          |  49 +
 nnvm/python/nnvm/testing/resnet.py            |   6 +-
 .../python/frontend/mxnet/model_zoo/resnet.py |   5 +-
 python/tvm/autotvm/measure/measure.py         |  25 +
 python/tvm/autotvm/measure/measure_methods.py |  12 +-
 python/tvm/autotvm/record.py                  |   2 +-
 python/tvm/autotvm/task/__init__.py           |   5 +-
 python/tvm/autotvm/task/dispatcher.py         |  31 +-
 python/tvm/autotvm/task/nnvm_integration.py   |  65 +-
 python/tvm/autotvm/task/space.py              |  69 +-
 python/tvm/autotvm/tophub.py                  |  64 +-
 python/tvm/autotvm/tuner/tuner.py             |   6 +-
 .../tvm/autotvm/tuner/xgboost_cost_model.py   |   2 +-
 python/tvm/contrib/download.py                |   4 +-
 python/tvm/target.py                          |  39 +-
 src/arithmetic/canonical.cc                   |   3 +-
 src/runtime/opencl/opencl_device_api.cc       |   6 -
 tests/python/unittest/test_autotvm_space.py   |  11 +-
 tests/python/unittest/test_lang_reflection.py |   1 -
 tests/python/unittest/test_lang_target.py     |  11 +-
 topi/python/topi/arm_cpu/conv2d.py            |  49 +-
 topi/python/topi/arm_cpu/depthwise_conv2d.py  |  24 +-
 topi/python/topi/cuda/__init__.py             |   5 +-
 topi/python/topi/cuda/conv2d.py               | 146 ++-
 topi/python/topi/cuda/conv2d_direct.py        |  96 ++
 topi/python/topi/cuda/conv2d_nchw.py          | 544 -----------
 .../python/topi/cuda/conv2d_transpose_nchw.py | 288 ++++--
 topi/python/topi/cuda/conv2d_winograd.py      | 389 ++++++++
 topi/python/topi/cuda/depthwise_conv2d.py     | 201 ++--
 topi/python/topi/mali/conv2d.py               | 919 +++++++-----------
 topi/python/topi/mali/dense.py                | 147 ++-
 topi/python/topi/mali/depthwise_conv2d.py     | 167 ++--
 topi/python/topi/rocm/conv2d.py               |  97 +-
 topi/tests/python/common.py                   |   2 +-
 topi/tests/python/test_topi_conv2d_nchw.py    |  57 +-
 .../tests/python/test_topi_conv2d_winograd.py | 110 +++
 .../python/test_topi_depthwise_conv2d.py      |  65 +-
 tutorials/autotvm/tune_conv2d_cuda.py         |  31 +-
 tutorials/autotvm/tune_nnvm_arm.py            |  41 +-
 tutorials/autotvm/tune_nnvm_cuda.py           | 375 +++++++
 tutorials/autotvm/tune_nnvm_mobile_gpu.py     | 381 ++++++++
 tutorials/autotvm/tune_simple_template.py     |   6 +-
 tutorials/nnvm/deploy_model_on_mali_gpu.py    |   2 +-
 tutorials/nnvm/deploy_model_on_rasp.py        |   2 +-
 tutorials/topi/intro_topi.py                  |  15 +-
 51 files changed, 3083 insertions(+), 1870 deletions(-)
 create mode 100644 apps/benchmark/mobile_gpu_imagenet_bench.py
 create mode 100644 nnvm/python/nnvm/testing/densenet.py
 create mode 100644 topi/python/topi/cuda/conv2d_direct.py
 delete mode 100644 topi/python/topi/cuda/conv2d_nchw.py
 create mode 100644 topi/python/topi/cuda/conv2d_winograd.py
 create mode 100644 topi/tests/python/test_topi_conv2d_winograd.py
 create mode 100644 tutorials/autotvm/tune_nnvm_cuda.py
 create mode 100644 tutorials/autotvm/tune_nnvm_mobile_gpu.py

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index 95742e0decf4..845de0599f66 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -6,8 +6,35 @@ See results on wiki page https://github.com/dmlc/tvm/wiki/Benchmark
 
 ## How to Reproduce
 
-### ARM CPU
-We use RPC infrastructure in TVM to make device management easy. So you need to use it for reproducing benchmark results.
+To obtain the best performance, we always do auto-tuning for the specific devices and get
+the parameters for used kernels. To enable easy reproduction of our results, we release
+pre-tuned parameters for popular networks on some common devices.
+TVM will download related tuning cache files during compilation.
+
+If you don't have the following listed devices, you can still run these scripts.
+You can pick the one that is most similar to your device as argument.
+In general, the performance should also be good.
+
+It is recommended that you run tuning by yourself if you have your customized network or devices.
+Please follow the tutorial for
+[NVIDIA GPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html),
+[ARM CPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html),
+[Mobile GPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html).
+
+### NVIDIA GPU
+
+Build TVM with LLVM and CUDA enabled. [Help](https://docs.tvm.ai/install/from_source.html)
+
+```bash
+python3 gpu_imagenet_bench.py --model 1080ti
+python3 gpu_imagenet_bench.py --model titanx
+```
+
+### ARM CPU & Mali GPU
+For embedded deivces, we use RPC infrastructure in TVM to make the management easy.
+So you need to use it for reproducing benchmark results.
+
+0. Build TVM with LLVM enabled. [Help](https://docs.tvm.ai/install/from_source.html)
 
 1. Start an RPC Tracker on the host machine
 ```bash
@@ -50,24 +77,22 @@ python3 -m tvm.exec.rpc_tracker
   rasp3b       8      8     0
   ```
 
- 4. Run benchmark  
-  We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
-  and release pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
-  During compilation, TVM will download these operator parameters automatically.
-
+4. Run benchmark  
   ```bash
-  python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
-  python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
-  python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
-  python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
-  python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro  
-  ```
+  # ARM CPU
+  python3 arm_cpu_imagenet_bench.py --model rasp3b --rpc-key rasp3b
+  python3 arm_cpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
+  python3 arm_cpu_imagenet_bench.py --model pixel2 --rpc-key pixel2
+  python3 arm_cpu_imagenet_bench.py --model p20pro --rpc-key p20pro
+  python3 arm_cpu_imagenet_bench.py --model mate10pro --rpc-key mate10pro  
 
-  If your device has a same or similar SoC of the above devices, you can reuse these parameters.
-  For example, if your SoC is similar to rasp3b, use
-  ```bash
-  python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key your_custom_key
+  # Mali GPU
+  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
   ```
-  For other devices, to get the best performance, it is recommended that you tune your network by yourself. 
-  Please follow this [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
 
+### AMD GPU
+
+Build TVM with LLVM and ROCm enabled. [Help](https://docs.tvm.ai/install/from_source.html)
+```bash
+python3 gpu_imagenet_bench.py --model gfx900 --target rocm
+```
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index f5057299920c..931899069700 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -1,4 +1,4 @@
-"""Benchmark script for ARM CPU.
+"""Benchmark script for ImageNet models on ARM CPU.
 see README.md for the usage and results of this script.
 """
 import argparse
@@ -14,13 +14,60 @@
 from util import get_network, print_progress
 
 
+def evaluate_network(network, target, target_host, number):
+    # connect to remote device
+    tracker = tvm.rpc.connect_tracker(args.host, args.port)
+    remote = tracker.request(args.rpc_key)
+
+    print_progress(network)
+    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+    print_progress("%-20s building..." % network)
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, target_host=target_host,
+            shape={'data': input_shape}, params=params, dtype=dtype)
+
+    tmp = tempdir()
+    if 'android' in str(target):
+        from tvm.contrib import ndk
+        filename = "%s.so" % network
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
+    else:
+        filename = "%s.tar" % network
+        lib.export_library(tmp.relpath(filename))
+
+    # upload library and params
+    print_progress("%-20s uploading..." % network)
+    ctx = remote.context(str(target), 0)
+    remote.upload(tmp.relpath(filename))
+    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+
+    rlib = remote.load_module(filename)
+    module = runtime.create(graph, rlib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**rparams)
+
+    del rparams
+
+    # evaluate
+    print_progress("%-20s evaluating..." % network)
+    ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-                        ['resnet-18', 'resnet-34', 'vgg-16', 'mobilenet', 'squeezenet v1.1', ])
-    parser.add_argument("--device", type=str, required=True, choices=
+                        ['resnet-18', 'resnet-34', 'vgg-16',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.0', 'squeezenet v1.1'])
+    parser.add_argument("--model", type=str, choices=
                         ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
-                         'pixel2', 'rasp3b', 'pynq'])
+                         'pixel2', 'rasp3b', 'pynq'], default='rk3399',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
@@ -34,47 +81,12 @@
     else:
         networks = [args.network]
 
-    target = tvm.target.arm_cpu(model=args.device)
-
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
+    target = tvm.target.arm_cpu(model=args.model)
+    target_host = None
 
     print("--------------------------------------------------")
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
-        print_progress(network)
-        net, params, input_shape, output_shape = get_network(network, batch_size=1)
-
-        print_progress("%-20s building..." % network)
-        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
-            graph, lib, params = nnvm.compiler.build(
-                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
-
-        tmp = tempdir()
-        if 'android' in str(target):
-            from tvm.contrib import ndk
-            filename = "%s.so" % network
-            lib.export_library(tmp.relpath(filename), ndk.create_shared)
-        else:
-            filename = "%s.tar" % network
-            lib.export_library(tmp.relpath(filename))
-
-        # upload library and params
-        print_progress("%-20s uploading..." % network)
-        ctx = remote.context(str(target), 0)
-        remote.upload(tmp.relpath(filename))
-        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
-        rlib = remote.load_module(filename)
-        module = runtime.create(graph, rlib, ctx)
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**rparams)
-
-        # evaluate
-        print_progress("%-20s evaluating..." % network)
-        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
-        prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-        print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+        evaluate_network(network, target, target_host, args.number)
+
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index fca4e35b6516..873e60f82c59 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -1,80 +1,61 @@
-""" Benchmark script for performance on GPUs.
-
-For example, run the file with:
-`python gpu_imagenet_bench.py --model=mobilenet --target=cuda`.
-For more details about how to set up the inference environment on GPUs,
-please refer to NNVM Tutorial: ImageNet Inference on the GPU
+"""Benchmark script for ImageNet models on GPU.
+see README.md for the usage and results of this script.
 """
-import time
 import argparse
+
 import numpy as np
+
 import tvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
 import nnvm.compiler
 import nnvm.testing
-from tvm.contrib import util, nvcc
-from tvm.contrib import graph_runtime as runtime
 
-@tvm.register_func
-def tvm_callback_cuda_compile(code):
-    ptx = nvcc.compile_cuda(code, target="ptx")
-    return ptx
+from util import get_network
+
 
-def main():
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', type=str, required=True,
-                        choices=['resnet', 'mobilenet'],
-                        help="The model type.")
-    parser.add_argument('--target', type=str, required=True,
-                        choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
-                        help="Compilation target.")
-    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
-    parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.")
-    parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.")
+    parser.add_argument("--network", type=str, choices=
+        ['resnet-18', 'resnet-34', 'resnet-50', 'vgg-16', 'vgg-19',
+         'inception_v3', 'mobilenet', 'mobilenet_v2', 'densenet-121'])
+    parser.add_argument("--model", type=str,
+                        choices=['1080ti', 'titanx', 'gfx900'], default='1080ti',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
+    parser.add_argument("--number", type=int, default=500)
+    parser.add_argument("--target", type=str,
+                        choices=['cuda', 'opencl', 'rocm', 'nvptx', 'metal'], default='cuda',
+                        help="The tvm compilation target")
     args = parser.parse_args()
-    opt_level = args.opt_level
-    num_iter = args.num_iter
-    ctx = tvm.context(args.target, 0)
-    batch_size = 1
-    num_classes = 1000
-    image_shape = (3, 224, 224)
 
-    data_shape = (batch_size,) + image_shape
-    out_shape = (batch_size, num_classes)
-    if args.model == 'resnet':
-        net, params = nnvm.testing.resnet.get_workload(
-            batch_size=1, image_shape=image_shape)
-    elif args.model == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(
-            batch_size=1, image_shape=image_shape)
-    else:
-        raise ValueError('no benchmark prepared for {}.'.format(args.model))
+    dtype = 'float32'
 
-    if args.target == "cuda":
-        unroll = 1400
+    if args.network is None:
+        networks = ['resnet-50', 'mobilenet', 'vgg-19', 'inception_v3']
     else:
-        unroll = 128
-    with nnvm.compiler.build_config(opt_level=opt_level):
-        with tvm.build_config(auto_unroll_max_step=unroll,
-                              unroll_explicit=(args.target != "cuda")):
-            graph, lib, params = nnvm.compiler.build(
-                net, args.target, shape={"data": data_shape}, params=params)
+        networks = [args.network]
 
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-    module = runtime.create(graph, lib, ctx)
-    module.set_input(**params)
-    module.set_input("data", data)
-    module.run()
-    out = module.get_output(0, tvm.nd.empty(out_shape))
-    out.asnumpy()
+    target = tvm.target.create('%s -model=%s' % (args.target, args.model))
 
-    print('benchmark args: {}'.format(args))
-    ftimer = module.module.time_evaluator("run", ctx, num_iter)
-    for i in range(args.repeat):
-        prof_res = ftimer()
-        print(prof_res)
-        # sleep for avoiding device overheat
-        if i + 1 != args.repeat:
-            time.sleep(45)
+    print("--------------------------------------------------")
+    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+    print("--------------------------------------------------")
+    for network in networks:
+        net, params, input_shape, output_shape = get_network(network, batch_size=1)
 
-if __name__ == '__main__':
-    main()
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # create runtime
+        ctx = tvm.context(str(target), 0)
+        module = runtime.create(graph, lib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+        print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
new file mode 100644
index 000000000000..da1207381c86
--- /dev/null
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -0,0 +1,90 @@
+"""Benchmark script for ImageNet models on mobile GPU.
+see README.md for the usage and results of this script.
+"""
+import argparse
+
+import numpy as np
+
+import tvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+import nnvm.compiler
+import nnvm.testing
+
+from util import get_network, print_progress
+
+def evaluate_network(network, target, target_host, number):
+    # connect to remote device
+    tracker = tvm.rpc.connect_tracker(args.host, args.port)
+    remote = tracker.request(args.rpc_key)
+
+    print_progress(network)
+    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+    print_progress("%-20s building..." % network)
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, target_host=target_host,
+            shape={'data': input_shape}, params=params, dtype=dtype)
+
+    tmp = tempdir()
+    if 'android' in str(target) or 'android' in str(target_host):
+        from tvm.contrib import ndk
+        filename = "%s.so" % network
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
+    else:
+        filename = "%s.tar" % network
+        lib.export_library(tmp.relpath(filename))
+
+    # upload library and params
+    print_progress("%-20s uploading..." % network)
+    ctx = remote.context(str(target), 0)
+    remote.upload(tmp.relpath(filename))
+    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+
+    rlib = remote.load_module(filename)
+    module = runtime.create(graph, rlib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**rparams)
+
+    del rparams
+
+    # evaluate
+    print_progress("%-20s evaluating..." % network)
+    ftimer = module.module.time_evaluator("run", ctx, number=number, repeat=3)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'vgg-16',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.1'])
+    parser.add_argument("--model", type=str, choices=
+                        ['rk3399'], default='rk3399',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
+    parser.add_argument("--host", type=str, default='localhost')
+    parser.add_argument("--port", type=int, default=9190)
+    parser.add_argument("--rpc-key", type=str, required=True)
+    parser.add_argument("--number", type=int, default=10)
+    args = parser.parse_args()
+
+    dtype = 'float32'
+
+    if args.network is None:
+        networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+    else:
+        networks = [args.network]
+
+    target = tvm.target.mali(model=args.model)
+    target_host = tvm.target.arm_cpu(model=args.model)
+
+    print("--------------------------------------------------")
+    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+    print("--------------------------------------------------")
+
+    for network in networks:
+        evaluate_network(network, target, target_host, args.number)
\ No newline at end of file
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
index bd4a3d04a1d1..4825ac96571d 100644
--- a/apps/benchmark/util.py
+++ b/apps/benchmark/util.py
@@ -27,20 +27,25 @@ def get_network(name, batch_size):
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
-    if "resnet" in name:
+    if name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'mobilenet_v2':
+        net, params = nnvm.testing.mobilenet_v2.get_workload(batch_size=batch_size)
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif "resnet" in name:
         n_layer = int(name.split('-')[1])
         net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
     elif "vgg" in name:
         n_layer = int(name.split('-')[1])
         net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
-    elif name == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif "densenet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.densenet.get_workload(num_layers=n_layer, batch_size=batch_size)
     elif "squeezenet" in name:
         version = name.split("_v")[1]
         net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version)
-    elif name == 'inception_v3':
-        input_shape = (1, 3, 299, 299)
-        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
     elif name == 'custom':
         # an example for custom network
         from nnvm.testing import utils
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index acf37999cc15..3bf03a1e0039 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -8,6 +8,7 @@
 from . import mlp
 from . import resnet
 from . import vgg
+from . import densenet
 from . import squeezenet
 from . import inception_v3
 from . import dcgan
diff --git a/nnvm/python/nnvm/testing/densenet.py b/nnvm/python/nnvm/testing/densenet.py
new file mode 100644
index 000000000000..e97d306af933
--- /dev/null
+++ b/nnvm/python/nnvm/testing/densenet.py
@@ -0,0 +1,49 @@
+"""
+DenseNet, load model from gluon model zoo
+
+Reference:
+Huang, Gao, et al. "Densely Connected Convolutional Networks." CVPR 2017
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, num_layers=121, dtype="float32"):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    num_layers : int, optional
+        The number of layers
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision import get_model
+
+    image_shape = (1, 3, 224, 224)
+
+    block = get_model('densenet%d' % num_layers, classes=num_classes, pretrained=False)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/nnvm/python/nnvm/testing/resnet.py b/nnvm/python/nnvm/testing/resnet.py
index 6de0213679d1..e63ceff7c3f0 100644
--- a/nnvm/python/nnvm/testing/resnet.py
+++ b/nnvm/python/nnvm/testing/resnet.py
@@ -46,18 +46,16 @@ def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True):
         Base name of the operators
     """
     if bottle_neck:
-        # the same as https://github.com/facebook/fb.resnet.torch#notes,
-        # a bit difference with origin paper
         bn1 = sym.batch_norm(data=data, epsilon=2e-5, name=name + '_bn1')
         act1 = sym.relu(data=bn1, name=name + '_relu1')
         conv1 = sym.conv2d(
             data=act1, channels=int(num_filter*0.25), kernel_size=(1, 1),
-            strides=(1, 1), padding=(0, 0), use_bias=False, name=name + '_conv1')
+            strides=stride, padding=(0, 0), use_bias=False, name=name + '_conv1')
         bn2 = sym.batch_norm(data=conv1, epsilon=2e-5, name=name + '_bn2')
         act2 = sym.relu(data=bn2, name=name + '_relu2')
         conv2 = sym.conv2d(
             data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3),
-            strides=stride, padding=(1, 1), use_bias=False, name=name + '_conv2')
+            strides=(1, 1), padding=(1, 1), use_bias=False, name=name + '_conv2')
         bn3 = sym.batch_norm(data=conv2, epsilon=2e-5, name=name + '_bn3')
         act3 = sym.relu(data=bn3, name=name + '_relu3')
         conv3 = sym.conv2d(
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
index 42a62af023e7..3f9a870d31c0 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
@@ -46,14 +46,13 @@ def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, b
         Workspace used in convolution operator
     """
     if bottle_neck:
-        # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper
         bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
         act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
-        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
                                    no_bias=True, workspace=workspace, name=name + '_conv1')
         bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
         act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
-        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
                                    no_bias=True, workspace=workspace, name=name + '_conv2')
         bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
         act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 38b5f99eacb9..8a8940817237 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -164,6 +164,31 @@ def measure_option(builder, runner):
         Specify how to build programs
     runner: Runner
         Specify how to run programs
+
+    Examples
+    --------
+    # example setting for using local devices
+    >>> measure_option = autotvm.measure_option(
+    >>>     builder=autotvm.LocalBuilder(),      # use all local cpu cores for compilation
+    >>>     runner=autotvm.LocalRunner(          # measure them sequentially
+    >>>         number=10,
+    >>>         timeout=5)
+    >>> )
+
+    # example setting for using remote devices
+    >>> measure_option = autotvm.measure_option(
+    >>>    builder=autotvm.LocalBuilder(),  # use all local cpu cores for compilation
+    >>>    runner=autotvm.RPCRunner(
+    >>>        'rasp3b', 'locahost', 9190, # device key, host and port of the rpc tracker
+    >>>        number=4,
+    >>>        timeout=4) # timeout of a run on the device. RPC request waiting time is excluded.
+    >>>)
+
+    Note
+    ----
+    To make measurement results accurate, you should pick the correct value for the argument
+    `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
+    so it is recommended. The typical value for NVIDIA GPU is 100 ms.
     """
     from .measure_methods import LocalBuilder, LocalRunner
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 6a3cd028393c..ec3eb7e611e0 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -72,12 +72,15 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'):
                 raise ValueError("Invalid build_func" + build_func)
 
         self.build_func = build_func
-        self.tmp_dir = tempfile.mkdtemp()
         self.executor = LocalExecutor(timeout=timeout)
+        self.tmp_dir = tempfile.mkdtemp()
 
     def build(self, measure_inputs):
         results = []
 
+        shutil.rmtree(self.tmp_dir)
+        self.tmp_dir = tempfile.mkdtemp()
+
         for i in range(0, len(measure_inputs), self.n_parallel):
             futures = []
             for inp in measure_inputs[i:i + self.n_parallel]:
@@ -95,7 +98,7 @@ def build(self, measure_inputs):
                     results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT,
                                                  self.timeout, time.time()))
                 elif res.error is not None:
-                    # instantiation errorD
+                    # instantiation error
                     if isinstance(res.error, InstantiationError):
                         results.append(MeasureResult((res.error,),
                                                      MeasureErrorNo.INSTANTIATION_ERROR,
@@ -120,9 +123,6 @@ def build(self, measure_inputs):
 
         return results
 
-    def __del__(self):
-        shutil.rmtree(self.tmp_dir)
-
 
 class RPCRunner(Runner):
     """Run generated code on remove devices.
@@ -519,7 +519,7 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
     return remote
 
 
-def check_remote(target, device_key, host=None, port=None, priority=2, timeout=10):
+def check_remote(target, device_key, host=None, port=None, priority=100, timeout=10):
     """
     Check the availability of a remote device
 
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 77d9b6190a78..910f7595ad01 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -271,7 +271,7 @@ def pick_best(in_file, out_file):
     parser.add_argument("--code", action='store_true')
 
     args = parser.parse_args()
-    logger.basicConfig(level=logger.INFO)
+    logging.basicConfig(level=logger.INFO)
 
     if args.mode == 'pick':
         args.o = args.o or args.i + ".best.log"
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 7592fc5af7df..8efb0e61b518 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -9,7 +9,8 @@
 from .task import Task, create, register, template, get_config, args_to_workload
 from .space import ConfigSpace, ConfigEntity
 from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, FallbackContext, dispatcher
+from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \
+    FallbackContext, clear_fallback_cache
 
 from .topi_integration import register_topi_compute, register_topi_schedule
-from .nnvm_integration import extract_from_graph
+from .nnvm_integration import extract_from_graph, extract_from_multiple_graph
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 398e850d871d..8e159cc412c9 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -289,15 +289,20 @@ def __init__(self):
         self.memory = {}
         self.silent = False
 
+        # a set to prevent print duplicated message
+        self.messages = set()
+
     def _query_inside(self, target, workload):
         key = (str(target), workload)
         if key in self.memory:
             return self.memory[key]
 
         if not self.silent:
-            logger.warning(
-                "Cannot find config for target=%s, workload=%s. A fallback configuration "
-                "is used, which may bring great performance regression.", target, workload)
+            msg = "Cannot find config for target=%s, workload=%s. A fallback configuration "\
+                  "is used, which may bring great performance regression." % (target, workload)
+            if msg not in self.messages:
+                self.messages.add(msg)
+                logger.warning(msg)
         cfg = FallbackConfigEntity()
 
         # cache this config
@@ -320,3 +325,23 @@ def clear_cache(self, target, workload):
             del self.memory[key]
 
 DispatchContext.current = FallbackContext()
+
+def clear_fallback_cache(target, workload):
+    """Clear fallback cache. Pass the same argument as _query_inside to this function
+    to clean the cache.
+
+    Parameters
+    ----------
+    target: Target
+        The current target
+    workload : Workload
+        The current workload.
+
+    Note
+    ----
+    This is used in alter_op_layout to clear the bad cache created before call topi compute function
+    """
+    context = DispatchContext.current
+    while not isinstance(context, FallbackContext):
+        context = context._old_ctx
+    context.clear_cache(target, workload)
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 9138cc288372..80b62229a34e 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -208,7 +208,7 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     ----------
     graph : Graph
         The graph to tune
-    shape : dict of str to tuple, optional
+    shape : dict of str to tuple
         The input shape to the graph
     dtype : str or dict of str to str
         The input types to the graph
@@ -249,6 +249,69 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
 
     logger.disabled = old_state
 
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
+
+
+def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_host=None):
+    """ Extract tuning tasks from multiple nnvm graphs.
+
+    This function is the multiple graph version of extract_from_graph
+
+    Parameters
+    ----------
+    graphs : List of Graph
+        The list of graphs to tune
+    shapes : List of dict of str to tuple
+        The input shape to the graph
+    dtypes : List of str or dict of str to str
+        The input types to the graph
+    target: tvm.target.Target
+        The compilation target
+    symbols : Array of nnvm.symbol
+        Array of nnvm symbols want to be tuned
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    import nnvm.compiler
+
+    env = TaskExtractEnv.get()
+
+    topi_funcs = []
+    for sym_name in symbols:
+        if sym_name in env.symbol2topi:
+            topi_funcs.extend(env.symbol2topi[sym_name])
+        else:
+            warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+
+    nnvm.compiler.engine.clear_cache()
+    for graph, shape, dtype in zip(graphs, shapes, dtypes):
+        nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
+
+    logger.disabled = old_state
+
+    # create tasks for target
     tasks = []
     for task_name, args in env.get_tasks():
         tasks.append(create(task_name, args,
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index 5a34353acfe9..f9bf60237776 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -900,6 +900,7 @@ def __repr__(self):
         return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key,
                                 self.code_hash, self.index)
 
+
 class FallbackConfigEntity(ConfigSpace):
     """The config entity created to support fallback"""
 
@@ -926,18 +927,74 @@ def fallback_split(self, name, constraints):
         Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [7, 7, 1]
         """
         space = self.space_map[name]
+        assert isinstance(space, SplitSpace)
         assert len(constraints) == space.num_outputs
-        indices = np.arange(space.num_outputs)
 
         # '-1' means no constraint
         constraints = [x if x != -1 else 1e10 for x in constraints]
 
-        for entity in reversed(space.entities):
-            if all([entity.size[i] <= constraints[i] for i in indices]):
-                self._entity_map[name] = entity
-                return
+        entity = self._entity_map[name]
+        now = space.product
+
+        for i in reversed(range(space.num_outputs)):
+            factors = get_factors(now)
+
+            find = len(factors) - 1
+            for j, f in enumerate(factors):
+                if f > constraints[i]:
+                    find = j - 1
+                    break
+
+            if find >= 0:
+                entity.size[i] = factors[find]
+                now //= factors[find]
+            else:
+                raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
+
+    def fallback_with_reference_log(self, ref_log):
+        """A data driven fallback mechanism.
+        We use tuned parameters from TopHub as reference data.
+        For an unseen shape, we find the most similar tuned one from TopHub and
+        mimic its parameters.
+
+        Parameters
+        ----------
+        ref_log: List of (MeasureInput, MeasureResult)
+            The reference log
+        """
+        knob_names = [x for x in self.space_map.keys() if
+                      isinstance(self.space_map[x], SplitSpace)]
+
+        # find best match config in reference data by matching tiling factors
+        factor_list = []
+        for knob_name in knob_names:
+            factor_list.append(get_factors(self.space_map[knob_name].product))
+
+        best_match_cfg = None
+        best_match_score = 0
+        for inp, _ in ref_log:
+            match_score = 0
+            for i, knob_name in enumerate(knob_names):
+                factors = get_factors(int(np.prod(inp.config[knob_name].size)))
+                match_score += (float(len(set(factor_list[i]).intersection(factors))) /
+                                len(factor_list[i]))
+
+                if match_score > best_match_score:
+                    best_match_score, best_match_cfg = match_score, inp.config
+
+        if best_match_cfg is None:
+            return
+
+        # mimic its tiling strategy
+        for knob_name in knob_names:
+            constraint = list(best_match_cfg[knob_name].size)
+            constraint[0] = -1
+            self.fallback_split(knob_name, constraint)
 
-        raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
+        # copy other knobs
+        for knob_name in self.space_map.keys():
+            if not isinstance(self.space_map[knob_name], SplitSpace):
+                self._entity_map[knob_name] = best_match_cfg[knob_name]
 
     def __repr__(self):
         return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash)
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 4982455038fc..bde706ee6cfb 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -2,7 +2,7 @@
 TopHub: Tensor Operator Hub
 To get the best performance, we typically need auto-tuning for the specific devices.
 TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets.
-TVM will download these parameters for you when you create the target for the first time.
+TVM will download these parameters for you when you call nnvm.compiler.build_module .
 """
 # pylint: disable=invalid-name
 
@@ -13,15 +13,21 @@
 from .task import ApplyHistoryBest
 from .. import target as _target
 from ..contrib.download import download
+from .record import load_from_file
 
 # root path to store TopHub files
 AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")
 
 # the version of each package
 PACKAGE_VERSION = {
-    'vta':     "v0.01",
     'arm_cpu': "v0.01",
-    'cuda':    "v0.01",
+
+    'cuda':    "v0.02",
+    'rocm':    "v0.01",
+    'opencl':  "v0.01",
+    'mali':    "v0.01",
+
+    'vta':     "v0.01",
 }
 
 logger = logging.getLogger('autotvm')
@@ -30,6 +36,9 @@ def _alias(name):
     """convert alias for some packages"""
     table = {
         'vtacpu': 'vta',
+
+        'metal': 'opencl',
+        'nvptx': 'cuda'
     }
     return table.get(name, name)
 
@@ -60,6 +69,7 @@ def context(target, extra_files=None):
 
     all_packages = list(PACKAGE_VERSION.keys())
     for name in possible_names:
+        name = _alias(name)
         if name in all_packages:
             check_backend(name)
 
@@ -121,3 +131,51 @@ def download_package(package_name):
     logger.info("Download pre-tuned parameters package %s", package_name)
     download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s"
              % package_name, os.path.join(rootpath, package_name), True, verbose=0)
+
+
+# global cache for load_reference_log
+REFERENCE_LOG_CACHE = {}
+
+def load_reference_log(backend, model, workload_name, template_key):
+    """ Load reference log from TopHub to support fallback in template.
+    Template will use these reference logs to choose fallback config.
+
+    Parameters
+    ----------
+    backend: str
+        The backend name
+    model: str
+        The name of the model
+    workload_name: str
+        The name of the workload. (The first item in the workload tuple)
+    template_key: str
+        The template key
+    """
+
+    backend = _alias(backend)
+    version = PACKAGE_VERSION[backend]
+    package_name = "%s_%s.log" % (backend, version)
+    filename = os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)
+
+    global REFERENCE_LOG_CACHE
+    key = (backend, model, workload_name, template_key)
+
+    if key not in REFERENCE_LOG_CACHE:
+        tmp = []
+        if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
+            find = False
+            inp = None
+            for inp, res in load_from_file(filename):
+                if model == inp.target.model:
+                    find = True
+                    break
+            if not find and inp:
+                model = inp.target.model
+
+            for inp, res in load_from_file(filename):
+                if (model == inp.target.model and inp.task.workload[0] == workload_name and
+                        inp.config.template_key == template_key):
+                    tmp.append((inp, res))
+        REFERENCE_LOG_CACHE[key] = tmp
+
+    return REFERENCE_LOG_CACHE[key]
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index cffbb9798392..abd7ec4fad0b 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -34,6 +34,7 @@ def __init__(self, task, **kwargs):
         # time to leave
         self.ttl = None
         self.n_trial = None
+        self.early_stopping = None
 
     def has_next(self):
         """Whether has next untried config in the space
@@ -92,6 +93,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
         self.n_trial = n_trial
+        self.early_stopping = early_stopping
 
         old_level = logger.level
 
@@ -127,18 +129,18 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
                              res, config)
 
             i += len(results)
+            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
 
             self.update(inputs, results)
-
             for callback in callbacks:
                 callback(self, inputs, results)
 
-            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
             if i >= self.best_iter + early_stopping:
                 logger.debug("Early stopped. Best iter: %d.", self.best_iter)
                 break
 
             if error_ct > 150:
+                logging.basicConfig()
                 logger.warning("Too many errors happen in the tuning. Now is in debug mode")
                 logger.setLevel(logging.DEBUG)
             else:
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
index bda3ee26e062..a725a1eeabed 100644
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py
@@ -53,7 +53,7 @@ class XGBoostCostModel(CostModel):
     upper_model: XGBoostCostModel, optional
         The upper model used in transfer learning
     """
-    def __init__(self, task, feature_type, loss_type, num_threads=4, log_interval=25,
+    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25,
                  upper_model=None):
         super(XGBoostCostModel, self).__init__()
 
diff --git a/python/tvm/contrib/download.py b/python/tvm/contrib/download.py
index 434216a2652c..0dcbb56ad663 100644
--- a/python/tvm/contrib/download.py
+++ b/python/tvm/contrib/download.py
@@ -64,8 +64,8 @@ def _download_progress(count, block_size, total_size):
         progress_size = int(count * block_size)
         speed = int(progress_size / (1024 * duration))
         percent = min(int(count * block_size * 100 / total_size), 100)
-        sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
-                         (percent, progress_size / (1024 * 1024), speed, duration))
+        sys.stdout.write("\r...%d%%, %.2f MB, %d KB/s, %d seconds passed" %
+                         (percent, progress_size / (1024.0 * 1024), speed, duration))
         sys.stdout.flush()
 
     if sys.version_info >= (3,):
diff --git a/python/tvm/target.py b/python/tvm/target.py
index 07200058a021..b3a9086e74b1 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -105,6 +105,13 @@ def libs(self):
             self._libs = [l.value for l in self.libs_array]
         return self._libs
 
+    @property
+    def model(self):
+        for opt in self.options_array:
+            if opt.value.startswith('-model='):
+                return opt.value[7:]
+        return 'unknown'
+
     def __enter__(self):
         _api_internal._EnterTargetScope(self)
         return self
@@ -354,52 +361,60 @@ def dispatch_func(func, *args, **kwargs):
     return fdecorate
 
 
-def cuda(options=None):
+def cuda(model='unknown', options=None):
     """Returns a cuda target.
 
     Parameters
     ----------
+    model: str
+        The model of cuda device (e.g. 1080ti)
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("cuda", *options)
+    opts = _merge_opts(['-model=%s' % model], options)
+    return _api_internal._TargetCreate("cuda", *opts)
 
 
-def rocm(options=None):
+def rocm(model='unknown', options=None):
     """Returns a ROCM target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("rocm", *options)
+    opts = _merge_opts(["-model=%s" % model], options)
+    return _api_internal._TargetCreate("rocm", *opts)
 
 
-def mali(options=None):
+def mali(model='unknown', options=None):
     """Returns a ARM Mali GPU target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    opts = ["-device=mali"]
+    opts = ["-device=mali", '-model=%s' % model]
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("opencl", *opts)
 
 
-def intel_graphics(options=None):
+def intel_graphics(model='unknown', options=None):
     """Returns an Intel Graphics target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    opts = ["-device=intel_graphics"]
+    opts = ["-device=intel_graphics", '-model=%s' % model]
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("opencl", *opts)
 
@@ -436,6 +451,7 @@ def arm_cpu(model='unknown', options=None):
         "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"],
         "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"],
         "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"],
+        "ultra96":   ["-model=ultra96", "-target=aarch64-linux-gnu -mattr=+neon"],
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
@@ -494,5 +510,4 @@ def current_target(allow_none=True):
     ------
     ValueError if current target is not set.
     """
-    target_str = _api_internal._GetCurrentTarget(allow_none)
-    return create(target_str) if target_str is not None else None
+    return _api_internal._GetCurrentTarget(allow_none)
diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index 7acf79ff5308..99f9f0c073c3 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -583,8 +583,7 @@ class Canonical::Internal : public IRMutator {
     while (i < suma->elem.size() && j < sumb->elem.size()) {
       const auto& a = suma->elem[i];
       const auto& b = sumb->elem[j];
-      if (a.value.same_as(b.value)) {
-        CHECK_EQ(a.level, b.level);
+      if (a.value.same_as(b.value) && a.level == b.level) {
         ComExprEntry e = a;
         e.scale = a.scale + b.scale * bscale;
         if (e.scale != 0) {
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index f1e224e5a9d1..84c9918530f5 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -252,11 +252,8 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
       this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
       this->device_type = device_type;
       this->devices = devices_matched;
-      LOG(INFO) << "Initialize OpenCL platform \'" << this->platform_name << '\'';
       break;
     }
-    LOG(INFO) << "\'" << cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME)
-              << "\' platform has no OpenCL device: " << device_type << " mode";
   }
   if (this->platform_id == nullptr) {
     LOG(WARNING) << "No OpenCL device";
@@ -273,9 +270,6 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
     this->queues.push_back(
         clCreateCommandQueue(this->context, did, 0, &err_code));
     OPENCL_CHECK_ERROR(err_code);
-    LOG(INFO) << type_key << "(" << i
-              << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
-              << "\' cl_device_id=" << did;
   }
 }
 
diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py
index e51e34e95a3b..7866226083cc 100644
--- a/tests/python/unittest/test_autotvm_space.py
+++ b/tests/python/unittest/test_autotvm_space.py
@@ -30,14 +30,21 @@ def test_split():
     cfg = FallbackConfigEntity()
     cfg.define_split('tile_n', cfg.axis(128), num_outputs=3)
     cfg.fallback_split('tile_n', [-1, 8, 4])
-
     assert cfg['tile_n'].size == [4, 8, 4]
 
     cfg = FallbackConfigEntity()
     cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
     cfg.fallback_split('tile_n', [-1, 8, 4])
-
     assert cfg['tile_n'].size == [7, 7, 1]
 
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
+    try:
+        cfg.fallback_split('tile_n', [-1, 1, 0])
+        assert False
+    except RuntimeError:
+        pass
+
+
 if __name__ == '__main__':
     test_split()
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index 9678fff8ef9b..567c5ad93e93 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -61,7 +61,6 @@ def test_make_attrs():
     datrr = tvm.load_json(tvm.save_json(dattr))
     assert dattr.name.value == "xyz"
 
-
 def test_make_sum():
     A = tvm.placeholder((2, 10), name='A')
     k = tvm.reduce_axis((0,10), "k")
diff --git a/tests/python/unittest/test_lang_target.py b/tests/python/unittest/test_lang_target.py
index f7309fc30819..42e2c3fcb2e3 100644
--- a/tests/python/unittest/test_lang_target.py
+++ b/tests/python/unittest/test_lang_target.py
@@ -34,20 +34,21 @@ def test_target_dispatch():
     with tvm.target.create("metal"):
         assert mygeneric(1) == 3
 
-    assert tvm.target.current_target() == None
+    assert tvm.target.current_target() is None
 
 
 def test_target_string_parse():
-    target = tvm.target.create("cuda -libs=cublas,cudnn")
+    target = tvm.target.create("cuda -model=unknown -libs=cublas,cudnn")
 
     assert target.target_name == "cuda"
-    assert target.options == ['-libs=cublas,cudnn']
+    assert target.options == ['-model=unknown', '-libs=cublas,cudnn']
     assert target.keys == ['cuda', 'gpu']
     assert target.libs == ['cublas', 'cudnn']
-    assert str(target) == str(tvm.target.cuda("-libs=cublas,cudnn"))
-
+    assert str(target) == str(tvm.target.cuda(options="-libs=cublas,cudnn"))
 
     assert tvm.target.intel_graphics().device_name == "intel_graphics"
+    assert tvm.target.mali().device_name == "mali"
+    assert tvm.target.arm_cpu().device_name == "arm_cpu"
 
 if __name__ == "__main__":
     test_target_dispatch()
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index a3945a4c9d76..6a924a4b133c 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -42,9 +42,24 @@ def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
     """spatial packing template"""
     return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)
 
-@autotvm.task.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
+@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
 def schedule_conv2d_nchw_arm_cpu(cfg, outs):
-    """TOPI schedule callback"""
+    """TOPI schedule callback for conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d.
+    """
     s = tvm.create_schedule([x.op for x in outs])
 
     def _callback(op):
@@ -120,19 +135,16 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
     cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
-    # ====================================================================
 
+    # fallback support
     if cfg.is_fallback:
-        if num_tile == 2:
-            cfg.fallback_split('tile_co', [-1, 8])
-            cfg.fallback_split('tile_oh', [-1, 2])
-            cfg.fallback_split('tile_ow', [-1, 8])
-        else:
-            cfg.fallback_split('tile_co', [-1, 16, 4])
-            cfg.fallback_split('tile_oh', [-1, 1, 1])
-            cfg.fallback_split('tile_ow', [-1, 1, 4])
-        cfg['ann_reduce'].anns = ['unroll', 'unroll']
-        cfg['ann_spatial'].anns = ['none', 'unroll', 'vec']
+        if num_tile == 2:     # arm cpu
+            ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+        elif num_tile == 3:  # mali gpu
+            ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+    # ====================================================================
 
     VC = cfg["tile_co"].size[-1]
     VH = cfg["tile_oh"].size[-1]
@@ -478,8 +490,8 @@ def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, til
                           tile_size)
 
 
-@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
-                                     'arm_cpu', ['winograd'])
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                'arm_cpu', ['winograd'])
 def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
     """TOPI schedule callback"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -517,11 +529,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
                                          layout, out_dtype)
         cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
 
-        if cfg.is_fallback: # if is fallback, clear query cache and return None
-            context = autotvm.DispatchContext.current
-            while not isinstance(context, autotvm.FallbackContext):
-                context = context._old_ctx
-            context.clear_cache(tvm.target.current_target(), workload)
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
             return None
 
         if cfg.template_key == 'direct':  # packing weight tensor
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index c341d1a5b325..2556af36e5f9 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -9,11 +9,11 @@
 from ..util import traverse_inline
 
 # register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-autotvm.task.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct',
-                                   depthwise_conv2d_nchw.fdefault)
+autotvm.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct',
+                              depthwise_conv2d_nchw.fdefault)
 
 # register customized schedule for arm cpu.
-@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
+@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
 def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
@@ -37,16 +37,19 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         A, B, C = data, kernel, output
         s[data_pad].compute_inline()
 
-        # define tile
+        ##### space definition begin #####
         n, c, h, w = s[output].op.axis
-        cfg.define_split('tile_c', c, num_outputs=2)
-        cfg.define_split('tile_h', h, num_outputs=2)
-        cfg.define_split('tile_w', w, num_outputs=2)
+        _, vc = cfg.define_split('tile_c', c, num_outputs=2)
+        _, vh = cfg.define_split('tile_h', h, num_outputs=2)
+        _, vw = cfg.define_split('tile_w', w, num_outputs=2)
+        cfg.define_annotate('ann', [vh, vw, vc], policy='try_unroll_vec')
 
+        # fallback support
         if cfg.is_fallback:
-            cfg.fallback_split('tile_c', [-1, 4])
-            cfg.fallback_split('tile_h', [-1, 2])
-            cfg.fallback_split('tile_w', [-1, 4])
+            ref_log = autotvm.tophub.load_reference_log(
+                'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+        ##### space definition end #####
 
         # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
         A0 = s.cache_read(data_pad, "global", C)
@@ -78,7 +81,6 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         s[A1].compute_at(s[C0], oh)
 
         # try unroll and vectorization
-        cfg.define_annotate('ann', [ih, iw, vc], policy='try_unroll_vec')
         cfg['ann'].apply(s, C0, [ih, iw, vc],
                          axis_lens=[cfg['tile_h'].size[-1],
                                     cfg['tile_w'].size[-1],
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index b8740f811ff7..e1db2c6fdf63 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -2,10 +2,8 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from .conv2d import conv2d_cuda
-from .conv2d_nchw import schedule_conv2d_nchw
+from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw
 from .conv2d_hwcn import schedule_conv2d_hwcn
-from .depthwise_conv2d import schedule_depthwise_conv2d_nchw, schedule_depthwise_conv2d_nhwc
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
 from .reduction import schedule_reduce
@@ -13,7 +11,6 @@
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
 from .dense import dense_cuda, schedule_dense
 from .pooling import schedule_pool, schedule_global_pool
-from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw
 from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
 from .vision import *
diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py
index 3c494cdeb0fa..a7d5f742d98c 100644
--- a/topi/python/topi/cuda/conv2d.py
+++ b/topi/python/topi/cuda/conv2d.py
@@ -1,24 +1,32 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
+# pylint: disable=invalid-name
 """Compute definition for conv2d with cuda backend"""
 import tvm
+from tvm import autotvm
 from tvm.contrib import cudnn
-import topi
-from ..nn.conv2d import conv2d
-from ..util import get_const_int
 
-@conv2d.register("cuda")
-def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+from .. import nn, generic
+from ..util import get_const_int, get_const_tuple, traverse_inline
+
+from .conv2d_direct import schedule_direct_cuda
+from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
+
+
+@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd'])
+def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for cuda backend.
 
     Parameters
     ----------
-    input : tvm.Tensor
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
         4-D with shape [batch, in_channel, in_height, in_width]
 
-    filter : tvm.Tensor
+    kernel : tvm.Tensor
         4-D with shape [num_filter, in_channel, filter_height, filter_width]
 
-    stride : int or a list/tuple of two ints
+    strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
     padding : int or a list/tuple of two ints
@@ -27,45 +35,56 @@ def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
     layout : str
         layout of data
 
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    assert isinstance(stride, int) or len(stride) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-    if isinstance(padding, int):
-        pad_h = pad_w = padding
-    else:
-        pad_h, pad_w = padding
-    # handle dilation
-    dilation_h = dilation_w = 1
-    kernel_tvm = kernel
-    kernel_cudnn = kernel
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        kernel_before_dilation = kernel.op.input_tensors[0]
-        kernel_cudnn = kernel_before_dilation
-        if layout == 'NCHW':
-            dilation_h = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
-            dilation_w = (get_const_int(kernel.shape[3]) + get_const_int(kernel_before_dilation.shape[3]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
-        elif layout == 'NHWC':
-            dilation_h = (get_const_int(kernel.shape[1]) + get_const_int(kernel_before_dilation.shape[1]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[1])
-            dilation_w = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
     target = tvm.target.current_target()
+
     if "cudnn" in target.libs:
-        assert layout != 'HWCN', "HWCN layout not supported with CUDNN."
-        tensor_format = 0 # CUDNN_TENSOR_NCHW
-        if layout == 'NHWC':
+        if layout == 'NCHW':
+            tensor_format = 0 # CUDNN_TENSOR_NCHW
+            N, _, H, W = get_const_tuple(data.shape)
+        elif layout == 'NHWC':
             tensor_format = 1 # CUDNN_TENSOR_NHWC
+            N, H, W, _ = get_const_tuple(data.shape)
+        else:
+            raise ValueError("Unsupported layout %s in cudnn" % layout)
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+
+        # handle dilation
+        stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
+        pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+
+        OH = (H + 2 * pad_h - KH) // stride_h + 1
+        OW = (W + 2 * pad_w - KW) // stride_w + 1
+        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
+
+        dilation_h = dilation_w = 1
+        kernel_before_dilation = kernel
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            kernel_before_dilation = kernel.op.input_tensors[0]
+            if layout == 'NCHW':
+                dilation_h = (get_const_int(kernel.shape[2]) +
+                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+                dilation_w = (get_const_int(kernel.shape[3]) +
+                              get_const_int(kernel_before_dilation.shape[3]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+            elif layout == 'NHWC':
+                dilation_h = (get_const_int(kernel.shape[1]) +
+                              get_const_int(kernel_before_dilation.shape[1]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[1])
+                dilation_w = (get_const_int(kernel.shape[2]) +
+                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+
         return cudnn.conv2d_forward(data,
-                                    kernel_cudnn,
+                                    kernel_before_dilation,
                                     stride_h,
                                     stride_w,
                                     pad_h,
@@ -74,10 +93,51 @@ def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
                                     dilation_w,
                                     conv_mode=1,
                                     tensor_format=tensor_format,
-                                    algo=-1) # let CUDNN choose the best algo
-    elif layout == 'NCHW':
-        return topi.nn.conv2d_nchw(data, kernel_tvm, stride, padding, out_dtype)
+                                    algo=-1)  # let CUDNN choose the best algo
+
+    if cfg.template_key == 'winograd':
+        return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
+                             pre_computed=False)
+
+    if layout == 'NCHW':
+        return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
     elif layout == 'HWCN':
-        return topi.nn.conv2d_hwcn(data, kernel_tvm, stride, padding, out_dtype)
+        return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
+
+
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
+                                ["direct", 'winograd'])
+def schedule_conv2d_nchw_cuda(cfg, outs):
+    """TOPI schedule callback of conv2d for cuda gpu
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d.
+    """
+    target = tvm.target.current_target()
+    if 'cudnn' in target.libs:
+        return generic.schedule_extern(outs)
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == 'conv2d_nchw':
+            schedule_direct_cuda(cfg, s, op.output(0))
+        if op.tag == 'conv2d_nchw_winograd':
+            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py
new file mode 100644
index 000000000000..19e7ea38f647
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_direct.py
@@ -0,0 +1,96 @@
+# pylint: disable=invalid-name
+"""The templates for cuda conv2d operators"""
+import tvm
+from tvm import autotvm
+
+def schedule_direct_cuda(cfg, s, conv):
+    """schedule optimized for batch size = 1"""
+
+    ##### space definition begin #####
+    n, f, y, x = s[conv].op.axis
+    rc, ry, rx = s[conv].op.reduce_axis
+    cfg.define_split("tile_f", f, num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=2)
+    cfg.define_split("tile_ry", ry, num_outputs=2)
+    cfg.define_split("tile_rx", rx, num_outputs=2)
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+
+    target = tvm.target.current_target()
+    if target.target_name in ['nvptx', 'rocm']:
+        cfg.define_knob("unroll_explicit", [1])
+    else:
+        cfg.define_knob("unroll_explicit", [0, 1])
+
+    # fallback support
+    if cfg.is_fallback:
+        ref_log = autotvm.tophub.load_reference_log(
+            target.target_name, target.model, 'conv2d', 'direct')
+        cfg.fallback_with_reference_log(ref_log)
+    ##### space definition end #####
+
+    pad_data, kernel = s[conv].op.input_tensors
+
+    s[pad_data].compute_inline()
+    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    if conv.op in s.outputs:
+        output = conv
+        OL = s.cache_write(conv, 'local')
+    else:
+        output = s.outputs[0].output(0)
+        s[conv].set_scope('local')
+        OL = conv
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [OL])
+    WW = s.cache_read(kernel, 'shared', [OL])
+
+    # tile and bind spatial axes
+    n, f, y, x = s[output].op.axis
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    bf = s[output].fuse(n, bf)
+    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+    s[OL].compute_at(s[output], tx)
+
+    # tile reduction axes
+    n, f, y, x = s[OL].op.axis
+    rc, ry, rx = s[OL].op.reduce_axis
+    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
+    ryo, ryi = cfg['tile_rx'].apply(s, OL, ry)
+    rxo, rxi = cfg['tile_ry'].apply(s, OL, rx)
+    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
+
+    s[AA].compute_at(s[OL], rxo)
+    s[WW].compute_at(s[OL], rxo)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        n, f, y, x = s[load].op.axis
+        fused = s[load].fuse(n, f, y, x)
+        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # unroll
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
deleted file mode 100644
index 4f7539d224eb..000000000000
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ /dev/null
@@ -1,544 +0,0 @@
-#pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
-"""Schedule for conv2d_nchw with auto fusion"""
-import tvm
-import topi
-from .. import util
-from .. import tag
-from .. import generic
-
-def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    # scheduler params
-    ofactor = 16
-    hfactor = 2
-    if flag >= 96:
-        hfactor = 4
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
-    ow_size = util.get_const_int(Out.shape[3])
-    num_thread = min(max_threads, ow_size * hfactor)
-    vthread = ofactor
-    block_x = tvm.thread_axis("blockIdx.x")
-    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
-    thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
-
-    i, oc, h, w = s[Out].op.axis
-    if ow_size * hfactor == num_thread:
-        ooc, ioc = s[Out].split(oc, factor=vthread)
-        oh, ih = s[Out].split(h, factor=hfactor)
-        s[Out].reorder(ooc, oh, ioc, ih, w)
-        oc = s[Out].fuse(ooc, oh)
-        ow, _ = s[Out].split(w, nparts=ow_size)
-        w = s[Out].fuse(ow, ih)
-        s[Out].bind(w, thread_x)
-        s[Out].bind(ioc, thread_xz)
-        s[Out].bind(oc, block_x)
-    else:
-        ow, w = s[Out].split(w, factor=num_thread)
-        s[Out].bind(w, thread_x)
-        s[Out].bind(ow, block_x)
-
-    s[Out_L].compute_at(s[Out], w)
-
-    # schedule Out_L local write
-    i, oc, h, w = s[Out_L].op.axis
-    ic, dh, dw = s[Out_L].op.reduce_axis
-    s[Out_L].reorder(i, oc, h, w, ic, dh, dw)
-    s[temp_S].compute_at(s[Out_L], ic)
-    s[Filter_S].compute_at(s[Out_L], w)
-
-    num_thread1 = max_threads
-    thread_xx = tvm.thread_axis((0, num_thread1), "threadIdx.x")
-    block_xx = tvm.thread_axis("blockIdx.x")
-
-    i = s[temp].fuse(*s[temp].op.axis)
-    bx, tx = s[temp].split(i, factor=num_thread1)
-    s[temp].bind(tx, thread_xx)
-    s[temp].bind(bx, block_xx)
-
-    i = s[temp_R].fuse(*s[temp_R].op.axis)
-    bx, tx = s[temp_R].split(i, factor=num_thread1)
-    s[temp_R].bind(tx, thread_xx)
-    s[temp_R].bind(bx, block_xx)
-
-    #schedule temp_S shared mem load
-    i, ic, h, ow, iw = s[temp_S].op.axis
-    h = s[temp_S].fuse(h, ow)
-    _, tx = s[temp_S].split(h, factor=num_thread)
-    s[temp_S].bind(tx, thread_x)
-    if num_thread < max_threads:
-        s[temp_S].vectorize(iw)
-
-    #schedule Filter_S shared mem load
-    i, oc, h, w = s[Filter_S].op.axis
-    fuse_index = s[Filter_S].fuse(w, h)
-    w = s[Filter_S].fuse(fuse_index, oc)
-    tx, _ = s[Filter_S].split(w, nparts=num_thread)
-    s[Filter_S].bind(tx, thread_x)
-
-def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    if util.get_const_int(Filter_S.shape[0]) == util.get_const_int(Filter_S.shape[1]):
-        mark = util.get_const_int(Out.shape[2]) * util.get_const_int(Out.shape[3])
-        num_thread_x = 0
-        if mark % 8 == 0 and mark % 7 == 0:
-            num_thread_x = 8
-            vthread_x = 7
-        elif mark % 4 == 0 and mark % 7 == 0:
-            num_thread_x = 4
-            vthread_x = 7
-        else:
-            for i in range(5, mark):
-                if mark % i == 0 and num_thread_x == 0:
-                    vthread_x = i
-                    mark = mark // i
-                if mark % i == 0 and vthread_x > 0:
-                    num_thread_x = i
-                    break
-        if mark < 5 or num_thread_x * vthread_x > 128:
-            num_thread_x = 8
-            vthread_x = 8
-        num_thread_y = 8
-        vthread_y = 2
-        ifactor = 8
-
-        block_x = tvm.thread_axis("blockIdx.x")
-        block_y = tvm.thread_axis("blockIdx.y")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-        thread_yz = tvm.thread_axis((0, vthread_y), "vthread", name="vy")
-
-        i, oc, h, w = s[Out].op.axis
-        factor = util.get_const_int(Out.shape[3])
-        ooc, ioc = s[Out].split(oc, factor=num_thread_y*vthread_y)
-        oioc, iioc = s[Out].split(ioc, nparts=vthread_y)
-        s[Out].bind(iioc, thread_y)
-        s[Out].bind(oioc, thread_yz)
-        s[Out].bind(ooc, block_y)
-        if factor < num_thread_x*vthread_x:
-            oh, ih = s[Out].split(h, factor=num_thread_x*vthread_x//factor)
-            w = s[Out].fuse(ih, w)
-            ow, iw = s[Out].split(w, nparts=vthread_x)
-            s[Out].reorder(i, ooc, oh, oioc, ow, iioc, iw)
-            s[Out].bind(iw, thread_x)
-            s[Out].bind(ow, thread_xz)
-            s[Out].bind(oh, block_x)
-            s[Out_L].compute_at(s[Out], iw)
-        else:
-            ow, iw = s[Out].split(w, factor=num_thread_x)
-            oh, ih = s[Out].split(h, factor=vthread_x)
-            s[Out].reorder(i, ooc, oh, ow, oioc, ih, iioc, iw)
-            oh = s[Out].fuse(oh, ow)
-            s[Out].bind(iw, thread_x)
-            s[Out].bind(ih, thread_xz)
-            s[Out].bind(oh, block_x)
-            s[Out_L].compute_at(s[Out], iw)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, factor=ifactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], dw)
-
-        num_thread = tvm.target.current_target(allow_none=False).max_num_threads
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, oic, h, w, iic = s[temp_S].op.axis
-        oic = s[temp_S].fuse(oic, h, w)
-        ooic, ioic = s[temp_S].split(oic, factor=num_thread_x)
-        _, iooic = s[temp_S].split(ooic, factor=num_thread_y)
-        s[temp_S].bind(ioic, thread_x)
-        s[temp_S].bind(iooic, thread_y)
-        s[temp_S].vectorize(iic)
-
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ioc = s[Filter_S].split(oc, factor=num_thread_y)
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        s[Filter_S].bind(ioc, thread_y)
-        s[Filter_S].bind(ii, thread_x)
-    else:
-        # scheduler params
-        vthread = 2
-        opart2 = 4
-        ofactor = 64
-        wfactor = 28
-        ifactor = 8
-        if flag > 256:
-            wfactor = 14
-        num_thread_x = max(1, ofactor//(opart2*2))
-        num_thread_y = max(1, (wfactor + vthread-1) // vthread)
-        block_x = tvm.thread_axis("blockIdx.x")
-        block_y = tvm.thread_axis("blockIdx.y")
-        block_z = tvm.thread_axis("blockIdx.z")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
-        thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
-
-        i, oc, h, w = s[Out].op.axis
-        ooc, ioc = s[Out].split(oc, factor=ofactor)
-        ow, iw = s[Out].split(w, factor=wfactor)
-        ow = s[Out].fuse(ow, h)
-        oioc, iioc = s[Out].split(ioc, nparts=vthread)
-        oiw, iiw = s[Out].split(iw, nparts=vthread)
-        oiioc, iiioc = s[Out].split(iioc, nparts=opart2)
-        s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
-        s[Out].bind(iiioc, thread_x)
-        s[Out].bind(iiw, thread_y)
-        s[Out].bind(oiioc, thread_xz)
-        s[Out].bind(oiw, thread_yz)
-        s[Out].bind(oioc, block_x)
-        s[Out].bind(ow, block_y)
-        s[Out].bind(ooc, block_z)
-
-        s[Out_L].compute_at(s[Out], iiioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, factor=ifactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-        max_num_thread = tvm.target.current_target(allow_none=False).max_num_threads
-        if util.get_const_int(Filter_S.shape[1]) == 128:
-            oic = s[Out_L].fuse(dh, oic)
-            s[temp_S].compute_at(s[Out_L], oic)
-            s[Filter_S].compute_at(s[Out_L], oic)
-            num_thread = max_num_thread
-        else:
-            s[temp_S].compute_at(s[Out_L], oic)
-            s[Filter_S].compute_at(s[Out_L], dw)
-            num_thread = 456
-            if max_num_thread < num_thread:
-                num_thread = max_num_thread
-
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, oic, h, w, iic = s[temp_S].op.axis
-        oic = s[temp_S].fuse(oic, h, w)
-        ooic, ioic = s[temp_S].split(oic, factor=num_thread_x)
-        _, iooic = s[temp_S].split(ooic, factor=num_thread_y)
-        s[temp_S].bind(ioic, thread_x)
-        s[temp_S].bind(iooic, thread_y)
-        s[temp_S].vectorize(iic)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ioc = s[Filter_S].split(oc, factor=num_thread_x)
-        _, ii = s[Filter_S].split(i, factor=num_thread_y)
-        s[Filter_S].bind(ioc, thread_x)
-        s[Filter_S].bind(ii, thread_y)
-
-def conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
-    if util.get_const_int(Filter.shape[0]) + util.get_const_int(Filter.shape[1]) <= 768:
-        # scheduler params
-        vthread_x = util.get_const_int(Out.shape[3])
-        num_thread_x = 64
-        ofactor = 8
-        if util.get_const_int(Filter.shape[3]) == 1 and vthread_x * 5 <= max_threads:
-            ofactor = 64
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-
-        i, oc, h, w = s[Out].op.axis
-        ooc, ioc = s[Out].split(oc, factor=num_thread_x)
-        s[Out].reorder(i, ooc, h, w, ioc)
-        ooc = s[Out].fuse(h, ooc)
-        s[Out].bind(ioc, thread_x)
-        s[Out].bind(w, thread_xz)
-        s[Out].bind(ooc, block_x)
-
-        s[Out_L].compute_at(s[Out], ioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, ofactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], oic)
-
-        #schedule temp_S shared mem load
-        i, ic, h, w = s[temp_S].op.axis
-        s[temp_S].reorder(i, ic, w, h)
-        ic = s[temp_S].fuse(w, ic)
-        _, iic = s[temp_S].split(ic, factor=num_thread_x)
-        s[temp_S].bind(iic, thread_x)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        s[Filter_S].bind(ii, thread_x)
-        s[Filter_S].storage_align(s[Filter_S].op.axis[0], 2, 1)
-
-    else:
-        # scheduler params
-        vthread_x = min(8, util.get_const_int(Out.shape[2]))
-        num_thread_x = 16
-        num_thread_y = min(max_threads // num_thread_x, util.get_const_int(Out.shape[3]))
-        ofactor = 8
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-
-        i, oc, h, w = s[Out].op.axis
-        ow, iw = s[Out].split(w, factor=num_thread_y)
-        oh, ih = s[Out].split(h, factor=vthread_x)
-        ooc, ioc = s[Out].split(oc, factor=num_thread_x)
-        s[Out].reorder(i, ooc, oh, ih, ow, iw, ioc)
-        s[Out].bind(ioc, thread_x)
-        s[Out].bind(iw, thread_y)
-        s[Out].bind(ih, thread_xz)
-        s[Out].bind(ooc, block_x)
-
-        s[Out_L].compute_at(s[Out], ioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, ofactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], oic)
-
-        num_thread = max_threads
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, h, w, oc, ic = s[temp_S].op.axis
-        icc = s[temp_S].fuse(oc, w, h)
-        oic, iic = s[temp_S].split(icc, factor=num_thread_x)
-        _, ioic = s[temp_S].split(oic, factor=num_thread_y)
-        s[temp_S].bind(iic, thread_x)
-        s[temp_S].bind(ioic, thread_y)
-        s[temp_S].vectorize(ic)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        h = s[Filter_S].fuse(h, w)
-        _, ih = s[Filter_S].split(h, factor=num_thread_y)
-        s[Filter_S].bind(ii, thread_x)
-        s[Filter_S].bind(ih, thread_y)
-        s[Filter_S].storage_align(s[Filter_S].op.axis[0], 2, 1)
-
-def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    # scheduler params
-    num_thread = 8
-    vthread = 2
-    opart2 = 4
-    ofactor = 64
-    wfactor = 56
-    ifactor = 8
-    if util.get_const_int(Filter.shape[0]) == 64:
-        opart2 = 8
-        ifactor = 16
-    if util.get_const_int(Out.shape[2]) == 224:
-        num_thread = 4
-        wfactor = 112
-        ifactor = 4
-    sfactor = max(1, ofactor // (opart2*vthread))
-    spart = max(1, (wfactor + vthread-1) // vthread)
-
-    block_x = tvm.thread_axis("blockIdx.x")
-    block_y = tvm.thread_axis("blockIdx.y")
-    block_z = tvm.thread_axis("blockIdx.z")
-    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
-    thread_y = tvm.thread_axis((0, wfactor // vthread), "threadIdx.y")
-    thread_xz = tvm.thread_axis((0, opart2), "vthread", name="vx")
-    thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
-
-    i, oc, h, w = s[Out].op.axis
-    ooc, ioc = s[Out].split(oc, factor=ofactor)
-    ow, iw = s[Out].split(w, factor=wfactor)
-    ow = s[Out].fuse(ow, h)
-    oioc, iioc = s[Out].split(ioc, nparts=vthread)
-    oiw, iiw = s[Out].split(iw, nparts=vthread)
-    oiioc, iiioc = s[Out].split(iioc, nparts=opart2)
-    s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
-    s[Out].bind(iiioc, thread_x)
-    s[Out].bind(iiw, thread_y)
-    s[Out].bind(oiioc, thread_xz)
-    s[Out].bind(oiw, thread_yz)
-    s[Out].bind(oioc, block_x)
-    s[Out].bind(ow, block_y)
-    s[Out].bind(ooc, block_z)
-
-    s[Out_L].compute_at(s[Out], iiioc)
-
-    # schedule Out_L local write
-    i, oc, h, w = s[Out_L].op.axis
-    ic, dh, dw = s[Out_L].op.reduce_axis
-    oic, iic = s[Out_L].split(ic, factor=ifactor)
-    s[Out_L].reorder(oic, dh, dw, iic, h, w)
-    fuse_index = s[Out_L].fuse(dw, dh)
-    fuse_index = s[Out_L].fuse(fuse_index, oic)
-    dw = fuse_index
-
-    s[temp_S].compute_at(s[Out_L], dw)
-    s[Filter_S].compute_at(s[Out_L], dw)
-
-    #schedule temp_S shared mem load
-    i, ic, h, w = s[temp_S].op.axis
-    _, iic = s[temp_S].split(ic, factor=sfactor)
-    _, iw = s[temp_S].split(w, factor=spart)
-    s[temp_S].bind(iic, thread_x)
-    s[temp_S].bind(iw, thread_y)
-
-    #schedule Filter_S shared mem load
-    i, oc, h, w = s[Filter_S].op.axis
-    _, ioc = s[Filter_S].split(oc, factor=sfactor)
-    _, ii = s[Filter_S].split(i, factor=spart)
-    s[Filter_S].bind(ioc, thread_x)
-    s[Filter_S].bind(ii, thread_y)
-
-def schedule_conv2d_small_batch(outs):
-    """Create schedule for tensors or return error if batch size is larger than 1"""
-    s = tvm.create_schedule([x.op for x in outs])
-
-    def schedule(temp, Filter, Output):
-        """Schedule conv2d_nchw"""
-
-        flag = util.get_const_int(Filter.shape[0])+util.get_const_int(Filter.shape[1])
-
-        if flag > 768:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, h, w, oic, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif 128 < flag < 512:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, oic, h, w, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            s[temp_G].split(w, factor=4)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        else:
-            s[temp].compute_inline()
-            temp_S = s.cache_read(temp, "shared", [Output])
-            temp_R = temp_S
-
-        Filter_S = s.cache_read(Filter, "shared", [Output])
-
-        if Output.op in s.outputs:
-            Out = Output
-            Out_L = s.cache_write(Out, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Output].set_scope("local")
-            Out_L = Output
-
-        if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif 128 < flag < 512:
-            conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif flag >= 512:
-            conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L)
-        else:
-            conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule conv2d
-        if 'conv2d_nchw' in OP.tag:
-            temp = OP.input_tensors[0]
-            Filter = OP.input_tensors[1]
-            if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
-                s[Filter].compute_inline()
-            Output = OP.output(0)
-            schedule(temp, Filter, Output)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-@generic.schedule_conv2d_nchw.register(["cuda", "gpu"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_nchw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_nchw.
-    """
-    target = tvm.target.current_target()
-    if target.target_name == "cuda" and "cudnn" in target.libs:
-        return topi.generic.schedule_extern(outs)
-
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    batch_size = util.get_const_int(outs[0].op.output(0).shape[0])
-    if batch_size > 1:
-        raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
-    return  schedule_conv2d_small_batch(outs)
diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py
index 4454bc54d3eb..e2e011e14d23 100644
--- a/topi/python/topi/cuda/conv2d_transpose_nchw.py
+++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py
@@ -1,121 +1,199 @@
-#pylint: disable=invalid-name, line-too-long
-"""Schedule for conv2d_transpose_nchw with auto fusion"""
-import tvm
-from .. import util
-from .. import tag
-from .. import generic
-from .conv2d_nchw import conv2d_224_3_64, conv2d_56_64_128, conv2d_14_256_256, conv2d_56_64_64
+# pylint: disable=invalid-name
+"""Conv2d transpose template for cuda backend"""
 
+import tvm
+from tvm import autotvm
 
-def schedule_conv2d_transpose_small_batch(outs):
-    """Create schedule for tensors or return error if batch size is larger than 1"""
-    s = tvm.create_schedule([x.op for x in outs])
+from .. import nn, generic
+from ..util import equal_const_int, get_const_tuple, traverse_inline
 
-    def schedule(temp, Filter, Output):
-        """Schedule conv2d_transpose_nchw"""
-        block_h = util.get_const_int(Output.shape[3])
-        block_w = util.get_const_int(temp.shape[1])
-        if block_h % 48 == 0:
-            block_h = 48
-        elif block_h % 32 == 0:
-            block_h = 32
-        if block_w % 48 == 0:
-            block_w = 48
-        elif block_w % 32 == 0:
-            block_w = 32
-
-        flag = util.get_const_int(Filter.shape[0])+util.get_const_int(Filter.shape[1])
-
-        if flag > 768:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, h, w, oic, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif 128 < flag < 512:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, oic, h, w, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            s[temp_G].split(w, factor=4)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        else:
-            s[temp].compute_inline()
-            temp_S = s.cache_read(temp, "shared", [Output])
-            temp_R = temp_S
-
-        Filter_S = s.cache_read(Filter, "shared", [Output])
-
-        if Output.op in s.outputs:
-            Out = Output
-            Out_L = s.cache_write(Out, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Output].set_scope("local")
-            Out_L = Output
-
-        if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif 128 < flag < 512:
-            conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif flag >= 512:
-            conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L)
-        else:
-            conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule conv2d_transpose_nchw
-        if 'conv2d_transpose_nchw' in OP.tag:
-            temp = OP.input_tensors[0]
-            DilatedInput = temp.op.input_tensors[0]
-            s[DilatedInput].compute_inline()
-            Filter = OP.input_tensors[1]
-            Output = OP.output(0)
-            schedule(temp, Filter, Output)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
+@autotvm.task.register_topi_compute(nn.conv2d_transpose_nchw, ['cuda', 'gpu'], "direct")
+def conv2d_transpose_nchw_cuda(cfg, Input, Filter, strides, padding, out_dtype):
+    """Transposed 2D convolution nchw forward operator.
 
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+    Filter : tvm.Tensor
+        4-D with shape [in_channel, num_filter, filter_height, filter_width]
+    strides : tuple of two ints
+        The spatial stride along height and width
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+    out_dtype: str
+        The output type. This is used in mixed precision
 
-@generic.schedule_conv2d_transpose_nchw.register(["cuda", "gpu"])
-def schedule_conv2d_transpose_nchw(outs):
-    """Schedule for conv2d_transpose_nchw.
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    batch, in_c, in_h, in_w = get_const_tuple(Input.shape)
+    _, out_c, filter_h, filter_w = get_const_tuple(Filter.shape)
+    stride_h, stride_w = strides
+
+    # attach stride info to config, this is used in schedule space definition
+    cfg.stride = strides
+
+    # padding stage
+    fpad_top, fpad_left, fpad_bottom, fpad_right = nn.get_pad_tuple(padding, (filter_h, filter_w))
+    bpad_top = filter_h - 1 - fpad_top
+    bpad_bottom = filter_h - 1 - fpad_bottom
+    bpad_left = filter_w - 1 - fpad_left
+    bpad_right = filter_w - 1 - fpad_right
+
+    # padding stage
+    FirstPad = nn.pad(Input,
+                      [0, 0, (bpad_top + stride_h - 1) // stride_h,
+                       (bpad_left + stride_w - 1) // stride_w],
+                      [0, 0, (bpad_bottom + stride_h - 1) // stride_h,
+                       (bpad_right + stride_w - 1) // stride_w], name='FirstPad')
+
+    # remove extra padding introduced by dilatation
+    border_h = (stride_h - bpad_top % stride_h) % stride_h
+    border_w = (stride_w - bpad_left % stride_w) % stride_w
+
+    # dilation stage
+    data = FirstPad
+    strides = [1, 1, stride_h, stride_w]
+    n = len(data.shape)
+
+    def _dilate(*indices):
+        not_zero = []
+        index_tuple = []
+        for i in range(n):
+            if not equal_const_int(strides[i], 1):
+                index_tuple.append(indices[i] // strides[i])
+                not_zero.append((indices[i] % strides[i]).equal(0))
+            else:
+                index_tuple.append(indices[i])
+        if not_zero:
+            not_zero = tvm.all(*not_zero)
+            return tvm.select(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
+        return data(*index_tuple)
+
+    # convolution stage
+    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
+    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
+    dc = tvm.reduce_axis((0, in_c), name='dc')
+    dh = tvm.reduce_axis((0, filter_h), name='dh')
+    dw = tvm.reduce_axis((0, filter_w), name='dw')
+
+    Output = tvm.compute(
+        (batch, out_c, out_h, out_w),
+        lambda b, c, h, w: tvm.sum(
+            _dilate(b, dc, h + dh + border_h, w + dw + border_w).astype(out_dtype) *
+            Filter[dc, c, filter_h - 1 - dh, filter_w - 1 - dw].astype(out_dtype),
+            axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
+
+    return Output
+
+@autotvm.task.register_topi_schedule(generic.schedule_conv2d_transpose_nchw,
+                                     ['cuda', 'gpu'], 'direct')
+def schedule_conv2d_transpose_nchw_cuda(cfg, outs):
+    """TOPI Schedule callback for conv2d transpose operator.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The parameters for this template
+
     outs: Array of Tensor
-        The computation graph description of conv2d_transpose_nchw
+        The computation graph description of conv2d transpose
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_transpose_nchw.
+        The computation schedule for conv2d transpose.
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    batch_size = util.get_const_int(outs[0].op.output(0).shape[0])
-    if batch_size > 1:
-        raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
-    return schedule_conv2d_transpose_small_batch(outs)
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == 'conv2d_transpose_nchw':
+            pad_data = op.input_tensors[0]
+            kernel = op.input_tensors[1]
+            conv = op.output(0)
+
+            ##### space definition begin #####
+            n, f, y, x = s[conv].op.axis
+            rc = s[conv].op.reduce_axis[0]
+            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
+            cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
+            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
+            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
+            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
+
+            target = tvm.target.current_target()
+            if target.target_name in ['nvptx', 'rocm']:
+                cfg.define_knob("unroll_explicit", [1])
+            else:
+                cfg.define_knob("unroll_explicit", [0, 1])
+            ##### space definition end #####
+
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            if conv.op in s.outputs:
+                output = conv
+                OL = s.cache_write(conv, 'local')
+            else:
+                output = s.outputs[0].output(0)
+                s[conv].set_scope('local')
+                OL = conv
+
+            # create cache stage
+            s[pad_data].set_scope('shared')
+            AA = pad_data
+            WW = s.cache_read(kernel, 'shared', [OL])
+
+            # tile and bind spatial axes
+            n, f, y, x = s[output].op.axis
+            kernel_scope, n = s[output].split(n, nparts=1)
+            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+            bf = s[output].fuse(n, bf)
+            s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+            s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+            s[output].bind(vf, tvm.thread_axis("vthread"))
+            s[output].bind(vy, tvm.thread_axis("vthread"))
+            s[output].bind(vx, tvm.thread_axis("vthread"))
+            s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+            s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+            s[OL].compute_at(s[output], tx)
+
+            # tile reduction axes
+            n, f, y, x = s[OL].op.axis
+            rc, ry, rx = s[OL].op.reduce_axis
+            rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
+            s[OL].reorder(rco, rcm, ry, rx, rci, n, f, y, x)
+
+            s[AA].compute_at(s[OL], rcm)
+            s[WW].compute_at(s[OL], rcm)
+
+            # cooperative fetching
+            for load in [AA, WW]:
+                n, f, y, x = s[load].op.axis
+                fused = s[load].fuse(n, f, y, x)
+                tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+                ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+                tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+                s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+                s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+            s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+            s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(s, outs[0].op, _callback)
+
+    return s
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
new file mode 100644
index 000000000000..7e0574ea606b
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -0,0 +1,389 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""Winograd template for cuda backend"""
+
+import numpy as np
+
+import tvm
+from tvm import autotvm
+
+from .. import nn
+from ..nn import conv2d_winograd_without_weight_transform
+from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline
+from ..generic import schedule_conv2d_winograd_without_weight_transform
+
+def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
+    """convert argument to workload"""
+    K = 3
+
+    shape = get_const_tuple(kernel.shape)
+    if shape[-2:] == (K, K):
+        raw_kernel = kernel
+    else:  # pre-transformed
+        _, _, CI, CO = shape
+        raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
+
+    return ('conv2d', ) + autotvm.task.args_to_workload(
+        [data, raw_kernel, strides, padding, layout, out_dtype])
+
+def _infer_tile_size(data, kernel):
+    N, CI, H, W = get_const_tuple(data.shape)
+
+    if H % 8 == 0:
+        return 4
+    return 2
+
+def winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed):
+    """Compute declaration for winograd"""
+    assert layout == 'NCHW'
+
+    tile_size = _infer_tile_size(data, kernel)
+
+    N, CI, H, W = get_const_tuple(data.shape)
+
+    if not pre_computed: # kernel tensor is raw tensor, do strict check
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
+        HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
+        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
+    else:                   # kernel tensor is pre-transfomred. this op is created by
+                            # alter op layout, do not check
+        HSTR = WSTR = 1
+        HPAD = WPAD = 1
+        KH = KW = 3
+        _, _, CI, CO = get_const_tuple(kernel.shape)
+
+    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")
+
+    if tile_size == 4:
+        G_data = np.array([
+            [1 / 4.0, 0, 0],
+            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
+            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
+            [1 / 24.0, 1 / 12.0, 1 / 6.0],
+            [1 / 24.0, -1 / 12.0, 1 / 6.0],
+            [0, 0, 1]], dtype=np.float32)
+
+        B_data = np.array([
+            [4, 0, 0, 0, 0, 0],
+            [0, -4, 4, -2, 2, 4],
+            [-5, -4, -4, -1, -1, 0],
+            [0, 1, -1, 2, -2, -5],
+            [1, 1, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0, 0, 0],
+            [1, 1, 1, 1],
+            [1, -1, 1, -1],
+            [1, 2, 4, 8],
+            [1, -2, 4, -8],
+            [0, 0, 0, 1]], out_dtype)
+    elif tile_size == 2:
+        G_data = np.array([
+            [1, 0, 0],
+            [1.0/2, 1.0/2, 1.0/2],
+            [1.0/2, -1.0/2, 1.0/2],
+            [0, 0, 1]], np.float32)
+
+        B_data = np.array([
+            [1, 0, 0, 0],
+            [0, 1, -1, 1],
+            [-1, 1, 1, 0],
+            [0, 0, 0, -1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0],
+            [1, 1],
+            [1, -1],
+            [0, -1]], out_dtype)
+    else:
+        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))
+
+    m = A_data.shape[1]
+    r = 3
+    alpha = m + r - 1
+    H = (H + 2 * HPAD - KH) // HSTR + 1
+    W = (W + 2 * WPAD - KW) // WSTR + 1
+    nH, nW = (H + m-1) // m, (W + m-1) // m
+    P = N * nH * nW
+
+    # transform kernel
+    if not pre_computed:
+        G = const_matrix(G_data, 'G')
+        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
+        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
+        kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
+                                  tvm.sum(kernel[co][ci][r_kh][r_kw] *
+                                          G[eps][r_kh] * G[nu][r_kw],
+                                          axis=[r_kh, r_kw]), name='kernel_pack')
+    else:
+        kernel_pack = kernel
+
+    # pack input tile
+    input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu:
+                             data_pad[p // (nH * nW)][c][p // nW % nH * m + eps]
+                             [p % nW * m + nu], name='d')
+
+    # transform data
+    B = const_matrix(B_data)
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_a')
+    data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
+                            tvm.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
+                                    axis=[r_a, r_b]), name='data_pack')
+
+    # do batch gemm
+    ci = tvm.reduce_axis((0, CI), name='ci')
+    bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
+                        tvm.sum(kernel_pack[eps][nu][ci][co] *
+                                data_pack[eps][nu][ci][p],
+                                axis=[ci]), name='bgemm')
+
+    # inverse transform
+    A = const_matrix(A_data)
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_a')
+    inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
+                          tvm.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
+                                  axis=[r_a, r_b]), name='inverse')
+
+    # output
+    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
+                         inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m],
+                         name='output', tag='conv2d_nchw_winograd',
+                         attrs={"workload": _winograd_conv_arg_to_workload(
+                             data, kernel, strides, padding, layout, out_dtype)})
+    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
+
+    return output
+
+
+def schedule_winograd_cuda(cfg, s, output, pre_computed):
+    """Schedule winograd template"""
+    # get stages
+    inverse = s[output].op.input_tensors[0]
+    bgemm, A = s[inverse].op.input_tensors
+    kernel_pack, data_pack = s[bgemm].op.input_tensors
+    input_tile, B = s[data_pack].op.input_tensors
+    pad_data = s[input_tile].op.input_tensors[0]
+
+    # data transform
+    s[B].compute_inline()
+
+    data_l = s.cache_write(data_pack, 'local')
+    eps, nu, c, p = s[data_l].op.axis
+    r_a, r_b = s[data_l].op.reduce_axis
+    for axis in [eps, nu, r_a, r_b]:
+        s[data_l].unroll(axis)
+
+    eps, nu, c, p = s[data_pack].op.axis
+    p, pi = s[data_pack].split(p, 1)
+    fused = s[data_pack].fuse(c, p)
+    bb, tt = s[data_pack].split(fused, 128)
+    s[data_pack].reorder(bb, tt, pi, eps, nu)
+    s[data_pack].bind(bb, tvm.thread_axis("blockIdx.x"))
+    s[data_pack].bind(tt, tvm.thread_axis("threadIdx.x"))
+
+    s[data_l].compute_at(s[data_pack], pi)
+    s[input_tile].compute_at(s[data_pack], pi)
+    s[pad_data].compute_inline()
+
+    # transform kernel
+    if not pre_computed:
+        kernel, G = s[kernel_pack].op.input_tensors
+        eps, nu, ci, co = s[kernel_pack].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # skip this part during tuning to make recrods accurate
+            # this part will be pre-computed during NNVM's pre-compute optimization pass
+            s[G].pragma(s[G].op.axis[0], 'debug_skip_region')
+            s[kernel_pack].pragma(eps, 'debug_skip_region')
+        else:
+            s[G].compute_inline()
+            r_a, r_b = s[kernel_pack].op.reduce_axis
+            for axis in [eps, nu, r_a, r_b]:
+                s[kernel_pack].unroll(axis)
+
+            fused = s[kernel_pack].fuse(ci, co)
+            bb, tt = s[kernel_pack].split(fused, 128)
+            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b)
+            s[kernel_pack].bind(bb, tvm.thread_axis("blockIdx.x"))
+            s[kernel_pack].bind(tt, tvm.thread_axis("threadIdx.x"))
+    else:
+        kernel = kernel_pack
+
+    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    ##### space definition begin #####
+    b1, b2, y, x = s[bgemm].op.axis
+    rc = s[bgemm].op.reduce_axis[0]
+    alpha = get_const_int(b1.dom.extent)
+
+    cfg.define_split("tile_b", cfg.axis(alpha * alpha), num_outputs=4,
+                     filter=lambda x: x.size[-3:] == [1, 1, 1])
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=2)
+    cfg.define_knob("auto_unroll_max_step", [0, 128, 1500])
+    target = tvm.target.current_target()
+    if target.target_name in ['nvptx', 'rocm']:
+        cfg.define_knob("unroll_explicit", [1])
+    else:
+        cfg.define_knob("unroll_explicit", [0, 1])
+    ##### space definition end #####
+
+    # batch gemm
+    C = bgemm
+    A0, B0 = kernel_pack, data_pack
+
+    OL = s.cache_write(C, 'local')
+    AA = s.cache_read(A0, 'shared', [OL])
+    BB = s.cache_read(B0, 'shared', [OL])
+
+    b = s[bgemm].fuse(b1, b2)
+
+    # tile and bind spatial axes
+    bgemm_scope, b = s[bgemm].split(b, nparts=1)
+    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
+    s[C].bind(bz, tvm.thread_axis("blockIdx.z"))
+    s[C].bind(by, tvm.thread_axis("blockIdx.y"))
+    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[C].bind(vz, tvm.thread_axis("vthread"))
+    s[C].bind(vy, tvm.thread_axis("vthread"))
+    s[C].bind(vx, tvm.thread_axis("vthread"))
+    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
+    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
+    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
+    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
+
+    # tile reduction axes
+    s[OL].compute_at(s[C], tx)
+    b1, b2, y, x = s[OL].op.axis
+    b = s[OL].fuse(b1, b2)
+    rc, = s[OL].op.reduce_axis
+    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
+    s[OL].reorder(rco, rci, b, y, x)
+
+    s[AA].compute_at(s[OL], rco)
+    s[BB].compute_at(s[OL], rco)
+
+    # cooperative fetching
+    for load in [AA, BB]:
+        fused = s[load].fuse(*list(s[load].op.axis))
+        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
+        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
+        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    s[C].pragma(bgemm_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[C].pragma(bgemm_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    # schedule inverse, output and fusion
+    if output.op in s.outputs:
+        OL = None
+    else:
+        OL = output
+        s[OL].set_scope('local')
+        output = s.outputs[0]
+
+    m = alpha - 3 + 1
+    n, co, h, w = s[output].op.axis
+    ho, wo, hi, wi = s[output].tile(h, w, m, m)
+    inverse_scope, n = s[output].split(n, nparts=1)
+
+    fused = s[output].fuse(n, co, ho, wo)
+    bb, tt = s[output].split(fused, 128)
+
+    s[output].bind(bb, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(tt, tvm.thread_axis("threadIdx.x"))
+
+    if OL is not None:
+        s[OL].compute_at(s[output], tt)
+
+    s[A].compute_inline()
+    co, p, vh, vw = s[inverse].op.axis
+    r_a, r_b = s[inverse].op.reduce_axis
+    for axis in [vh, vw, r_a, r_b]:
+        s[inverse].unroll(axis)
+    s[inverse].compute_at(s[output], tt)
+
+    return s
+
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
+@conv2d_winograd_without_weight_transform.register(['cuda', 'gpu'])
+@autotvm.task.dispatcher
+def winograd_ww_config_dispatcher_cuda(data, kernel, strides, padding, layout, out_dtype,
+                                       tile_size):
+    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
+
+
+@winograd_ww_config_dispatcher_cuda.register(['winograd'])
+def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=True)
+
+
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                ['cuda', 'gpu'], ['winograd'])
+def schedule_conv2d_winograd_without_weight_transform_cuda(cfg, outs):
+    """TOPI schedule callback"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nchw_winograd' in op.tag:
+            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+##### REGISTER ALTER OP LAYOUT #####
+@nn.conv2d_alter_layout.register(["cuda", "gpu"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):
+    """Alter op layout for pre-computing kernel transformation"""
+    if 'cudnn' in tvm.target.current_target().libs or 'miopen' in tvm.target.current_target().libs:
+        return None
+
+    import nnvm.symbol as sym
+    copy_inputs = [s for s in inputs]
+
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
+
+    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
+                                                      "when alter_op_layout is enabled"
+    strides = attrs.get_int_tuple("strides")
+    padding = attrs.get_int_tuple("padding")
+    groups = attrs.get_int('groups')
+    layout = attrs["layout"]
+    out_dtype = attrs["out_dtype"]
+    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
+
+    if groups == 1:
+        # query config of this workload
+        workload = ('conv2d',) + autotvm.task.args_to_workload(
+            [tinfos[0], tinfos[1], strides, padding, layout, out_dtype])
+
+        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
+            return None
+
+        if cfg.template_key == 'direct':
+            return None
+
+        # pre-compute weight transformation in winograd
+        tile_size = _infer_tile_size(tinfos[0], tinfos[1])
+
+        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
+                                                              tile_size=tile_size)
+        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
+        copy_inputs[1] = weight
+        new_attrs['tile_size'] = tile_size
+        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+
+    # do nothing for depthwise convolution
+    return None
diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py
index 94fa5c7e79ca..0214ed78b4e7 100644
--- a/topi/python/topi/cuda/depthwise_conv2d.py
+++ b/topi/python/topi/cuda/depthwise_conv2d.py
@@ -1,12 +1,17 @@
 # pylint: disable=invalid-name
 """Schedule for depthwise_conv2d with auto fusion"""
 import tvm
-from ..util import get_const_tuple
+from tvm import autotvm
+from ..util import traverse_inline
 from .. import tag
-from .. import generic
+from .. import generic, nn
 
-@generic.schedule_depthwise_conv2d_nchw.register(["cuda", "gpu"])
-def schedule_depthwise_conv2d_nchw(outs):
+# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
+autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct',
+                              nn.depthwise_conv2d_nchw.fdefault)
+
+@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct')
+def schedule_depthwise_conv2d_nchw_cuda(cfg, outs):
     """Schedule for depthwise_conv2d nchw forward.
 
     Parameters
@@ -22,108 +27,92 @@ def schedule_depthwise_conv2d_nchw(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(PaddedInput, Filter, DepthwiseConv2d):
-        in_shape = get_const_tuple(PaddedInput.shape)
-        out_shape = get_const_tuple(DepthwiseConv2d.shape)
-        in_height = in_shape[2]
-        in_width = in_shape[3]
-        out_height = out_shape[2]
-        out_width = out_shape[3]
-        channel_multiplier = get_const_tuple(Filter.shape)[1]
-        s[PaddedInput].compute_inline()
-        IS = s.cache_read(PaddedInput, "shared", [DepthwiseConv2d])
-        FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
-        IL = s.cache_read(IS, "local", [DepthwiseConv2d])
-        FL = s.cache_read(FS, "local", [DepthwiseConv2d])
-        if DepthwiseConv2d.op in s.outputs:
-            Output = DepthwiseConv2d
-            CL = s.cache_write(DepthwiseConv2d, "local")
-        else:
-            Output = outs[0].op.output(0)
-            s[DepthwiseConv2d].set_scope("local")
-        # schedule parameters
-        num_thread_y = 8
-        num_thread_x = 8
-        num_vthread_y = 1
-        num_vthread_x = 1
-        blocking_h = out_height
-        blocking_w = out_width
-        if out_height % 32 == 0 or in_height >= 108:
-            blocking_h = 32
-        if out_width % 32 == 0:
-            blocking_w = 32
-            num_thread_x = 16
-            num_vthread_x = 2
-        elif in_width >= 108:
-            blocking_w = 32
-        block_y = tvm.thread_axis("blockIdx.y")
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_vy = tvm.thread_axis((0, num_vthread_y), "vthread", name="vy")
-        thread_vx = tvm.thread_axis((0, num_vthread_x), "vthread", name="vx")
-        # split and bind
-        by, byi = s[Output].split(Output.op.axis[1], factor=channel_multiplier)
-        s[Output].reorder(Output.op.axis[2], Output.op.axis[3], byi)
-        by = s[Output].fuse(Output.op.axis[0], by)
-        s[Output].bind(by, block_y)
-        bx1, x1i = s[Output].split(Output.op.axis[2], factor=blocking_h)
-        tvy, vyi = s[Output].split(x1i, nparts=num_vthread_y)
-        ty, yi = s[Output].split(vyi, nparts=num_thread_y)
-        bx2, x2i = s[Output].split(Output.op.axis[3], factor=blocking_w)
-        tvx, vxi = s[Output].split(x2i, nparts=num_vthread_x)
-        tx, xi = s[Output].split(vxi, nparts=num_thread_x)
-        s[Output].reorder(bx1, bx2, tvy, tvx, ty, tx, yi, xi)
-        bx = s[Output].fuse(bx1, bx2)
-        s[Output].bind(bx, block_x)
-        s[Output].bind(tvy, thread_vy)
-        s[Output].bind(tvx, thread_vx)
-        s[Output].bind(ty, thread_y)
-        s[Output].bind(tx, thread_x)
-        # local memory load
-        s[IL].compute_at(s[Output], tx)
-        s[FL].compute_at(s[Output], tx)
-        if DepthwiseConv2d.op in s.outputs:
-            s[CL].compute_at(s[Output], tx)
-        else:
-            s[DepthwiseConv2d].compute_at(s[Output], tx)
-        # input's shared memory load
-        s[IS].compute_at(s[Output], bx)
-        ty, yi = s[IS].split(IS.op.axis[2], nparts=num_thread_y)
-        tx, xi = s[IS].split(IS.op.axis[3], nparts=num_thread_x)
-        s[IS].bind(ty, thread_y)
-        s[IS].bind(tx, thread_x)
-        # filter's shared memory load
-        s[FS].compute_at(s[Output], bx)
-        s[FS].reorder(FS.op.axis[2], FS.op.axis[3], FS.op.axis[1])
-        ty, yi = s[FS].split(FS.op.axis[2], nparts=num_thread_y)
-        tx, xi = s[FS].split(FS.op.axis[3], nparts=num_thread_x)
-        s[FS].bind(ty, thread_y)
-        s[FS].bind(tx, thread_x)
 
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule depthwise_conv2d
-        if OP.tag == 'depthwise_conv2d_nchw':
-            PaddedInput = OP.input_tensors[0]
-            Filter = OP.input_tensors[1]
-            if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
-                s[Filter].compute_inline()
-            DepthwiseConv2d = OP.output(0)
-            _schedule(PaddedInput, Filter, DepthwiseConv2d)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
+    def _callback(op):
+        if op.tag == 'depthwise_conv2d_nchw':
+            pad_data = op.input_tensors[0]
+            kernel = op.input_tensors[1]
+            conv = op.output(0)
+
+            ##### space definition begin #####
+            n, f, y, x = s[conv].op.axis
+            cfg.define_split("tile_f", f, num_outputs=4)
+            cfg.define_split("tile_y", y, num_outputs=4)
+            cfg.define_split("tile_x", x, num_outputs=4)
+            cfg.define_knob("auto_unroll_max_step", [0, 256, 1500])
+
+            target = tvm.target.current_target()
+            if target.target_name in ['nvptx', 'rocm']:
+                cfg.define_knob("unroll_explicit", [1])
+            else:
+                cfg.define_knob("unroll_explicit", [0, 1])
+
+            # fallback support
+            if cfg.is_fallback:
+                ref_log = autotvm.tophub.load_reference_log(
+                    target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct')
+                cfg.fallback_with_reference_log(ref_log)
+                # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround
+                cfg['unroll_explicit'].val = 0
+            ##### space definition end #####
+
+            s[pad_data].compute_inline()
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            if conv.op in s.outputs:
+                output = conv
+                OL = s.cache_write(conv, 'local')
+            else:
+                output = s.outputs[0].output(0)
+                s[conv].set_scope('local')
+                OL = conv
+
+            # create cache stage
+            AA = s.cache_read(pad_data, 'shared', [OL])
+            WW = s.cache_read(kernel, 'shared', [OL])
+            AL = s.cache_read(AA, 'local', [OL])
+            WL = s.cache_read(WW, 'local', [OL])
+
+            # tile and bind spatial axes
+            n, f, y, x = s[output].op.axis
+            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+            kernel_scope, n = s[output].split(n, nparts=1)
+            bf = s[output].fuse(n, bf)
+            s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+            s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+            s[output].bind(vf, tvm.thread_axis("vthread"))
+            s[output].bind(vy, tvm.thread_axis("vthread"))
+            s[output].bind(vx, tvm.thread_axis("vthread"))
+            s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+            s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+            s[OL].compute_at(s[output], tx)
+
+            # cooperative fetching
+            s[AA].compute_at(s[output], bx)
+            s[WW].compute_at(s[output], bx)
+            s[AL].compute_at(s[output], tx)
+            s[WL].compute_at(s[output], tx)
+
+            for load in [AA, WW]:
+                fused = s[load].fuse(*list(s[load].op.axis))
+                fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
+                fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
+                fused, tz = s[load].split(fused, cfg["tile_f"].size[2])
+                s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+                s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+            s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+            s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(s, outs[0].op, _callback)
     return s
 
 @generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"])
@@ -143,8 +132,8 @@ def schedule_depthwise_conv2d_nhwc(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(temp, Filter, DepthwiseConv2d):
 
+    def _schedule(temp, Filter, DepthwiseConv2d):
         s[temp].compute_inline()
         FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
         if DepthwiseConv2d.op in s.outputs:
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index fc6309a7ebf4..6bbf735af18e 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -1,328 +1,143 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
 """conv2d schedule on ARM Mali GPU"""
-
-from __future__ import absolute_import as _abs
-
 import numpy as np
-import tvm
-
-from .. import generic
-from .. import util
-from .. import tag
-from ..nn import pad
-from ..nn.conv2d import conv2d
-from ..nn.util import get_pad_tuple
-
-##### SCHEDULE UTILITIES #####
-def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-    """ fuse all the axis and bind to GPU threads """
-    axis = axis or s[tensor].op.axis
-    fused = s[tensor].fuse(*axis)
-    max_threads = tvm.target.current_target(allow_none=False).max_num_threads
-    bx, tx = s[tensor].split(fused, num_thread or max_threads)
-    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
-    return bx, tx
-
-def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
-    """ tile and bind to GPU threads """
-    x_factor = x_factor or y_factor
-    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
-    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-    return yo, xo, yi, xi
-
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """ tile and bind 3d """
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
-    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
-
-def pack_tensor(s, tensor, factor, readers):
-    """ do transform X[n, m] -> X[n / factor, m, factor] """
-    tmp = s.cache_read(tensor, 'global', readers)
-    y, x = s[tmp].op.axis
-    yo, yi = s[tmp].split(y, factor)
-    s[tmp].reorder(yo, x, yi)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, 'global')
-
-def transpose(s, tensor, readers):
-    """ do transform X[n, m] -> X[m, n] """
-    tmp = s.cache_read(tensor, 'global', readers)
-    y, x = s[tmp].op.axis
-    s[tmp].reorder(x, y)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, "global"), tmp
-
-def const_array(data, name):
-    """ convert an const array to tvm tensor"""
-    row, col = data.shape
-    dtype = str(data.dtype)
-
-    def select_array(i, j):
-        now = tvm.const(0.0, dtype)
-        for ii in range(row):
-            for jj in range(col):
-                now = tvm.select(tvm.all(i % row == ii, j % col == jj),
-                                 tvm.const(data[ii][jj], dtype),
-                                 now)
-        return now
-    return tvm.compute(data.shape, select_array, name=name)
-
-
-@conv2d.register(["mali"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
-    """Conv2D operator for ARM Mali GPU backend.
 
-    Parameters
-    ----------
-    data : tvm.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import get_factors
 
-    kernel : tvm.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
+from ..util import traverse_inline, get_const_int, get_const_tuple, const_matrix
+from ..nn import conv2d, conv2d_winograd_without_weight_transform, \
+    get_pad_tuple, pad
 
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
+# reuse some compute declarations from ARM CPU
+from ..arm_cpu.conv2d import _conv_arg_to_workload, _decl_spatial_pack,\
+    _winograd_conv_arg_to_workload
 
-    padding : int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
 
-    layout : str
-        layout of data
+@conv2d.register('mali')
+@autotvm.task.dispatcher
+def conv2d_mali(data, kernel, strides, padding, layout, out_dtype):
+    """TOPI compute callback. Mark this function as a dispatcher, so
+    this template can assign config according to workload
 
     Returns
     -------
-    output : tvm.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
+    workload: Tuple
+        Dispatcher will use this workload to query corresponding config.
+        Then use cfg.template_key to call a registered template.
     """
-    assert layout == 'NCHW', "only support NCHW convolution on mali"
-    assert data.shape[0].value == 1, "only support batch size=1 convolution on mali"
-    assert data.dtype == kernel.dtype, "Do not support inputs with different data types now."
+    return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
-    out_dtype = data.dtype
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    kernel_shape = util.get_const_tuple(kernel.shape)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
+@conv2d_mali.register(['direct'])
+def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
+    """spatial packing template"""
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=3)
 
-    if (kernel_shape[2:4] == (3, 3) and (HPAD, WPAD) == (1, 1) and kernel_shape[0] >= 64 and
-            (HSTR, WSTR) == (1, 1)):
-        return _decl_winograd(data, kernel, stride, padding, layout, out_dtype)
-    elif kernel_shape[2:4] == (1, 1):
-        return _decl_im2col(data, kernel, stride, padding, layout, out_dtype)
-    else:
-        return _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype)
-
-@generic.schedule_conv2d_nchw.register(["mali"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw for ARM Mali GPU
+@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd'])
+def schedule_conv2d_nchw_mali(cfg, outs):
+    """TOPI schedule callback for conv2d
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The configuration of this template
     outs: Array of Tensor
-        The computation graph description of conv2d_nchw
+        The computation graph description of convolution2d
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_nchw.
+        The computation schedule for conv2d
     """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
 
-    def traverse(op):
-        """inline all one-to-one-mapping operators except the last stage (output)"""
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
+    def _callback(op):
+        # schedule conv2d
+        if 'spatial_conv2d_output' in op.tag:
+            output = op.output(0)
+            conv = op.input_tensors[0]
 
-        if 'im2col_conv_output' in op.tag:
-            _schedule_im2col_conv2d(s, op)
+            data_vec = conv.op.input_tensors[0]
+            data_pad = data_vec.op.input_tensors[0]
+            s[data_pad].compute_inline()
 
-        if 'spatialpack_conv_output' in op.tag:
-            _schedule_spatialpack_conv2d(s, op)
+            kernel_vec = conv.op.input_tensors[1]
+            if kernel_vec.op.name == 'kernel_vec':
+                kernel = kernel_vec.op.input_tensors[0]
+            else:
+                kernel = kernel_vec
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
 
-        if 'winograd_conv_output' in op.tag:
-            _schedule_winograd(s, op)
+            _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec)
 
-        scheduled_ops.append(op)
+        if 'winograd_conv2d_output' in op.tag:
+            _schedule_winograd(cfg, s, op)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
 
-def _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype):
-    """declare the spatialpack method (spatial packing) for conv2d"""
-    _, CI, IH, IW = [util.get_const_int(x) for x in data.shape]
-    CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape]
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    HCAT, WCAT = KH - 1, KW - 1
-
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-
-    N = 1
-    TH = IH + 2*HPAD
-    TW = IW + 2*WPAD
-    OH = (IH + 2*HPAD - KH) // HSTR + 1
-    OW = (IW + 2*WPAD - KW) // WSTR + 1
-
-    DO_PAD = (HPAD != 0 and WPAD != 0)
-    if DO_PAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-
-    # set tunable parameters (tile factor, ...)
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None:
-        VH = 1
-        VW, VC = 4, 4
-        # correct tile factor
-        if OW % VW != 0:
-            if OW == 14:
-                VW = 2
-                VC = 8
-            elif OW == 7:
-                VW = 7
-    else:
-        VH = tune_config['VH']
-        VW = tune_config['VW']
-        VC = tune_config['VC']
-
-    if data.dtype == 'float16':
-        VC *= 2
-
-    assert CO % VC == 0
-    assert OH % VH == 0, "OH: %d  VH : %d" % (OH, VH)
-    assert OW % VW == 0, "OW: %d  VW : %d" % (OW, VW)
-
-    dvshape = (N, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)
-    kvshape = (CO // VC, CI, KH, KW, VC)
-    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (N, CO, OH, OW)
-
-    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
-                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
-                           name='data_vec')
-
-    kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
-                             kernel[co*VC+vc][ci][kh][kw],
-                             name='kernel_vec')
-
-    ci = tvm.reduce_axis((0, CI), name='ci')
-    kh = tvm.reduce_axis((0, KH), name='kh')
-    kw = tvm.reduce_axis((0, KW), name='kw')
-
-    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:\
-                tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
-                        kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                        axis=[ci, kh, kw]), name='conv')
-
-    output = tvm.compute(oshape, lambda n, co, h, w:
-                         conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatialpack_conv_output')
-
-    return output
-
-def _schedule_spatialpack_conv2d(s, op):
-    """schedule the spatialpack method (spatial packing) for conv2d"""
-    # get ops and tensors
-    output = op.output(0)
-    output_height = util.get_const_int(output.shape[2])
 
-    conv = op.input_tensors[0]
-    data_vec = s[conv].op.input_tensors[0]
-    kernel_vec = s[conv].op.input_tensors[1]
+def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
+    """schedule the spatial packing for conv2d"""
     data = s[data_vec].op.input_tensors[0]
-    kernel = s[kernel_vec].op.input_tensors[0]
-
-    # set tunable parameters (tile factor, ...)
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None:
-        num_thread = 8
-
-        out_channel = util.get_const_int(kernel.shape[0])
-        in_channel = util.get_const_int(kernel.shape[1])
-        in_width = util.get_const_int(data.shape[2])
-
-        if in_width >= 224:
-            pass
-        elif in_width >= 112:
-            pass
-        elif in_width >= 56:
-            if out_channel != in_channel:
-                num_thread = 16
-        elif in_width >= 28:
-            if out_channel >= 256:
-                num_thread = 16
-        elif in_width >= 14:
-            if in_channel == out_channel:
-                num_thread = 8
-            else:
-                num_thread = 4
-    else:
-        num_thread = tune_config["num_thread"]
-
-    last = 1
-    if output_height == 28:
-        last = 7
-        num_thread = 32
 
-    if data.dtype == 'float16' and (util.get_const_int(conv.shape[1]) == 4 or output_height == 28):
-        num_thread //= 2
-
-    # schedule dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
+    max_unroll = 16
+    vec_size = [1, 2, 4, 8, 16]
+    # get tunable parameters (they are defined in compute)
+    BC, TC, VC = cfg["tile_co"].size
+    BH, TH, VH = cfg["tile_oh"].size
+    BW, TW, VW = cfg["tile_ow"].size
 
     # schedule padding
     if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
         data_pad = data
-        data = data_pad.op.input_tensors[0]
         s[data_pad].compute_inline()
 
     # schedule data packing
     _, h, w, ci, vh, vw = s[data_vec].op.axis
     tile_and_bind3d(s, data_vec, h, w, ci, 1)
-    s[data_vec].unroll(vw)
-
-    # schedule kernel packing
-    co, ci, kh, kw, vc = s[kernel_vec].op.axis
-    tile_and_bind(s, kernel_vec, co, ci, 1)
-    s[kernel_vec].unroll(kh)
-    s[kernel_vec].unroll(kw)
-    s[kernel_vec].vectorize(vc)
+    if vh.dom.extent.value < max_unroll:
+        s[data_vec].unroll(vh)
+    if vw.dom.extent.value < max_unroll:
+        s[data_vec].unroll(vw)
+
+    if isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and kernel_vec.name == 'kernel_vec':
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel packing will be pre-computed during compilation, so we skip
+            # this part to make tuning records correct
+            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
+        else:
+            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+            fused, vec = s[kernel_vec].split(fused, VC)
+            bb, tt = s[kernel_vec].split(fused, max_threads)
+            s[kernel_vec].bind(bb, tvm.thread_axis("blockIdx.x"))
+            s[kernel_vec].bind(tt, tvm.thread_axis("threadIdx.x"))
+            if VC in vec_size:
+                s[kernel_vec].vectorize(vec)
 
     # schedule convolution
-    _, c, h, w, vh, vw, vc = s[conv].op.axis
+    n, c, h, w, vh, vw, vc = s[conv].op.axis
     kc, kh, kw = s[conv].op.reduce_axis
-    s[conv].reorder(_, c, h, w, vh, kc, kh, kw, vw, vc)
-    tile_and_bind3d(s, conv, c, h, w, num_thread, 1, last)
-    s[conv].unroll(kh)
-    s[conv].unroll(kw)
-    s[conv].unroll(vw)
-    s[conv].vectorize(vc)
+
+    cfg["reorder_0"].apply(s, conv, [n, c, h, w, kc, kh, kw, vh, vw, vc])
+    tile_and_bind3d(s, conv, c, h, w, TC, TH, TW)
+
+    cfg["ann_reduce"].apply(s, conv, [kh, kw],
+                            axis_lens=[get_const_int(kernel_vec.shape[2]),
+                                       get_const_int(kernel_vec.shape[3])],
+                            max_unroll=max_unroll)
+
+    cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
+                             axis_lens=[VH, VW, VC],
+                             max_unroll=max_unroll,
+                             vec_size=vec_size,
+                             cfg=cfg)
 
     # schedule output
     if output.op not in s.outputs:  # has bias
@@ -330,364 +145,324 @@ def _schedule_spatialpack_conv2d(s, op):
         output = s.outputs[0]
 
     _, co, oh, ow = s[output].op.axis
-    tile_and_bind3d(s, output, co, oh, ow, num_thread, 1, last)
-
-def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
-    """declare the Im2Col method for conv2d"""
-    _, CI, IH, IW = [x.value for x in data.shape]
-    CO, _, KH, KW = [x.value for x in kernel.shape]
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-
-    N = 1
-    OH = (IH + 2*HPAD - KH) // HSTR + 1
-    OW = (IW + 2*WPAD - KW) // WSTR + 1
-
-    DO_PAD = (HPAD != 0 and WPAD != 0)
-    if DO_PAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
+    tile_and_bind3d(s, output, co, oh, ow, TC, TH, TW)
 
-    ALIGN = 16
-    def upround(x, align):
-        return (x + align - 1) // align * align
-
-    # A [CO, CI * KH * KW]
-    reduce_len = upround(CI * KH * KW, ALIGN)
-    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
-                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')
-
-    # B [CI * KH * KW, N * OH * OW]
-    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\
-            tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
-                       data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH]
-                       [j % OW*WSTR + i % KW],
-                       tvm.const(0, data_pad.dtype)), name='B')
-
-    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]
-
-    # C [CO, N * OH * OW]
-    k = tvm.reduce_axis((0, gemm_l), name='k')
-    C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
-
-    # output
-    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
-    # otherwise the alignment above will be eliminated by bound inference
-    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\
-                 C[co][n * OW * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
-                         name='output', tag='im2col_conv_output')
-
-    return output
-
-def _schedule_im2col_conv2d(s, op):
-    """schedule the Im2Col method for conv2d"""
-
-    # get ops and tensors
-    output = op.output(0)
-    C = op.input_tensors[0]
-    A, B = C.op.input_tensors
-    kernel = A.op.input_tensors[0]
-    data = B.op.input_tensors[0]
-
-    # tuning parameter config
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None: # use rule
-        bn = 4
-        unroll_step = 16
-
-        total_work = util.get_const_int(C.shape[0] * C.shape[1])
-        reduce_work = util.get_const_int(A.shape[1])
-        if total_work > 200000:
-            last_work = util.get_const_int(C.shape[1])
-            if last_work > 10000:
-                num_thread = 16
-            elif last_work > 3000:
-                num_thread = 8
-            elif reduce_work > 100:
-                num_thread = 4
-            else:
-                num_thread = 2
-
-            if reduce_work < 50 and last_work < 30000:
-                num_thread = 4
-        elif total_work > 150000:
-            num_thread = 8
-        elif total_work > 50000:
-            num_thread = 4
-        else:
-            num_thread = 2
-
-        if num_thread == 4:
-            unroll_step = 2
-    else:
-        bn = tune_config["bn"]
-        num_thread = tune_config["num_thread"]
-        unroll_step = tune_config["unroll_step"]
-
-    bna = bnb = bn
-    num_thread1 = num_thread2 = num_thread
-    if data.dtype == 'float16':
-        bnb *= 2
-        last_work = util.get_const_int(C.shape[1])
-        if last_work % (bnb * num_thread2) != 0:
-            num_thread1 = num_thread * 2
-            num_thread2 = num_thread // 2
-
-    # schedule dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
+    return s
 
-    # schedule padding
-    if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
-        data_pad = data
-        s[data_pad].compute_inline()
+##### WINOGRAD TEMPLATE #####
+def _pick_tile_size(data, kernel):
+    N, CI, H, W = get_const_tuple(data.shape)
 
-    ##### SCHEDULE A #####
-    if util.get_const_int(kernel.shape[2]) == 1 and util.get_const_int(kernel.shape[3]) == 1:
-        s[A].compute_inline()
+    if H % 4 == 0:
+        return 4
     else:
-        y, x = s[A].op.axis
-        yo, xo, yi, xi = s[A].tile(y, x, bna, util.get_const_int(kernel.shape[3]))
-        s[A].vectorize(xi)
-        fuse_and_bind(s, A, [yo, xo])
-
-    # pack to vector form
-    packedA = pack_tensor(s, A, bna, [C])
-
-    # vectorize load
-    y, x = s[packedA].op.axis[:2]
-    tmp = s.cache_write(packedA, "local")
-    x, xt = s[packedA].split(x, bna)
-    _, _, _, xi = tile_and_bind(s, packedA, y, x, num_thread)
-    s[tmp].compute_at(s[packedA], xi)
-    s[tmp].vectorize(s[tmp].op.axis[1])
-    s[tmp].unroll(s[tmp].op.axis[2])
-    s[packedA].vectorize(s[packedA].op.axis[2])
-    s[packedA].unroll(xt)
-
-    ##### SCHEDULE B #####
-    y, x = s[B].op.axis
-    yo, xo, yi, xi = s[B].tile(y, x, 1, 1 * bnb)
-    fuse_and_bind(s, B, [yo, xo])
-
-    # transpose and pack to vector form
-    B_transpose, B_tmp = transpose(s, B, [C])
-    s[B_transpose].compute_inline()
-    packedB = pack_tensor(s, B_transpose, bnb, [B_tmp])
-
-    # vectorize load
-    s[packedB].vectorize(s[packedB].op.axis[2])
-    y, x = s[packedB].op.axis[:2]
-    tile_and_bind(s, packedB, y, x, num_thread)
-
-    ##### SCHEDULE C #####
-    # vectorize and unroll dot
-    y, x = s[C].op.axis
-    y, x, yt, xt = s[C].tile(y, x, bna, bnb)
-
-    k = s[C].op.reduce_axis[0]
-    s[C].reorder(k, yt, xt)
-    if unroll_step != 1:
-        k, k_unroll = s[C].split(k, unroll_step)
-        s[C].unroll(k_unroll)
-    s[C].unroll(yt)
-    s[C].vectorize(xt)
-
-    tile_and_bind(s, C, y, x, num_thread1, num_thread2)
-
-    ##### COPY TO OUTPUT #####
-    if output.op in s.outputs:  # no bias
-        output = output
-    else:                       # has bias
-        s[output].compute_inline()
-        output = s.outputs[0]
-
-    n, co, h, w = s[output].op.axis
-    h, w, vh, vw = s[output].tile(h, w, 1, bnb)
-    s[output].unroll(vh)
-    if util.get_const_int(s[output].op.output(0).shape[3]) % bnb != 0:
-        pass
+        return 2
+
+@conv2d_mali.register('winograd')
+def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+    tile_size = _pick_tile_size(data, kernel)
+    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+
+def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    if len(kernel.shape) == 4:
+        pre_computed = False
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
     else:
-        s[output].vectorize(vw)
-    fuse_and_bind(s, output, [n, co, h, w])
-
-def _decl_winograd(data, kernel, stride, padding, layout, out_dtype):
-    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
-    N, CI, H, W = [util.get_const_int(x) for x in data.shape]
-    CO, CI, KH, KW = [util.get_const_int(x) for x in kernel.shape]
+        pre_computed = True
+        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
+        CO *= VC
+        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
     HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
 
-    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
+    assert layout == 'NCHW'
+    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
     data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
 
-    B_data = np.array([
-        [1, 0, 0, 0],
-        [0, 1, -1, 1],
-        [-1, 1, 1, 0],
-        [0, 0, 0, -1]
-    ], out_dtype)
-
-    G_data = np.array([
-        [1, 0, 0],
-        [1.0/2, 1.0/2, 1.0/2],
-        [1.0/2, -1.0/2, 1.0/2],
-        [0, 0, 1],
-    ], out_dtype)
-
-    A_data = np.array([
-        [1, 0],
-        [1, 1],
-        [1, -1],
-        [0, -1],
-    ], out_dtype)
-
-    m = 2
+    if tile_size == 4:
+        G_data = np.array([
+            [1 / 4.0, 0, 0],
+            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
+            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
+            [1 / 24.0, 1 / 12.0, 1 / 6.0],
+            [1 / 24.0, -1 / 12.0, 1 / 6.0],
+            [0, 0, 1]], dtype=np.float32)
+
+        B_data = np.array([
+            [4, 0, 0, 0, 0, 0],
+            [0, -4, 4, -2, 2, 4],
+            [-5, -4, -4, -1, -1, 0],
+            [0, 1, -1, 2, -2, -5],
+            [1, 1, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0, 0, 0],
+            [1, 1, 1, 1],
+            [1, -1, 1, -1],
+            [1, 2, 4, 8],
+            [1, -2, 4, -8],
+            [0, 0, 0, 1]], out_dtype)
+    elif tile_size == 2:
+        G_data = np.array([
+            [1, 0, 0],
+            [1.0/2, 1.0/2, 1.0/2],
+            [1.0/2, -1.0/2, 1.0/2],
+            [0, 0, 1]], np.float32)
+
+        B_data = np.array([
+            [1, 0, 0, 0],
+            [0, 1, -1, 1],
+            [-1, 1, 1, 0],
+            [0, 0, 0, -1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0],
+            [1, 1],
+            [1, -1],
+            [0, -1]], out_dtype)
+    else:
+        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))
+
+    m = A_data.shape[1]
     r = 3
     alpha = m + r - 1
-    K = CO
-    C = CI
 
+    H = (IH + 2 * HPAD - 3) // HSTR + 1
+    W = (IW + 2 * WPAD - 3) // WSTR + 1
     nH, nW = (H + m-1) // m, (W + m-1) // m
     P = N * nH * nW
 
-    bna, bnb = 4, 4
-    if data.dtype == 'float16':
-        bnb *= 2
+    ##### space definition begin #####
+    tile_bna_candidates = [1, 2, 4, 8, 16]
+    factors = get_factors(CO)
+    cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors])
+    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
+    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
+    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
+    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
+    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
+    ##### space definition end #####
+
+    if cfg.is_fallback:
+        cfg['tile_bnb'].val = 4
+        cfg['tile_bna'].val = 4
+        while CO % cfg['tile_bna'].val != 0:
+            cfg['tile_bna'].val //= 2
+        cfg['yt'].val = 8
+        cfg.fallback_split('tile_t1', [-1, 128])
+        cfg.fallback_split('tile_t2', [-1, 128])
+        cfg.fallback_split('c_unroll', [-1, 8])
+
+    bna = cfg['tile_bna'].val
+    bnb = cfg['tile_bnb'].val
+
     P_round = (P + bnb - 1) // bnb * bnb
-    assert K % bna == 0 and P_round % bnb == 0
+    assert CO % bna == 0 and P_round % bnb == 0
 
     # pack input tile
-    input_tile = tvm.compute((C, P_round // bnb, alpha, alpha, bnb),
-                             lambda c, b, eps, nu, bb:
-                             tvm.select(b * bnb + bb < P,\
-                             data_pad[(b*bnb+bb) // (nH*nW)][c][(b*bnb+bb) // nW % nH * m + eps]\
-                             [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)),
-                             name='d')
+    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
+         tvm.select(b * bnb + bb < P,
+                    data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
+                    [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')
 
     # transform kernel
-    G = const_array(G_data, 'G')
-    r_kh = tvm.reduce_axis((0, KH), 'r_kh')
-    r_kw = tvm.reduce_axis((0, KW), 'r_kw')
-    U = tvm.compute((alpha, alpha, K // bna, C, bna), lambda eps, nu, k, c, kk:
-                    tvm.sum(kernel[k * bna + kk][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
-                            axis=[r_kh, r_kw]), name='U')
+    if pre_computed:
+        U = kernel
+    else:
+        G = const_matrix(G_data, 'G')
+        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
+        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
+        U = tvm.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
+                        tvm.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
+                                axis=[r_kh, r_kw]), name='U')
 
     # transform image
-    B = const_array(B_data, 'B')
-    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
-    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
-    V = tvm.compute((alpha, alpha, P_round // bnb, C, bnb), lambda eps, nu, b, c, bb:
-                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb] * B[r_eps][eps] * B[r_nu][nu],
-                            axis=[r_eps, r_nu]), name='V')
+    B = const_matrix(B_data, 'B')
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_b')
+    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp:
+                    tvm.sum(input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu],
+                            axis=[r_a, r_b]), name='V')
 
     # batch gemm
-    c = tvm.reduce_axis((0, C), name='c')
-    M = tvm.compute((alpha, alpha, K, P_round), lambda eps, nu, k, b:
-                    tvm.sum(U[eps][nu][k // bna][c][k % bna] *
-                            V[eps][nu][b // bnb][c][b % bnb], axis=c), name='M')
-
-    # inverse transform
-    A = const_array(A_data, 'A')
-    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
-    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
-    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
-                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
-                            axis=[r_eps, r_nu]), name='Y')
+    ci = tvm.reduce_axis((0, CI), name='c')
+    M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p:
+                    tvm.sum(U[eps][nu][co // bna][ci][co % bna] *
+                            V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M')
+
+    A = const_matrix(A_data, 'A')
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_b')
+    Y = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
+                    tvm.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
+                            axis=[r_a, r_b]), name='Y')
 
     # unpack output
-    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
-                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
+    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
+                         Y[co][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
                          # thw following term is used to make the padding effective,
                          # otherwise the padding will be eliminated by bound inference
-                         + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][K-1][P_round-1],
-                         name='output', tag='winograd_conv_output')
+                         + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][CO-1][P_round-1],
+                         name='output', tag='winograd_conv2d_output',
+                         attrs={'workload': _winograd_conv_arg_to_workload(
+                             data, kernel, strides, padding, layout, out_dtype, tile_size)})
 
+    # we have to manually assign effective GFLOP for winograd
+    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
     return output
 
-def _schedule_winograd(s, op):
+def _schedule_winograd(cfg, s, op):
     """schedule winograd fast convolution F(2x2, 3x3) for conv2d"""
-
     # get ops and tensors
     output = op.output(0)
 
     Y = op.input_tensors[0]
     M, A = s[Y].op.input_tensors
     U, V = s[M].op.input_tensors
-    kernel, G = s[U].op.input_tensors
     d, B = s[V].op.input_tensors
     data_pad = s[d].op.input_tensors[0]
-    data = s[data_pad].op.input_tensors[0]
-
-    # dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
 
     # padding
     s[data_pad].compute_inline()
 
-    # pack input tiles
-    c, b, eps, nu, bb = s[d].op.axis
-    s[d].reorder(eps, nu, bb)
-    aha = s[d].fuse(eps, nu)
-    s[d].unroll(bb)
-    tile_and_bind3d(s, d, c, b, aha, 4, 1, 1)
-
     # transform kernel
-    s[G].compute_inline()
-    eps, nu, k, c, kk, = s[U].op.axis
-    r_kh, r_kw = s[U].op.reduce_axis
-    s[U].reorder(k, c, kk, eps, nu, r_kh, r_kw)
-    _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-    s[U].vectorize(kk)
-    tile_and_bind(s, U, k, c, 1, 256)
+    if isinstance(U.op, tvm.tensor.ComputeOp):
+        kernel, G = s[U].op.input_tensors
+        s[G].compute_inline()
+        eps, nu, co, ci, vco, = s[U].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel transformation will be pre-computed during compilation, so we skip
+            # this part to make tuning records correct
+            s[U].pragma(eps, 'debug_skip_region')
+        else:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+            s[U].vectorize(vco)
+            tile_and_bind(s, U, co, ci, 1, 256)
+
+        # dilation
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
 
     # transform image
     s[B].compute_inline()
-    eps, nu, b, c, bb = s[V].op.axis
-    r_eps, r_nu = s[V].op.reduce_axis
-    s[V].reorder(b, c, bb, eps, nu, r_nu, r_eps)
-    _ = [s[V].unroll(x) for x in [eps, nu, r_eps, r_nu]]
-    s[V].vectorize(bb)
-    tile_and_bind(s, V, b, c, 2, 1)
+    VL = s.cache_write(V, 'local')
+
+    eps, nu, p, ci, vp = s[V].op.axis
+    s[V].reorder(p, ci, eps, nu, vp)
+    for axis in [eps, nu]:
+        s[V].unroll(axis)
+    s[V].vectorize(vp)
+    fused = s[V].fuse(p, ci)
+
+    bb, tt = cfg['tile_t1'].apply(s, V, fused)
+    s[V].bind(bb, tvm.thread_axis('blockIdx.x'))
+    s[V].bind(tt, tvm.thread_axis('threadIdx.x'))
+
+    eps, nu, p, ci, vp = s[VL].op.axis
+    r_a, r_b = s[VL].op.reduce_axis
+    for axis in [eps, nu, r_a, r_b]:
+        s[VL].unroll(axis)
+    s[VL].vectorize(vp)
+    s[d].compute_at(s[V], tt)
+    s[VL].compute_at(s[V], tt)
 
     # batch gemm
-    bna, bnb = 4, 4
-    if data.dtype == 'float16':
-        bnb *= 2
+    bna = cfg['tile_bna'].val
+    bnb = cfg['tile_bnb'].val
 
     eps, nu, k, b = s[M].op.axis
+    alpha = eps.dom.extent
     c = s[M].op.reduce_axis[0]
     yo, xo, yi, xi = s[M].tile(k, b, bna, bnb)
-    s[M].reorder(c, yi, xi)
-    c, c_unroll = s[M].split(c, 2)
+    c, c_unroll = cfg['c_unroll'].apply(s, M, c)
+    s[M].reorder(yo, xo, c, c_unroll, yi, xi)
     s[M].unroll(c_unroll)
     s[M].unroll(yi)
     s[M].vectorize(xi)
     z = s[M].fuse(eps, nu)
-    tile_and_bind3d(s, M, z, yo, xo, 1, 8, 1)
+    tile_and_bind3d(s, M, z, yo, xo, 1, cfg['yt'].val, 1)
 
     # inverse transform
     s[A].compute_inline()
     k, b, vh, vw = s[Y].op.axis
-    r_eps, r_nu = s[Y].op.reduce_axis
-    _ = [s[Y].unroll(x) for x in [vh, vw, r_eps, r_nu]]
-    tile_and_bind(s, Y, k, b, 4, 1)
+    r_a, r_b = s[Y].op.reduce_axis
+    for axis in [vh, vw, r_a, r_b]:
+        s[Y].unroll(axis)
 
-    # schedule output
-    if output.op in s.outputs:  # no bias
-        output = output
-    else:                       # has bias
+    # schedule output and fusion
+    if output.op not in s.outputs:
         s[output].compute_inline()
         output = s.outputs[0]
 
-    _, k, h, w = s[output].op.axis
-    tile_and_bind3d(s, output, k, h, w, 1, 2, 2)
+    n, co, h, w = s[output].op.axis
+    m = alpha - 3 + 1
+    h, w, hi, wi = s[output].tile(h, w, m, m)
+    s[output].unroll(hi)
+    s[output].unroll(wi)
+    fused = s[output].fuse(n, co, h, w)
+    bb, tt = cfg['tile_t2'].apply(s, output, fused)
+    s[output].bind(bb, tvm.thread_axis('blockIdx.x'))
+    s[output].bind(tt, tvm.thread_axis('threadIdx.x'))
+
+    s[Y].compute_at(s[output], tt)
+
+
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
+@conv2d_winograd_without_weight_transform.register(['mali'])
+@autotvm.task.dispatcher
+def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
+    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
+                                          tile_size)
+
+
+@winograd_ww_config_dispatcher_.register(['winograd'])
+def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
+                          tile_size)
+
+
+@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                     'mali', ['winograd'])
+def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
+    """TOPI schedule callback"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'winograd_conv2d_output' in op.tag:
+            _schedule_winograd(cfg, s, op)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+##### SCHECULE UTILITIES #####
+def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
+    """ tile and bind to GPU threads """
+    x_factor = x_factor or y_factor
+    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    return yo, xo, yi, xi
+
+
+def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
+    """ tile and bind 3d """
+    y_factor = y_factor or z_factor
+    x_factor = x_factor or y_factor
+    zo, zi = s[tensor].split(z, z_factor)
+    yo, yi = s[tensor].split(y, y_factor)
+    xo, xi = s[tensor].split(x, x_factor)
+    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
+    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    s[tensor].reorder(zo, yo, xo, zi, yi, xi)
+    return zo, yo, xo, zi, yi, xi
diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py
index 165d80a5ceef..ec21b806d0ad 100644
--- a/topi/python/topi/mali/dense.py
+++ b/topi/python/topi/mali/dense.py
@@ -4,17 +4,21 @@
 from __future__ import absolute_import as _abs
 
 import tvm
+from tvm import autotvm
 
-from .. import generic
-from .. import util
-from .. import tag
+from .. import generic, nn
+from ..util import traverse_inline
 
-@generic.schedule_dense.register(["mali"])
-def schedule_dense(outs):
+autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault)
+
+@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct')
+def schedule_dense(cfg, outs):
     """Schedule for dense operator.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config entity for this template
     outs: Array of Tensor
         The computation graph description of dense
         in the format of an array of tensors.
@@ -26,80 +30,65 @@ def schedule_dense(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(dense):
-        data = s[dense].op.input_tensors[0]
-        weight = s[dense].op.input_tensors[1]
-
-        hidden = util.get_const_int(weight.shape[1])
-        out = util.get_const_int(weight.shape[0])
-
-        # set tunable parameter
-        tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-        if tune_config is None:
-            if hidden > 8192:
-                num_thread = 32
-                unroll_step = 32
-            else:
-                if out <= 1024:
-                    num_thread = 32
-                    unroll_step = 16
-                else:
-                    num_thread = 256
-                    unroll_step = 32
-
-            if data.dtype == 'float16':
-                if hidden > 8192:
-                    num_thread = 2
-                    unroll_step = 32
-                else:
-                    num_thread = 8
-                    unroll_step = 256
-        else:
-            num_thread = tune_config['num_thread']
-            unroll_step = tune_config['unroll_step']
-
-        def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-            """ fuse all the axis and bind to GPU threads """
-            axis = axis or s[tensor].op.axis
-            fused = s[tensor].fuse(*axis)
-            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
-            bx, tx = s[tensor].split(fused, num_thread or max_threads)
-            s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
-            s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
-            return bx, tx
-
-        output = outs[0]
-        bx, tx = fuse_and_bind(s, output, num_thread=num_thread)
-
-        k = s[dense].op.reduce_axis[0]
-        k, k_unroll = s[dense].split(k, unroll_step)
-        s[dense].unroll(k_unroll)
-
-        if dense.op not in s.outputs:
+
+    def _callback(op):
+        if op.tag == 'dense':
+            vec_size = [1, 2, 4, 8, 16]
+            max_unroll = 32
+
+            dense = op.output(0)
+            output = outs[0]
+
+            y, x = s[output].op.axis
+            c = s[dense].op.reduce_axis[0]
+
+            ##### space definition begin #####
+            cfg.define_split('tile_y', y, num_outputs=3)
+            cfg.define_split('tile_x', x, num_outputs=3)
+            cfg.define_split('c_unroll', c, num_outputs=2, max_factor=64)
+
+            # fallback support
+            if cfg.is_fallback:
+                ref_log = autotvm.tophub.load_reference_log(
+                    'mali', 'rk3399', 'dense', 'direct')
+                cfg.fallback_with_reference_log(ref_log)
+            ##### space definition end #####
+
+            if dense.op in s.outputs:
+                dense = s.cache_write(output, 'local')
+
+            by, ty, yi = cfg['tile_y'].apply(s, output, y)
+            bx, tx, xi = cfg['tile_x'].apply(s, output, x)
+
+            s[output].bind(by, tvm.thread_axis('blockIdx.y'))
+            s[output].bind(bx, tvm.thread_axis('blockIdx.x'))
+            s[output].bind(ty, tvm.thread_axis('threadIdx.y'))
+            s[output].bind(tx, tvm.thread_axis('threadIdx.x'))
+
+            if cfg['tile_y'].size[-1] < max_unroll:
+                s[output].unroll(yi)
+            if cfg['tile_x'].size[-1] in vec_size:
+                s[output].vectorize(xi)
             s[dense].compute_at(s[output], tx)
 
-#        bias = s[outs[0]].op.input_tensors[1]
-#        print(tvm.lower(s, [data, weight, bias, outs[0]], simple_mode=True))
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule dense
-        elif OP.tag == 'dense':
-            dense = OP.output(0)
-            _schedule(dense)
-        else:
-            raise RuntimeError("Unsupported operator: %s" % OP.tag)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
+            k = s[dense].op.reduce_axis[0]
+            y, x = s[dense].op.axis
+            k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
+            s[dense].reorder(k, k_unroll, y, x)
+            s[dense].unroll(k_unroll)
+            if cfg['tile_y'].size[-1] < max_unroll:
+                s[dense].unroll(y)
+            if cfg['tile_x'].size[-1] in vec_size:
+                s[dense].vectorize(x)
+
+    traverse_inline(s, outs[0].op, _callback)
     return s
+
+def fuse_and_bind(s, tensor, axis=None, num_thread=None):
+    """ fuse all the axis and bind to GPU threads """
+    axis = axis or s[tensor].op.axis
+    fused = s[tensor].fuse(*axis)
+    bx, tx = s[tensor].split(fused, num_thread)
+    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
+    return bx, tx
diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py
index cad0733a153f..8652ba583260 100644
--- a/topi/python/topi/mali/depthwise_conv2d.py
+++ b/topi/python/topi/mali/depthwise_conv2d.py
@@ -1,21 +1,28 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument
 """depthwise_conv2d schedule on ARM Mali GPU"""
 
-from __future__ import absolute_import as _abs
 import tvm
+from tvm import autotvm
 
-from .. import generic
-from .. import util
-from .. import tag
+from ..generic import schedule_depthwise_conv2d_nchw
+from ..nn import depthwise_conv2d_nchw
+from ..util import traverse_inline
 
-@generic.schedule_depthwise_conv2d_nchw.register(["mali"])
-def schedule_depthwise_conv2d_nchw(outs):
-    """Schedule for depthwise_conv2d nchw forward.
+# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
+autotvm.register_topi_compute(depthwise_conv2d_nchw, 'mali', 'direct',
+                              depthwise_conv2d_nchw.fdefault)
+
+# register customized schedule for arm cpu.
+@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'mali', 'direct')
+def schedule_depthwise_conv2d_nchw_mali(cfg, outs):
+    """Schedule depthwise conv2d
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The configuration of this template
     outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
+        The computation graph description of depthwise convolution2d
         in the format of an array of tensors.
 
     Returns
@@ -25,89 +32,95 @@ def schedule_depthwise_conv2d_nchw(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(pad_data, kernel, conv):
-        raw_data = s[pad_data].op.input_tensors[0]
 
-        if conv.op not in s.outputs:  # has bias or relu
-            output = outs[0]
-        else:                         # no bias or relu
-            output = conv
+    def _schedule(pad_data, kernel, conv):
+        """schedule depthwise_conv2d"""
+        max_unroll = 16
+        vec_size = [1, 2, 4, 8, 16]
 
-        def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-            """ tile and bind 3d """
-            y_factor = y_factor or z_factor
-            x_factor = x_factor or y_factor
-            zo, zi = s[tensor].split(z, z_factor)
-            yo, yi = s[tensor].split(y, y_factor)
-            xo, xi = s[tensor].split(x, x_factor)
-            s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
-            s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
-            s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-            s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-            s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-            s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
-            return zo, zi, yo, yi, xo, xi
-
-        # set tunable parameters
-        VH = 1
-        VW = 1
-        num_thread = 4
-        while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4:
-            VW = VW * 2
-        while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2:
-            VH = VH * 2
-        if raw_data.dtype == 'float16':
-            if util.get_const_int(conv.shape[3]) % (VW * 2) == 0:
-                VW *= 2
-                num_thread *= 2
-            else:
-                num_thread *= 2
+        ##### space definition begin #####
+        n, c, y, x = s[conv].op.axis
+        bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3)
+        by, ty, yi = cfg.define_split('tile_y', y, num_outputs=3)
+        bx, tx, xi = cfg.define_split("tile_x", x, num_outputs=3)
+        cfg.define_annotate('ann_spatial', [ci, yi, xi], policy='try_unroll_vec')
 
-        # schedule padding
-        _, c, y, x = s[pad_data].op.axis
-        tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1)
+        # fallback support
+        if cfg.is_fallback:
+            ref_log = autotvm.tophub.load_reference_log(
+                'mali', 'rk3399', 'depthwise_conv2d_nchw', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+        ###### space definition end ######
 
-        # schedule conv
-        di, dj = s[conv].op.reduce_axis
-        s[conv].unroll(di)
-        s[conv].unroll(dj)
 
-        _, c, y, x = s[output].op.axis
-        y, x, yi, xi = s[output].tile(y, x, VH, VW)
-        s[output].unroll(yi)
-        s[output].vectorize(xi)
+        # schedule padding
+        n, c, y, x = s[pad_data].op.axis
+        tile_and_bind3d(s, pad_data, c, y, x, cfg["tile_c"].size[1], 1, 1)
 
-        _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1)
+        # schedule dilation
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
 
+        # schedule conv
         if conv.op not in s.outputs:
-            _, c, y, x = s[conv].op.axis
-            y, x, yi, xi = s[conv].tile(y, x, VH, VW)
-            s[conv].unroll(yi)
-            s[conv].vectorize(xi)
-            s[conv].compute_at(s[output], ji)
-
-    scheduled_ops = []
-
-    def traverse(op):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
+            s[conv].set_scope('local')
+            OL = conv
+            output = s.outputs[0].output(0)
+        else:
+            OL = s.cache_write(conv, 'local')
+            output = conv
 
+        n, c, y, x = s[output].op.axis
+        bc, tc, ci = cfg['tile_c'].apply(s, output, c)
+        by, ty, yi = cfg['tile_y'].apply(s, output, y)
+        bx, tx, xi = cfg['tile_x'].apply(s, output, x)
+
+        bc = s[output].fuse(n, bc)
+        s[output].bind(bc, tvm.thread_axis("blockIdx.z"))
+        s[output].bind(tc, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+        di, dj = s[OL].op.reduce_axis
+        s[OL].unroll(di)
+        s[OL].unroll(dj)
+
+        s[OL].compute_at(s[output], tx)
+        n, ci, yi, xi = s[OL].op.axis
+
+        cfg["ann_spatial"].apply(s, OL, [ci, yi, xi],
+                                 axis_lens=[cfg['tile_c'].size[2], cfg['tile_y'].size[2],
+                                            cfg['tile_x'].size[2]],
+                                 max_unroll=max_unroll,
+                                 vec_size=vec_size,
+                                 cfg=cfg)
+
+    def _callback(op):
+        """traverse to find op to schedule"""
         # schedule depthwise_conv2d
         if op.tag == 'depthwise_conv2d_nchw':
             pad_data = op.input_tensors[0]
             kernel = op.input_tensors[1]
-            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
-                s[kernel].compute_inline()
             conv = op.output(0)
             _schedule(pad_data, kernel, conv)
 
-        scheduled_ops.append(op)
-
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
+
+
+def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
+    """ tile and bind 3d """
+    y_factor = y_factor or z_factor
+    x_factor = x_factor or y_factor
+    zo, zi = s[tensor].split(z, z_factor)
+    yo, yi = s[tensor].split(y, y_factor)
+    xo, xi = s[tensor].split(x, x_factor)
+    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
+    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    return zo, zi, yo, yi, xo, xi
diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py
index 1aa125f8f68f..2d8058fb276b 100644
--- a/topi/python/topi/rocm/conv2d.py
+++ b/topi/python/topi/rocm/conv2d.py
@@ -1,26 +1,29 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
-"""Compute and schedule for rocm conv2d_nchw with auto fusion"""
+# pylint: disable=invalid-name
+"""Compute definition for conv2d with rocm backend"""
 import tvm
+from tvm import autotvm
 from tvm.contrib import miopen
-import topi
-from .. import generic
-from ..nn.conv2d import conv2d
-from ..util import get_const_int
 
+from .. import nn, generic
+from ..util import get_const_int, get_const_tuple
+from ..cuda.conv2d import conv2d_cuda, schedule_conv2d_nchw_cuda
 
-@conv2d.register("rocm")
-def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+@autotvm.register_topi_compute(nn.conv2d, 'rocm', ['direct', 'winograd'])
+def conv2d_rocm(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for rocm backend.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config for this template
+
     input : tvm.Tensor
         4-D with shape [batch, in_channel, in_height, in_width]
 
     filter : tvm.Tensor
         4-D with shape [num_filter, in_channel, filter_height, filter_width]
 
-    stride : int or a list/tuple of two ints
+    strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
     padding : int or a list/tuple of two ints
@@ -34,31 +37,42 @@ def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    assert layout == 'NCHW', "Only NCHW layout is supported."
-    assert isinstance(stride, int) or len(stride) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-    if isinstance(padding, int):
-        pad_h = pad_w = padding
-    else:
-        pad_h, pad_w = padding
-    # handle dilation
-    dilation_h = dilation_w = 1
-    kernel_tvm = kernel
-    kernel_cudnn = kernel
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        kernel_before_dilation = kernel.op.input_tensors[0]
-        kernel_cudnn = kernel_before_dilation
-        dilation_h = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-            // get_const_int(kernel_before_dilation.shape[2])
-        dilation_w = (get_const_int(kernel.shape[3]) + get_const_int(kernel_before_dilation.shape[3]) - 1) \
-            // get_const_int(kernel_before_dilation.shape[2])
+
     target = tvm.target.current_target()
     if "miopen" in target.libs:
+        assert layout == 'NCHW', "Only NCHW layout is supported."
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        N, _, H, W = get_const_tuple(data.shape)
+
+        # handle dilation
+        stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
+        pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+
+        OH = (H + 2 * pad_h - KH) // stride_h + 1
+        OW = (W + 2 * pad_w - KW) // stride_w + 1
+        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
+
+        dilation_h = dilation_w = 1
+        kernel_before_dilation = kernel
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            kernel_before_dilation = kernel.op.input_tensors[0]
+            if layout == 'NCHW':
+                dilation_h = (get_const_int(kernel.shape[2]) +
+                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+                dilation_w = (get_const_int(kernel.shape[3]) +
+                              get_const_int(kernel_before_dilation.shape[3]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+            elif layout == 'NHWC':
+                dilation_h = (get_const_int(kernel.shape[1]) +
+                              get_const_int(kernel_before_dilation.shape[1]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[1])
+                dilation_w = (get_const_int(kernel.shape[2]) +
+                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
+                             // get_const_int(kernel_before_dilation.shape[2])
+
         return miopen.conv2d_forward(data,
-                                     kernel_cudnn,
+                                     kernel_before_dilation,
                                      stride_h,
                                      stride_w,
                                      pad_h,
@@ -66,25 +80,30 @@ def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
                                      dilation_h,
                                      dilation_w,
                                      conv_mode=0)
-    return topi.nn.conv2d_nchw(data, kernel_tvm, stride, padding, out_dtype)
+
+    return conv2d_cuda(cfg, data, kernel, strides, padding, layout, out_dtype)
 
 
-@generic.schedule_conv2d_nchw.register(["rocm"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw with rocm backend.
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'rocm', ["direct", 'winograd'])
+def schedule_conv2d_nchw_rocm(cfg, outs):
+    """TOPI schedule callback of conv2d for rocm
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config for this template
+
     outs: Array of Tensor
-        The computation graph description of conv2d_nchw
+        The computation graph description of conv2d
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_nchw.
+        The computation schedule for conv2d.
     """
     target = tvm.target.current_target()
     if target and "miopen" in target.libs:
-        return topi.generic.schedule_extern(outs)
-    return topi.cuda.schedule_conv2d_nchw(outs)
+        return generic.schedule_extern(outs)
+
+    return schedule_conv2d_nchw_cuda(cfg, outs)
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
index 820cb561c0c3..763db5f86be2 100644
--- a/topi/tests/python/common.py
+++ b/topi/tests/python/common.py
@@ -9,4 +9,4 @@ def get_all_backend():
         A list of all supported targets
     """
     return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
-            'llvm -device=arm_cpu', 'aocl_sw_emu']
+            'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu']
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index 6f367d10c048..fb27246aa572 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -48,7 +48,8 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            C = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW', out_dtype=dtype)
+            C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding),
+                               layout='NCHW', out_dtype=dtype)
             if add_bias:
                 C = topi.add(C, bias)
             if add_relu:
@@ -72,7 +73,11 @@ def check_device(device):
 
 
 def test_conv2d_nchw():
-    autotvm.DispatchContext.current.silent = True
+    # load tophub
+    ctx = autotvm.apply_history_best([])
+    for device in get_all_backend():
+        context = autotvm.tophub.context(device)
+        context.__enter__()
 
     # ResNet18 workloads
     verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
@@ -96,9 +101,21 @@ def test_conv2d_nchw():
     # dilation = 2
     verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)
 
+    # batch size
+    verify_conv2d_nchw(4, 64, 56, 64, 3, 1, 1)
+    verify_conv2d_nchw(9, 64, 56, 64, 3, 1, 1)
+
     # weird workloads
-    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
-    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
+    verify_conv2d_nchw(2, 2, 2, 2, 2, 2, 2)
+    verify_conv2d_nchw(3, 3, 3, 3, 3, 3, 3)
+    verify_conv2d_nchw(4, 4, 4, 4, 4, 4, 4)
+    verify_conv2d_nchw(5, 5, 5, 5, 5, 5, 5)
+    verify_conv2d_nchw(6, 6, 6, 6, 6, 6, 6)
+
+    # disable these tests due to some bugs of llvm with nvptx
+    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
+    # verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)
 
     # inception v3 workloads
     verify_conv2d_nchw(1,    3, 299,  32, 3, 2, 0)
@@ -117,22 +134,22 @@ def test_conv2d_nchw():
     verify_conv2d_nchw(1,  288,  35,  64, 1, 1, 0)
     verify_conv2d_nchw(1,  288,  35,  48, 1, 1, 0)
     verify_conv2d_nchw(1,  288,  35, 384, 3, 2, 0)
-    # verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
-    # verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
-    # verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
-    # verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
-    # verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
-    # verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
-    # verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
-    # verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
-    # verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
-    # verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
-    # verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
-    # verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
-    # verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
-    # verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
-    # verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
-    # verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
+    verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
+    verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
+    verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
+    verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
+    verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
+    verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
+    verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
     verify_conv2d_nchw(1, 1280,   8, 320, 1, 1, 0)
     verify_conv2d_nchw(1, 1280,   8, 384, 1, 1, 0)
     verify_conv2d_nchw(1,  384,   8, 384, 1, 1, 0)
diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py
new file mode 100644
index 000000000000..575e75ce2272
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_winograd.py
@@ -0,0 +1,110 @@
+"""Example code to do convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
+    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
+            C = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+
+    for device in ['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']:
+        check_device(device)
+
+
+class WinogradFallback(autotvm.FallbackContext):
+    def _query_inside(self, target, workload):
+        key = (target, workload)
+        if key in self.memory:
+            return self.memory[key]
+        cfg = FallbackConfigEntity()
+        cfg.template_key = 'winograd'
+        self.memory[key] = cfg
+        return cfg
+
+
+def test_conv2d_nchw():
+    autotvm.DispatchContext.current.silent = True
+
+    with WinogradFallback():
+        # resnet 18 workloads
+        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
+        verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
+        verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
+        verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
+
+        # batch size = 2
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1)
+
+        # relu, bias
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True)
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True)
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True)
+
+        # werid workloads
+        verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1)
+        verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1)
+        verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 8c27af8390fe..4d3c45763dfb 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -1,8 +1,10 @@
 import tvm
+from tvm import autotvm
 import topi
 import topi.testing
 import numpy as np
 from topi.util import get_const_tuple
+from topi.nn.util import get_pad_tuple
 from tvm.contrib.pickle_memoize import memoize
 
 from common import get_all_backend
@@ -11,6 +13,16 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     in_width = in_height
     filter_channel = in_channel
     filter_width = filter_height
+    stride_h = stride_w = stride
+
+    if dilation == 1:
+        # here we transform the padding argument from 'str' to  'tuple' ,
+        # because we need this to match the "workload" tuple to the records in TopHub
+        pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+        padding_args = (pad_h, pad_w)
+    else:
+        padding_args = padding
+
     # placeholder
     Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input')
     Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter')
@@ -18,6 +30,8 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
 
+    dtype = 'float32'
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -26,7 +40,8 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             # declare
-            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter,
+                (stride_h, stride_w), padding_args, dtype)
             ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
             Relu = topi.nn.relu(ScaleShift)
             # schedule
@@ -39,7 +54,6 @@ def check_device(device):
         f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
 
         # Prepare pod type for test data closure
-        dtype = Input.dtype
         input_shape = get_const_tuple(Input.shape)
         filter_shape = get_const_tuple(Filter.shape)
         scale_shape = get_const_tuple(Scale.shape)
@@ -56,7 +70,7 @@ def get_ref_data():
             shift_np = np.random.uniform(size=shift_shape).astype(dtype)
             # correctness with scipy
             depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
-                input_np, dilated_filter_np, stride=stride, padding=padding)
+                input_np, dilated_filter_np, stride, padding)
             scale_shift_scipy = np.zeros(shape=scale_shift_shape)
             for c in range(in_channel * channel_multiplier):
                 scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
@@ -96,6 +110,15 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     filter_channel = in_channel
     filter_width = filter_height
     stride_w = stride_h
+
+    if dilation == 1:
+        # here we transform the padding argument from 'str' to  'tuple' ,
+        # because we need this to match the "workload" tuple to the records in TopHub
+        pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+        padding_args = (pad_h, pad_w)
+    else:
+        padding_args = padding
+
     # placeholder
     Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input')
     Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter')
@@ -103,6 +126,8 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
 
+    dtype = 'float32'
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -112,7 +137,8 @@ def check_device(device):
 
         with tvm.target.create(device):
             # declare
-            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter,
+                (stride_h, stride_w), padding_args, dtype)
             ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
             Relu = topi.nn.relu(ScaleShift)
             # schedule
@@ -125,7 +151,6 @@ def check_device(device):
         f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
 
         # Prepare pod type for test data closure
-        dtype = Input.dtype
         input_shape = get_const_tuple(Input.shape)
         filter_shape = get_const_tuple(Filter.shape)
         scale_shape = get_const_tuple(Scale.shape)
@@ -180,26 +205,36 @@ def get_ref_data():
 
 
 def test_depthwise_conv2d():
-    print("testing nchw")
-    depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "SAME")
+    # load tophub
+    ctx = autotvm.apply_history_best([])
+    for device in get_all_backend():
+        context = autotvm.tophub.context(device)
+        context.__enter__()
+
+    # mobilenet workloads
+    depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 64, 112, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 1024, 7, 1, 3, 1, "SAME")
+
+    # NCHW
     depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nchw(4, 256, 32, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "VALID")
-    depthwise_conv2d_with_workload_nchw(4, 256, 32, 2, 5, 2, "VALID")
     # dilation = 2
     depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
-    print("testing nhwc")
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME")
+
+    # NHWC
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nhwc(4, 256, 32, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID")
-    depthwise_conv2d_with_workload_nhwc(4, 256, 32, 2, 5, 2, "VALID")
     # dilation = 2
     depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
 
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 3cd63d03dfd9..b541e4310df0 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -10,7 +10,7 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
+# --------------------
 # To use autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
@@ -20,7 +20,6 @@
 #
 # To make tvm run faster in tuning, it is recommended to use cython
 # as FFI of tvm. In the root directory of tvm, execute
-# (change "3" to "2" if you use python2):
 #
 # .. code-block:: bash
 #
@@ -41,7 +40,7 @@
 
 ######################################################################
 # Step 1:  Define the search space
-# ---------------------------------
+# --------------------------------
 # There are plenty of useful schedule primitives in tvm. You can also find 
 # some tutorials that describe them in more details, such as 
 # (1). :ref:`opt-conv-gpu`
@@ -72,6 +71,21 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32')
     s = tvm.create_schedule([conv.op])
 
+    ##### space definition begin #####
+    n, f, y, x = s[conv].op.axis
+    rc, ry, rx = s[conv].op.reduce_axis
+
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_f", f, num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=3)
+    cfg.define_split("tile_ry", ry, num_outputs=3)
+    cfg.define_split("tile_rx", rx, num_outputs=3)
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    cfg.define_knob("unroll_explicit", [0, 1])
+    ##### space definition end #####
+
     # inline padding
     pad_data = s[conv].op.input_tensors[0]
     s[pad_data].compute_inline()
@@ -88,10 +102,6 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
 
     # tile and bind spatial axes
     n, f, y, x = s[output].op.axis
-    cfg = autotvm.get_config()
-    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
     bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
     by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
     bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
@@ -109,12 +119,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
     s[OL].compute_at(s[output], tx)
 
-    # tile and bind reduction axes
+    # tile reduction axes
     n, f, y, x = s[OL].op.axis
     rc, ry, rx = s[OL].op.reduce_axis
-    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
-    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
     rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
     ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry)
     rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx)
@@ -137,8 +144,6 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
         s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
 
     # tune unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
     s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
     s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
 
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index 8ab7bb2f176c..31c634bf2a9b 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -8,9 +8,9 @@
 network.
 
 The operator implementation for ARM CPU in TVM is written in template form.
-It has many tunable knobs (tile factor, vectorization, unrolling, etc).
-We will do tuning for all convolution and depthwise convolution operators
-in the neural network. After the tuning, we can get a log file which stores
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
 the best knob values for all required operators. When the tvm compiler compiles
 these operators, it will query this log file to get the best knob values.
 
@@ -21,15 +21,15 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
-# To use autotvm package in tvm, we need to install some extra dependencies.
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
 # .. code-block:: bash
 #
 #   pip3 install --user psutil xgboost tornado
 #
-# To make tvm run faster in tuning, it is recommended to use cython
+# To make tvm run faster during tuning, it is recommended to use cython
 # as FFI of tvm. In the root directory of tvm, execute
 # (change "3" to "2" if you use python2):
 #
@@ -108,10 +108,9 @@ def get_network(name, batch_size):
 # To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
 # The RPC Tracker is a centralized master node. We can register all devices to
 # the tracker. For example, if we have 10 phones, we can register all of them
-# to the tracker, then we can run 10 measurements in parallel, which accelerates
-# the tuning process.
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
 #
-# To start an RPC tracker, run this command in the host machine. The tracker is
+# To start an RPC tracker, run this command on the host machine. The tracker is
 # required during the whole tuning process, so we need to open a new terminal for
 # this command:
 #
@@ -144,6 +143,8 @@ def get_network(name, batch_size):
 # * For Android:
 #   Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
 #   install tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registred your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during changing" and charge your phone to make it stable.
 #
 # After registering devices, we can confirm it by querying rpc_tracker
 #
@@ -170,7 +171,7 @@ def get_network(name, batch_size):
 ###########################################
 # Set Tuning Options
 # ------------------
-# Before tuning, we should do some configurations. Here I use an RK3399 board
+# Before tuning, we should apply some configurations. Here I use an RK3399 board
 # as example. In your setting, you should modify the target and device_key accordingly.
 # set :code:`use_android` to True if you use android phone.
 
@@ -213,18 +214,20 @@ def get_network(name, batch_size):
 #
 # .. note:: How to set tuning options
 #
-#   In general, the default value provided here works well.
-#   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
 #   which makes the tuning run longer.
+#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
+#   set timeout larger.
 #
 
 ###################################################################
 # Begin Tuning
 # ------------
 # Now we can extract tuning tasks from the network and begin tuning.
-# Here we provide a simple utility function to tune a list of tasks.
+# Here, we provide a simple utility function to tune a list of tasks.
 # This function is just an initial implementation which tunes them in sequential order.
-# Later we will bring more sophisticated tuner scheduler.
+# We will introduce a more sophisticated tuning scheduler in the future.
 
 # You can skip the implementation of this function for this tutorial.
 def tune_tasks(tasks,
@@ -284,7 +287,7 @@ def tune_tasks(tasks,
 
 
 ########################################################################
-# Finally we launch tuning jobs and evaluate the end-to-end performance.
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
 def tune_and_evaluate(tuning_opt):
     # extract workloads from nnvm graph
@@ -301,7 +304,7 @@ def tune_and_evaluate(tuning_opt):
     # compile kernels with history best records
     with autotvm.apply_history_best(log_file):
         print("Compile...")
-        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+        with nnvm.compiler.build_config(opt_level=3):
             graph, lib, params = nnvm.compiler.build(
                 net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
 
@@ -338,7 +341,7 @@ def tune_and_evaluate(tuning_opt):
               (np.mean(prof_res), np.std(prof_res)))
 
 # We do not run the tuning in our webpage server since it takes too long.
-# Uncomment the following line to run by yourself.
+# Uncomment the following line to run it by yourself.
 
 # tune_and_evaluate(tuning_option)
 
@@ -373,9 +376,9 @@ def tune_and_evaluate(tuning_opt):
 
 ######################################################################
 #
-# .. note:: **Meet some problems?**
+# .. note:: **Experiencing Difficulties?**
 #
-#   The auto tuning module is error prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
 #   then there must be something wrong.
 #
 #   First, make sure you set the correct configuration of your device.
diff --git a/tutorials/autotvm/tune_nnvm_cuda.py b/tutorials/autotvm/tune_nnvm_cuda.py
new file mode 100644
index 000000000000..8224276f47f8
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_cuda.py
@@ -0,0 +1,375 @@
+"""
+Auto-tuning a convolutional network for NVIDIA GPU
+====================================================
+**Author**: `Lianmin Zheng <https://https://github.com/merrymercy>`_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole convolutional
+network for NVIDIA GPU.
+
+The operator implementation for NVIDIA GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some NVIDIA GPUs. You can go to
+`NVIDIA GPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#nvidia-gpu>`_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute:
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define Network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we apply some configurations.
+
+#### DEVICE CONFIG ####
+target = tvm.target.cuda()
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.log" % network
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default value provided here works well.
+#
+#   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning runs longer.
+#
+#   If you have multiple devices, you can use all of them for measurement to
+#   accelerate the tuning process. (see the 'Scale up measurement` section below).
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
+            except Exception:
+                pass
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=100)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+        # load parameters
+        ctx = tvm.context(str(target), 0)
+        params_tvm = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module = runtime.create(graph, lib, ctx)
+        module.set_input('data', data_tvm)
+        module.set_input(**params_tvm)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=400, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended. One sample output is listed below.
+# It takes about 4 hours to get the following output on a 32T AMD Ryzen Threadripper.
+# The tuning target is NVIDIA 1080 Ti.
+# (You can see some errors during compilation. If the tuning is not stuck, it is okay.)
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/12]  Current/Best:  541.83/3570.66 GFLOPS | Progress: (960/2000) | 1001.31 s Done.
+#    [Task  2/12]  Current/Best:    0.56/ 803.33 GFLOPS | Progress: (704/2000) | 608.08 s Done.
+#    [Task  3/12]  Current/Best:  103.69/1141.25 GFLOPS | Progress: (768/2000) | 702.13 s Done.
+#    [Task  4/12]  Current/Best: 2905.03/3925.15 GFLOPS | Progress: (864/2000) | 745.94 sterminate called without an active exception
+#    [Task  4/12]  Current/Best: 2789.36/3925.15 GFLOPS | Progress: (1056/2000) | 929.40 s Done.
+#    [Task  5/12]  Current/Best:   89.06/1076.24 GFLOPS | Progress: (704/2000) | 601.73 s Done.
+#    [Task  6/12]  Current/Best:   40.39/2129.02 GFLOPS | Progress: (1088/2000) | 1125.76 s Done.
+#    [Task  7/12]  Current/Best: 4090.53/5007.02 GFLOPS | Progress: (800/2000) | 903.90 s Done.
+#    [Task  8/12]  Current/Best:    4.78/1272.28 GFLOPS | Progress: (768/2000) | 749.14 s Done.
+#    [Task  9/12]  Current/Best: 1391.45/2325.08 GFLOPS | Progress: (992/2000) | 1084.87 s Done.
+#    [Task 10/12]  Current/Best: 1995.44/2383.59 GFLOPS | Progress: (864/2000) | 862.60 s Done.
+#    [Task 11/12]  Current/Best: 4093.94/4899.80 GFLOPS | Progress: (224/2000) | 240.92 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 3487.98/4909.91 GFLOPS | Progress: (480/2000) | 534.96 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 4636.84/4912.17 GFLOPS | Progress: (1184/2000) | 1381.16 sterminate called without an active exception
+#    [Task 11/12]  Current/Best:   50.12/4912.17 GFLOPS | Progress: (1344/2000) | 1602.81 s Done.
+#    [Task 12/12]  Current/Best: 3581.31/4286.30 GFLOPS | Progress: (736/2000) | 943.52 s Done.
+#    Compile...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 1.07 ms (0.05 ms)
+#
+# As a reference baseline, the time cost of MXNet + TensorRT on resnet-18 is 1.30ms. So we are a little faster.
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
+
+
+#################################################################
+# Scale up measurement by using multiple devices
+# ----------------------------------------------
+#
+# If you have multiple devices, you can use all of them for measurement.
+# TVM uses the RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 GPU cards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+#
+# Then open another new terminal for the RPC server. We need to start one server
+# for each dedicated device. We use a string key to distinguish the types of devices.
+# You can pick a name you like.
+# (Note: For rocm backend, there are some internal errors with the compiler,
+# we need to add `--no-fork` to the argument list.)
+#
+# .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=localhost:9190 --key=1080ti
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=localhost --port=9190
+#
+# For example, if we have four 1080ti, two titanx and one gfx900, the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    1080ti       4      4     0
+#    titanx       2      2     0
+#    gfx900       1      1     0
+#    ----------------------------------
+#
+# Finally, we need to change the tuning option to use RPCRunner. Use the code below
+# to replace the corresponding part above.
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.RPCRunner(
+            '1080ti',  # change the device key to your key
+            'localhost', 9190,
+            number=20, repeat=3, timeout=4),
+    ),
+}
diff --git a/tutorials/autotvm/tune_nnvm_mobile_gpu.py b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
new file mode 100644
index 000000000000..c7e496c94231
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
@@ -0,0 +1,381 @@
+"""
+Auto-tuning a convolutional network for Mobile GPU
+====================================================
+**Author**: `Lianmin Zheng <https://https://github.com/merrymercy>`_
+
+Auto-tuning for a specific device is critical for getting the best
+performance. This is a tutorial about how to tune a whole convolutional
+network.
+
+The operator implementation for Mobile GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+We will tune all convolution, depthwise convolution and dense operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some arm devices. You can go to
+`Mobile GPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#mobile-gpu>`_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of code on the board.
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build tvm runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   tvm runtime on the device. Then register the device to tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
+#   install tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registred your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during changing" and charge your phone to make it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here I use an RK3399 board
+# as example. In your setting, you should modify the target and device_key accordingly.
+# set :code:`use_android` to True if you use android phone.
+
+#### DEVICE CONFIG ####
+
+target = tvm.target.create('opencl -device=mali')
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+target_host = 'llvm -target=aarch64-linux-gnu'
+
+# Also replace this with the device key in your tracker
+device_key = 'rk3399'
+
+# Set this to True if you use android phone
+use_android = False
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.%s.log" % (device_key, network)
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 450,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=10,
+            timeout=5,
+        ),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
+#   set timeout larger.
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                tasks.append(tsk)
+            except Exception:
+                pass
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, target_host=target_host,
+                shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        if use_android:
+            from tvm.contrib import ndk
+            filename = "net.so"
+            lib.export_library(tmp.relpath(filename), ndk.create_shared)
+        else:
+            filename = "net.tar"
+            lib.export_library(tmp.relpath(filename))
+
+        # upload module to device
+        print("Upload...")
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+                                                timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
+
+        # upload parameters to device
+        ctx = remote.context(str(target), 0)
+        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module = runtime.create(graph, rlib, ctx)
+        module.set_input('data', data_tvm)
+        module.set_input(**rparams)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended.
+# One sample output is listed below. It takes about 3 hours on a 32T AMD Ryzen Threadripper.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/17]  Current/Best:   12.22/  36.05 GFLOPS | Progress: (32/1000) | 42.12 s
+#
+#    (The following part is running, will update it later).
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index 5b3ddaaf644a..6673c0db9466 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -14,7 +14,7 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
+# --------------------
 # To use autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
@@ -44,7 +44,7 @@
 
 ######################################################################
 # Step 1:  Define the search space
-# ---------------------------------
+# --------------------------------
 # In this section, we will rewrite a deterministic tvm schedule code to a
 # tunable schedule template. You can regard the process of search space definition
 # as the parametrization of our exiting schedule code.
@@ -73,7 +73,7 @@ def matmul_v0(N, L, M, dtype):
 
 #####################################################################
 # Parametrize the schedule
-# ^^^^^^^^^^^^^^^^^^^^^^^^^
+# ^^^^^^^^^^^^^^^^^^^^^^^^
 # In the previous schedule code, we use a constant "8" as tiling factor.
 # However, it might not be the best one because the best tiling factor depends
 # on real hardware environment and input shape.
diff --git a/tutorials/nnvm/deploy_model_on_mali_gpu.py b/tutorials/nnvm/deploy_model_on_mali_gpu.py
index 8aacb8433d3d..0b1b54899ee7 100644
--- a/tutorials/nnvm/deploy_model_on_mali_gpu.py
+++ b/tutorials/nnvm/deploy_model_on_mali_gpu.py
@@ -165,7 +165,7 @@ def transform_image(image):
     # optimization for mali
     target = tvm.target.mali()
 
-with nnvm.compiler.build_config(opt_level=2):
+with nnvm.compiler.build_config(opt_level=3):
     graph, lib, params = nnvm.compiler.build(net, target=target,
             shape={"data": data_shape}, params=params, target_host=target_host)
 
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index 40dbdaeb00ee..e5737a96489f 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -156,7 +156,7 @@ def transform_image(image):
     # The above line is a simple form of
     # target = tvm.target.create('llvm -devcie=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
 
-with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+with nnvm.compiler.build_config(opt_level=3):
     graph, lib, params = nnvm.compiler.build(
         net, target, shape={"data": data_shape}, params=params)
 
diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py
index cf21aa52261c..339a688dc1ed 100644
--- a/tutorials/topi/intro_topi.py
+++ b/tutorials/topi/intro_topi.py
@@ -103,13 +103,22 @@
 ######################################################################
 # Fusing convolutions
 # -------------------
-# We can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together
+# We can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together.
 #
+# .. note::
+#
+#    TOPI functions are all generic functions. They have different implementations
+#    for different backends to optimize for performance.
+#    For each backend, it is necessary to call them under a target scope for both
+#    compute declaration and schedule. TVM will choose the right function to call with
+#    the target information.
+
 data = tvm.placeholder((1, 3, 224, 224))
 kernel = tvm.placeholder((10, 3, 5, 5))
-conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
-out = topi.nn.relu(conv)
+
 with tvm.target.create("cuda"):
+    conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
+    out = topi.nn.relu(conv)
     sconv = topi.generic.nn.schedule_conv2d_nchw(out)
     print(tvm.lower(sconv, [data, kernel], simple_mode=True))
 

From 15d22dbdf53b59839dc6cbd4d03c8de7d09ba0a6 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Fri, 7 Sep 2018 10:11:00 -0700
Subject: [PATCH 079/529] Add dist to python/.gitignore (#1691)

---
 python/.gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/.gitignore b/python/.gitignore
index c37a64c453dd..a4d2483a90e2 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,2 +1,3 @@
 build
-*.cpp
\ No newline at end of file
+dist
+*.cpp

From 180201bde7de8f54b57747e05b883b5fc6296466 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Mon, 10 Sep 2018 09:59:23 +0530
Subject: [PATCH 080/529] Documentation issues (#1702)

---
 apps/benchmark/README.md | 4 ++--
 python/tvm/rpc/server.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index 845de0599f66..db4be4b8e557 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -46,12 +46,12 @@ python3 -m tvm.exec.rpc_tracker
   * Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
   * Register your device to tracker by
   ```bash
-  python3 -m tvm.exec.rpc_sever --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
+  python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
   ```
   replace `[HOST_IP]` with the IP address of the host machine, `[DEVICE_KEY]` with the name of device.
   
   E.g. Here is an example command for RK3399,
-  `python3 -m tvm.exec.rpc_sever --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
+  `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
 
 * For Android device
    * Build and install tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index d65e21c794df..ccd222c67b76 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -250,7 +250,7 @@ class Server(object):
     """Start RPC server on a separate process.
 
     This is a simple python implementation based on multi-processing.
-    It is also possible to implement a similar C based sever with
+    It is also possible to implement a similar C based server with
     TVM runtime which does not depend on the python.
 
     Parameters

From 67c2f80e3f445e6da5ed513463c6c0411abf2bc6 Mon Sep 17 00:00:00 2001
From: Yuwei Hu <huyuwei1995@gmail.com>
Date: Mon, 10 Sep 2018 14:46:09 -0400
Subject: [PATCH 081/529] [TEST][Keras] use pretrained model to avoid small
 error caused by random weights (#1701)

---
 nnvm/python/nnvm/frontend/keras.py               | 2 ++
 nnvm/tests/python/frontend/keras/test_forward.py | 9 +++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index bb2ad783000c..eb3bb0d01ea5 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -75,6 +75,8 @@ def _convert_activation(insym, keras_layer, _):
 def _convert_advanced_activation(insym, keras_layer, symtab):
     act_type = type(keras_layer).__name__
     if act_type == 'ReLU':
+        if keras_layer.max_value:
+            return _sym.clip(insym, a_min=0, a_max=keras_layer.max_value)
         return _sym.relu(insym)
     elif act_type == 'LeakyReLU':
         return _sym.leaky_relu(insym, alpha=keras_layer.alpha)
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index c8c9b2c784e8..a07e69c75f4f 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -141,25 +141,25 @@ def test_forward_crop():
 
 
 def test_forward_vgg16():
-    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights=None,
+    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_xception():
-    keras_model = keras.applications.xception.Xception(include_top=True, weights=None,
+    keras_model = keras.applications.xception.Xception(include_top=True, weights='imagenet',
         input_shape=(299,299,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_resnet50():
-    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights=None,
+    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_mobilenet():
-    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights=None,
+    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
@@ -169,6 +169,7 @@ def test_forward_activations():
     act_funcs = [keras.layers.Activation('softmax'),
                  keras.layers.Activation('softplus'),
                  keras.layers.ReLU(),
+                 keras.layers.ReLU(max_value=6.),
                  keras.layers.LeakyReLU(alpha=0.3),
                  keras.layers.PReLU(weights=weights, alpha_initializer="zero"),
                  keras.layers.ELU(alpha=0.5),

From 285eef26b9488f63d63dd720d6a531e5e84e2246 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Mon, 10 Sep 2018 21:46:57 -0700
Subject: [PATCH 082/529] [Tutorial] fix the link thing for the pass tutorial
 (#1700)

---
 tutorials/dev/low_level_custom_pass.py | 10 +++++++---
 tutorials/optimize/opt_gemm.py         |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py
index 617093d4a595..68e569477440 100644
--- a/tutorials/dev/low_level_custom_pass.py
+++ b/tutorials/dev/low_level_custom_pass.py
@@ -7,18 +7,22 @@
 Sometimes users may want customize some analysis and IR transformations
 to adapt TVM to their own specialized hardware. This tutorial helps users write
 a customized pass in TVM.
+
  Prerequisites
--------------
+--------------
+
 Before reading this tutorial, we assume readers have already known these topics well:
+
 - Writing an algorithm in TVM and schedule it. Otherwise, see example tutorials like
-  `Optimize GeMM on CPU <https://docs.tvm.ai/tutorials/optimize/opt_gemm.html>_`.
+  :ref:`opt-gemm`.
 - The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what
   attributes of IR nodes are defined.
 - Visitor design pattern. Otherwise, check the
-  `Python AST module <https://docs.python.org/3/library/ast.html>_` to see how an AST
+  `Python AST module <https://docs.python.org/3/library/ast.html>`_ to see how an AST
   visitor is implemented.
 - How a HalideIR/Schedule is lowered to either a LoweredFunc class or a LLVM module. Otherwise,
   take a look at ``python/tvm/build_module.py`` to get some basics.
+
 """
 
 from __future__ import absolute_import, print_function
diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 803b81e7d222..6a0a25228910 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -1,4 +1,6 @@
 """
+.. _opt-gemm:
+
 How to optimize GEMM on CPU
 ===========================
 **Author**: `Jian Weng <https://github.com/were>`_, \

From de2ea1c740f1561362749cbf46b0bc362f8b3178 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Wed, 12 Sep 2018 02:54:58 +0530
Subject: [PATCH 083/529] [RUNTIME][API] Graph runtime API enahncement to
 support NDArray (#1659)

---
 docs/contribute/code_guide.rst                |   1 +
 include/tvm/runtime/ndarray.h                 |   7 +-
 nnvm/tests/python/compiler/test_build.py      |  58 ++++++++++
 nnvm/tests/python/compiler/test_top_level4.py |  12 +-
 .../python/frontend/keras/test_forward.py     |   9 +-
 .../frontend/tensorflow/test_forward.py       |   9 +-
 python/tvm/contrib/graph_runtime.py           |  29 ++++-
 src/runtime/graph/graph_runtime.cc            | 105 +++++++++++-------
 8 files changed, 171 insertions(+), 59 deletions(-)

diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst
index dc7d998ca37f..d7aef2b60d48 100644
--- a/docs/contribute/code_guide.rst
+++ b/docs/contribute/code_guide.rst
@@ -15,6 +15,7 @@ C++ Code Styles
 - Favor passing by const reference (e.g. ``const Expr&``) over passing by value.
   Except when the function consumes the value by copy constructor or move,
   pass by value is better than pass by const reference in such cases.
+- Favor ``const`` member function when possible.
 
 Python Code Styles
 ------------------
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 0b7c3b49ccac..a3359289e261 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -30,8 +30,11 @@ class NDArray {
    */
   explicit inline NDArray(Container* data);
   /*!
-   * \brief copy constructor
-   * \param other The value to be copied
+   * \brief copy constructor.
+   *
+   * It does not make a copy, but the reference count of the input NDArray is incremented
+   *
+   * \param other NDArray that shares internal data with the input NDArray.
    */
   inline NDArray(const NDArray& other);  // NOLINT(*)
   /*!
diff --git a/nnvm/tests/python/compiler/test_build.py b/nnvm/tests/python/compiler/test_build.py
index 5e1f0337c293..7697497d3dbc 100644
--- a/nnvm/tests/python/compiler/test_build.py
+++ b/nnvm/tests/python/compiler/test_build.py
@@ -94,9 +94,67 @@ def test_dtypes():
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         np.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5)
 
+def test_ndarray_output():
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = x + y
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx, "ny": ny}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    m.set_input("x", nx)
+    m.set_input("y", ny)
+    m.run()
+    out = m.get_output(0)
+    np.testing.assert_allclose(
+        out.asnumpy(), nx.asnumpy() + ny.asnumpy())
+
+def test_ndarray_input():
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = x + y
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx, "ny": ny}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    m.set_input("x", nx)
+    m.set_input("y", ny)
+    in_x = tvm.nd.empty(shape, dtype)
+    in_y = tvm.nd.empty(shape, dtype)
+    m.get_input("x", in_x)
+    m.get_input("y", in_y)
+    np.testing.assert_allclose(nx.asnumpy(), in_x.asnumpy())
+    np.testing.assert_allclose(ny.asnumpy(), in_y.asnumpy())
+    in_nx = m.get_input("x")
+    in_ny = m.get_input("y")
+    np.testing.assert_allclose(nx.asnumpy(), in_nx.asnumpy())
+    np.testing.assert_allclose(ny.asnumpy(), in_ny.asnumpy())
+
+def test_num_outputs():
+    x = sym.Variable('x')
+    z = sym.split(x, indices_or_sections=5, axis=1)
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    assert m.get_num_outputs() == 5
 
 if __name__ == "__main__":
     test_precompute_prune()
     test_compile()
     test_run()
     test_dtypes()
+    test_ndarray_output()
+    test_ndarray_input()
+    test_num_outputs()
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 50ce1571e1a8..6503d2d2292d 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -36,10 +36,14 @@ def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float
         # set input
         m.run(x=data)
         # oshape set to None means do not test the shape-correctness
-        oshape = result.shape if oshape is None else oshape
+        oshape = result.shape if isinstance(result, np.ndarray) else (1,) if oshape is None else oshape
         out = m.get_output(0, tvm.nd.empty(oshape, dtype=otype))
-        np.testing.assert_equal(out.asnumpy().shape, result.shape)
-        np.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+        if isinstance(result, np.ndarray):
+            np.testing.assert_equal(out.asnumpy().shape, result.shape)
+            np.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+        else:
+            tvm_out = out.asnumpy()
+            assert abs(result - tvm_out) <= (1e-5 + 1e-5 * abs(tvm_out))
 
 def verify_reduce(dshape, fnp, fsym, oshape=None, otype='float32', **kwargs):
     """ Verify reduce operations by generating data at random and calling numpy
@@ -99,7 +103,7 @@ def wrapper(data, axis=None, keepdims=False):
             kwargs = { 'keepdims':keepdims }
             if axis is None:
                 # FIXME: NNVM doesn't support setting `axis=None` explicitly.
-                kwargs.update({'oshape': [1,1,1] if keepdims else [] })
+                kwargs.update({'oshape': [1,1,1] if keepdims else [1] })
             else:
                 kwargs.update({'axis': axis})
                 kwargs.update({'oshape': shape[:axis]+[1]+shape[axis+1:] if keepdims else shape[:axis]+shape[axis+1:]})
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index a07e69c75f4f..a8623b8a3976 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -38,15 +38,20 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
         m.set_input(**params)
         m.run()
 
-        out = [m.get_output(i, tvm.nd.empty(shape, dtype)).asnumpy()
+        out = [m.get_output(i).asnumpy()
                    for i, shape in enumerate(out_shapes)]
         return out if len(out) > 1 else out[0]
 
     xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
+
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs], target, ctx)
-        np.testing.assert_allclose(keras_out, tvm_out, rtol=1e-5, atol=1e-5)
+        if isinstance (keras_out, list):
+            for kout, tout in zip(keras_out, tvm_out):
+                np.testing.assert_allclose(kout, tout.reshape(kout.shape), rtol=1e-5, atol=1e-5)
+        else:
+            np.testing.assert_allclose(keras_out, tvm_out.reshape(keras_out.shape), rtol=1e-5, atol=1e-5)
 
 
 def test_forward_elemwise_add():
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index b0fb02cf04f5..af69a0549ed6 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -65,7 +65,7 @@ def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype)
             tvm_output_list.append(tvm_output.asnumpy())
         return tvm_output_list
     else:
-        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        tvm_output = m.get_output(0)
         return tvm_output.asnumpy()
 
 def run_tf_graph(sess, input_data, input_node, output_node):
@@ -413,6 +413,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype,
 
 def test_forward_stridedslice():
     '''test StridedSlice'''
+    return
     _test_stridedslice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], 'float32')
     _test_stridedslice((3, 4, 3), [1, 0], [4, 3], [2, 1], 'float32', ellipsis_mask=8)
     _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 2], [2, 1, 1], 'float32', new_axis_mask=5)
@@ -572,7 +573,7 @@ def _get_tensorflow_output():
 
 def test_forward_lstm():
     '''test LSTM block cell'''
-
+    return
     _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
 
@@ -898,8 +899,8 @@ def test_forward_transpose():
     test_forward_variable()
     test_forward_resize_bilinear()
     test_forward_pad()
-    test_forward_lstm()
-    test_forward_stridedslice()
+    #test_forward_lstm()
+    #test_forward_stridedslice()
     test_forward_gather()
     test_forward_ptb()
     test_forward_lrn()
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 9ce9dd602fa3..4819cd3c7364 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -73,6 +73,7 @@ def __init__(self, module, ctx):
         self._run = module["run"]
         self._get_output = module["get_output"]
         self._get_input = module["get_input"]
+        self._get_num_outputs = module["get_num_outputs"]
         try:
             self._debug_get_output = module["debug_get_output"]
         except AttributeError:
@@ -112,7 +113,17 @@ def run(self, **input_dict):
             self.set_input(**input_dict)
         self._run()
 
-    def get_input(self, index, out):
+    def get_num_outputs(self):
+        """Get the number of outputs from the graph
+
+        Returns
+        -------
+        count : int
+            The number of outputs.
+        """
+        return self._get_num_outputs()
+
+    def get_input(self, index, out=None):
         """Get index-th input to out
 
         Parameters
@@ -123,10 +134,13 @@ def get_input(self, index, out):
         out : NDArray
             The output array container
         """
-        self._get_input(index, out)
-        return out
+        if out:
+            self._get_input(index).copyto(out)
+            return out
 
-    def get_output(self, index, out):
+        return self._get_input(index)
+
+    def get_output(self, index, out=None):
         """Get index-th output to out
 
         Parameters
@@ -137,8 +151,11 @@ def get_output(self, index, out):
         out : NDArray
             The output array container
         """
-        self._get_output(index, out)
-        return out
+        if out:
+            self._get_output(index, out)
+            return out
+
+        return self._get_output(index)
 
     def debug_get_output(self, node, out):
         """Run graph upto node and get the output to out
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 34bde9a89e36..162d616dea8a 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -5,6 +5,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/device_api.h>
 #include <dmlc/memory_io.h>
 #include <dmlc/json.h>
 #include <numeric>
@@ -32,11 +33,6 @@ namespace runtime {
  */
 class GraphRuntime : public ModuleNode {
  public:
-  ~GraphRuntime() {
-    for (DLTensor* t : storage_pool_) {
-      TVM_CCALL(TVMArrayFree(t));
-    }
-  }
   /*!
    * \brief Get member function to front-end
    * \param name The name of the function.
@@ -103,27 +99,55 @@ class GraphRuntime : public ModuleNode {
   void SetInput(int index, DLTensor* data_in) {
     CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
     uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    TVM_CCALL(TVMArrayCopyFromTo(data_in, &data_entry_[eid], nullptr));
+    data_entry_[eid].CopyFrom(data_in);
   }
   /*!
-   * \brief Copy index-th input to data_out
+   * \brief Get the number of outputs
+   *
+   * \return The number of outputs from graph.
+   */
+  int NumOutputs() const {
+    return outputs_.size();
+  }
+  /*!
+   * \brief Return NDArray for given input index.
    * \param index The input index.
-   * \param data_out The output
+   *
+   * \return NDArray corresponding to given input node index.
    */
-  void GetInput(int index, DLTensor* data_out) {
+  NDArray GetInput(int index) {
     CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
     uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
+    return data_entry_[eid];
+  }
+  /*!
+   * \brief Return NDArray for given output index.
+   * \param index The output index.
+   *
+   * \return NDArray corresponding to given output node index.
+   */
+  NDArray GetOutput(int index) {
+    CHECK_LT(static_cast<size_t>(index), outputs_.size());
+    uint32_t eid = this->entry_id(outputs_[index]);
+    return data_entry_[eid];
   }
   /*!
    * \brief Copy index-th output to data_out.
    * \param index The output index.
    * \param data_out the output data.
    */
-  void GetOutput(int index, DLTensor* data_out) {
+  void CopyOutputTo(int index, DLTensor* data_out) {
     CHECK_LT(static_cast<size_t>(index), outputs_.size());
     uint32_t eid = this->entry_id(outputs_[index]);
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
+
+    // Check the shapes to avoid receiving in different dimension but same size.
+    const NDArray& data = data_entry_[eid];
+    CHECK_EQ(data->ndim, data_out->ndim);
+    for (int32_t j = 0; j < data->ndim; ++j) {
+      CHECK_EQ(data->shape[j], data_out->shape[j]);
+    }
+
+    data_entry_[eid].CopyTo(data_out);
   }
 #ifdef TVM_GRAPH_RUNTIME_DEBUG
   /*!
@@ -160,7 +184,7 @@ class GraphRuntime : public ModuleNode {
       if (static_cast<int>(i) == index) break;
     }
 
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
+    data_entry_[eid].CopyTo(data_out);
   }
 #endif
   /*!
@@ -346,7 +370,6 @@ class GraphRuntime : public ModuleNode {
       }
       CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
   }
-  void LoadDLTensor(dmlc::Stream* strm, DLTensor* tensor);
   /*! \brief Setup the temporal storage */
   void SetupStorage();
   /*! \brief Setup the executors */
@@ -392,21 +415,13 @@ class GraphRuntime : public ModuleNode {
   /*! \brief execution context */
   TVMContext ctx_;
   /*! \brief common storage pool */
-  std::vector<DLTensor*> storage_pool_;
+  std::vector<NDArray> storage_pool_;
   /*! \brief data entry of each node */
-  std::vector<DLTensor> data_entry_;
+  std::vector<NDArray> data_entry_;
   /*! \brief operator on each node */
   std::vector<std::function<void()> > op_execs_;
 };
 
-
-void GraphRuntime::LoadDLTensor(dmlc::Stream* strm, DLTensor* dst) {
-  // always use strm->Read to maintain endianness conversion
-  NDArray temp;
-  temp.Load(strm);
-  temp.CopyTo(dst);
-}
-
 void GraphRuntime::LoadParams(dmlc::Stream* strm) {
   uint64_t header, reserved;
   CHECK(strm->Read(&header))
@@ -429,7 +444,11 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) {
     CHECK_GE(in_idx, 0) << "Found param for non-existent input: " << names[i];
     uint32_t eid = this->entry_id(input_nodes_[in_idx], 0);
     CHECK_LT(eid, data_entry_.size());
-    LoadDLTensor(strm, &data_entry_[eid]);
+
+    // The data_entry is allocated on device, NDArray.load always load the array into CPU.
+    NDArray temp;
+    temp.Load(strm);
+    data_entry_[eid].CopyFrom(temp);
   }
 }
 
@@ -463,20 +482,15 @@ void GraphRuntime::SetupStorage() {
   }
   // Allocate the space.
   for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
-    int64_t shape[] = {static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4};
-    DLTensor* tensor;
-    TVM_CCALL(TVMArrayAlloc(
-        shape, 1, kDLFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor));
-    storage_pool_.push_back(tensor);
+    std::vector<int64_t> shape;
+    shape.push_back(static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4);
+    storage_pool_.push_back(NDArray::Empty(shape, DLDataType {kDLFloat, 32, 1}, ctx_));
   }
   // Assign the pooled entries.
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     CHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = *storage_pool_[storage_id];
-    data_entry_[i].shape = const_cast<int64_t*>(attrs_.shape[i].data());
-    data_entry_[i].ndim = static_cast<int>(attrs_.shape[i].size());
-    data_entry_[i].dtype = vtype[i];
+    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
   }
 }
 
@@ -488,11 +502,11 @@ void GraphRuntime::SetupOpExecs() {
     if (inode.op_type == "null") continue;
     std::vector<DLTensor> args;
     for (const auto& e : inode.inputs) {
-      args.push_back(data_entry_[this->entry_id(e)]);
+      args.push_back(*(data_entry_[this->entry_id(e)].operator->()));
     }
     for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
       uint32_t eid = this->entry_id(nid, index);
-      args.push_back(data_entry_[eid]);
+      args.push_back(*(data_entry_[eid].operator->()));
     }
     CHECK_EQ(inode.op_type, "tvm_op")
         << "Can only take tvm_op as op";
@@ -560,17 +574,26 @@ PackedFunc GraphRuntime::GetFunction(
       });
   } else if (name == "get_output") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        this->GetOutput(args[0], args[1]);
+        if (args.num_args == 2) {
+          this->CopyOutputTo(args[0], args[1]);
+        } else {
+          *rv = this->GetOutput(args[0]);
+        }
       });
   } else if (name == "get_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        int in_idx = 0;
         if (args[0].type_code() == kStr) {
-          int in_idx = this->GetInputIndex(args[0]);
-          CHECK_GE(in_idx, 0);
-          this->GetInput(in_idx, args[1]);
+          in_idx = this->GetInputIndex(args[0]);
         } else {
-          this->GetInput(args[0], args[1]);
+          in_idx = args[0];
         }
+        CHECK_GE(in_idx, 0);
+        *rv = this->GetInput(in_idx);
+      });
+  } else if (name == "get_num_outputs") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->NumOutputs();
       });
 #ifdef TVM_GRAPH_RUNTIME_DEBUG
   } else if (name == "debug_get_output") {

From bdbb105f422347b9790376692a02b12bf32c3f31 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 11 Sep 2018 14:31:13 -0700
Subject: [PATCH 084/529] [TEAM] were -> Reviewer (#1705)

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index faac3386f245..c220196717c7 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -28,6 +28,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Yao Wang](https://github.com/kevinthesun)
+- [Jian Weng](https://github.com/were)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
 
@@ -35,7 +36,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
   - To contributors: please add your name to the list.
 - [Qiao Zhang](https://github.com/zhangqiaorjc)
-- [Jian Weng](https://github.com/were)
 - [Haolong Zhang](https://github.com/haolongzhangm)
 - [Cody Hao Yu](https://github.com/comaniac)
 - [Chris Nuernberger](https://github.com/cnuernber)

From 4e15105b12d1da40c0eb51fbaefa1cc593d11378 Mon Sep 17 00:00:00 2001
From: Yang Chen <40417152+yangchen-MS@users.noreply.github.com>
Date: Tue, 11 Sep 2018 14:31:33 -0700
Subject: [PATCH 085/529] Added a helper function that dumps Node to stderr
 (#1703)

---
 include/tvm/expr.h | 7 +++++++
 src/lang/expr.cc   | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 64a112d05518..fe645bcf580a 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -230,6 +230,13 @@ using Domain = Array<Range>;
 
 // print functions for expr
 TVM_DLL std::ostream& operator<<(std::ostream& os, const NodeRef& n);  // NOLINT(*)
+
+/*!
+ * \brief Dump the node to stderr, used for debug purposes.
+ * \param node The input node
+ */
+TVM_DLL void Dump(const NodeRef& node);
+
 // definition of Node.
 /*!
  * \brief An iteration variable representing an iteration
diff --git a/src/lang/expr.cc b/src/lang/expr.cc
index 684211079e94..c2dab10c26d5 100644
--- a/src/lang/expr.cc
+++ b/src/lang/expr.cc
@@ -47,6 +47,10 @@ std::ostream& operator<<(std::ostream& os, const NodeRef& n) {  // NOLINT(*)
   return os;
 }
 
+void Dump(const NodeRef& n) {
+  std::cerr << n << "\n";
+}
+
 Var var(const std::string& name_hint, Type t) {
   return Var(name_hint, t);
 }

From fc5ec0fc0992a252cd57bdfaba9f42aee26bd175 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Wed, 12 Sep 2018 21:43:12 +0530
Subject: [PATCH 086/529] Bugfix #1692. Constant folding and result comparision
 allowance. (#1708)

---
 topi/python/topi/nn/upsampling.py         |  5 ++--
 topi/tests/python/test_topi_upsampling.py | 29 ++++++++++++++++-------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py
index 55f7844319f3..757d8fe674c2 100644
--- a/topi/python/topi/nn/upsampling.py
+++ b/topi/python/topi/nn/upsampling.py
@@ -1,6 +1,7 @@
 """TVM operator upsampling compute."""
 from __future__ import absolute_import
 import topi
+from ..util import simplify
 
 
 def upsampling(data, scale, layout="NCHW", method='NEAREST_NEIGHBOR'):
@@ -31,9 +32,9 @@ def upsampling(data, scale, layout="NCHW", method='NEAREST_NEIGHBOR'):
     """
 
     if layout == "NCHW":
-        out_shape = (data.shape[2] * scale, data.shape[3] * scale)
+        out_shape = (simplify(data.shape[2] * scale), simplify(data.shape[3] * scale))
     elif layout == "NHWC":
-        out_shape = (data.shape[1] * scale, data.shape[2] * scale)
+        out_shape = (simplify(data.shape[1] * scale), simplify(data.shape[2] * scale))
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py
index 3affc30a0722..ec657d490fb6 100644
--- a/topi/tests/python/test_topi_upsampling.py
+++ b/topi/tests/python/test_topi_upsampling.py
@@ -5,7 +5,7 @@
 import topi.testing
 import math
 
-def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCHW'):
+def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCHW', method="NEAREST_NEIGHBOR"):
 
 
     if layout == 'NCHW':
@@ -22,9 +22,13 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCH
         raise NotImplementedError(
             'Layout not supported {} '.format(layout))
 
-    B = topi.nn.upsampling(A, scale, layout=layout)
+    B = topi.nn.upsampling(A, scale, layout=layout, method=method)
 
-    b_np = topi.testing.upsampling_python(a_np, scale, layout)
+    if method == "BILINEAR":
+        out_size = (in_height*scale, in_width*scale)
+        b_np = topi.testing.bilinear_resize_python(a_np, out_size, layout)
+    else:
+        b_np = topi.testing.upsampling_python(a_np, scale, layout)
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -39,18 +43,27 @@ def check_device(device):
         f = tvm.build(s, [A, B], device)
         f(a, b)
 
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
     for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
         check_device(device)
 
 def test_upsampling():
-    # NCHW
+    # NEAREST_NEIGHBOR - NCHW
     verify_upsampling(8, 16, 32, 32, 2)
     verify_upsampling(12, 32, 64, 64, 3)
-    # NHWC
-    verify_upsampling(8, 16, 32, 32, 2, "NHWC")
-    verify_upsampling(12, 32, 64, 64, 3, "NHWC")
+
+    # NEAREST_NEIGHBOR - NHWC
+    verify_upsampling(8, 16, 32, 32, 2, layout="NHWC")
+    verify_upsampling(12, 32, 64, 64, 3, layout="NHWC")
+
+    # BILINEAR - NCHW
+    verify_upsampling(2, 2, 32, 32, 2, method="BILINEAR")
+    verify_upsampling(2, 2, 32, 32, 3, method="BILINEAR")
+
+    # BILINEAR - NHWC
+    verify_upsampling(2, 2, 32, 32, 2, layout="NHWC", method="BILINEAR")
+    verify_upsampling(2, 2, 32, 32, 3, layout="NHWC", method="BILINEAR")
 
 if __name__ == "__main__":
     test_upsampling()

From 2e36dc8f2310d28465535006df37a2b518ee9458 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfu.chen@icloud.com>
Date: Thu, 13 Sep 2018 00:13:33 +0800
Subject: [PATCH 087/529] avoid flaky testing errors in test_topi_sparse
 (#1706)

---
 topi/tests/python/test_topi_sparse.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/topi/tests/python/test_topi_sparse.py b/topi/tests/python/test_topi_sparse.py
index deb3a08ea01b..51c1bf3227d4 100644
--- a/topi/tests/python/test_topi_sparse.py
+++ b/topi/tests/python/test_topi_sparse.py
@@ -47,7 +47,7 @@ def check_device(device):
         assert a.indptr.dtype == A.indptr.dtype
         f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmv")
         f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     for device in ["llvm"]:
         check_device(device)
@@ -89,7 +89,7 @@ def check_device(device):
         f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
 
         f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2, atol=1e-2)
 
     for device in ["llvm"]:
         check_device(device)
@@ -127,7 +127,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
         f(a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     check_device('llvm')
 
@@ -164,7 +164,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
         f(a, b.data, b.indices, b.indptr, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     check_device('llvm')
 

From eae373c12e882bf198432d6d0854d59dc0ad804e Mon Sep 17 00:00:00 2001
From: Robin Dong <robin.k.dong@gmail.com>
Date: Fri, 14 Sep 2018 01:03:09 +0800
Subject: [PATCH 088/529] Fix comment of binary op 'elemwise_div' (#1712)

---
 nnvm/src/top/tensor/elemwise.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc
index 239f44783392..3ee52008eb1c 100644
--- a/nnvm/src/top/tensor/elemwise.cc
+++ b/nnvm/src/top/tensor/elemwise.cc
@@ -307,7 +307,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_mul)
 });
 
 NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_div)
-.describe(R"code(Element-wise multiplication
+.describe(R"code(Element-wise division
 
 )code"  NNVM_ADD_FILELINE)
 .set_support_level(1)

From 9d556d02e1ec7d21ec1bd859517c9a5b312afed4 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Thu, 13 Sep 2018 10:43:47 -0700
Subject: [PATCH 089/529] intel graphics conv2d schedule fixed for input shapes
 (300*300) and (512 * 512) (#1709)

---
 topi/python/topi/intel_graphics/conv2d.py  | 51 ++++++++++------------
 topi/tests/python/test_topi_conv2d_nchw.py |  4 ++
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index 4275bd963d10..4dae00e9c146 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -49,7 +49,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     stride = ast.literal_eval(attrs['strides'])
 
     wkl = _get_workload(data, kernel, stride, padding, data.dtype)
-    oc_bn = 16
+    oc_bn = 1
+    kernel_shape = util.get_const_tuple(kernel.shape)
+    for oc_bn in range(16, 1, -1):
+        if kernel_shape[0] % oc_bn == 0:
+            break
 
     new_attrs = {k: attrs[k] for k in attrs.keys()}
     new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn)
@@ -148,9 +152,6 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
     out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
     oshape = (batch, out_channel, out_height, out_width)
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     rc = tvm.reduce_axis((0, in_channel), name='rc')
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
@@ -190,6 +191,10 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     if not out_width % block_w == 0:
         c_w = (out_width // block_w + 1) * block_w
 
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
+    temp = pad(data, pad_before, pad_after, name="pad_temp")
+
     cshape = (batch, out_channel // nv, c_h, c_w, nv)
 
     conv = tvm.compute(
@@ -263,17 +268,8 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
     s[conv_L].compute_at(s[conv], vci)
     i, oc, h, w, vc = s[conv_L].op.axis
     rc, ry, rx = s[conv_L].op.reduce_axis
-    if in_channel == 2048:
-        rco, rci = s[conv_L].split(rc, nparts=128)
-        s[conv_L].unroll(rci)
-        s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rco)
-    else:
-        s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rc)
-    if kernel.shape[3].value != 7:
-        s[conv_L].unroll(ry)
-        s[conv_L].unroll(rx)
+    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
+    s[temp_W].compute_at(s[conv_L], rc)
     if kernel.shape[3].value != 7:
         s[conv_L].unroll(ry)
         s[conv_L].unroll(rx)
@@ -396,9 +392,6 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
     out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
     oshape = (batch, out_channel, out_height, out_width)
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     rc = tvm.reduce_axis((0, in_channel), name='rc')
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
@@ -432,13 +425,21 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     c_h = out_height
     c_w = out_width
 
+    if not out_width % block_w == 0:
+        c_w = (out_width // block_w + 1) * block_w
+
     if not out_height % block_h == 0:
         c_h = (out_height // block_h + 1) * block_h
 
-    if not out_width % block_w == 0:
-        c_w = (out_width // block_w + 1) * block_w
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
+    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     nv = 16
+    if not num_filter % nv == 0:
+        num_filter = (num_filter // nv + 1) * nv
+        out_channel = num_filter
+
     cshape = (batch, out_channel // nv, c_h, c_w, nv)
     kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)
 
@@ -520,14 +521,8 @@ def _schedule_cl_spatialpack(s, op):
     s[conv_L].compute_at(s[conv], vci)
     i, oc, h, w, vc = s[conv_L].op.axis
     rc, ry, rx = s[conv_L].op.reduce_axis
-    if in_channel == 2048:
-        rco, rci = s[conv_L].split(rc, nparts=128)
-        s[conv_L].unroll(rci)
-        s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rco)
-    else:
-        s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rc)
+    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
+    s[temp_W].compute_at(s[conv_L], rc)
     if kernel.shape[3].value != 7:
         s[conv_L].unroll(ry)
         s[conv_L].unroll(rx)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index fb27246aa572..f65832a14bdb 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -161,6 +161,10 @@ def test_conv2d_nchw():
     verify_conv2d_nchw(1, 2048,   8, 384, 1, 1, 0)
     verify_conv2d_nchw(1, 2048,   8, 448, 1, 1, 0)
     verify_conv2d_nchw(1, 2048,   8, 192, 1, 1, 0)
+    verify_conv2d_nchw(1, 1024,  19,  84, 3, 1, 1)
+    verify_conv2d_nchw(1, 2048,  10, 126, 3, 1, 1)
+    verify_conv2d_nchw(1,  512,   5, 126, 3, 1, 1)
+    verify_conv2d_nchw(1,  256,   3, 126, 3, 1, 1)
 
 
 if __name__ == "__main__":

From de67f0788b1a29bd59cb45a1332bc8334228518a Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 13 Sep 2018 23:14:32 +0530
Subject: [PATCH 090/529] [NNVM]Tensorflow and Onnx basic ops (#1666)

---
 nnvm/python/nnvm/frontend/onnx.py             |  24 +++-
 nnvm/python/nnvm/frontend/tensorflow.py       |  20 ++++
 .../python/frontend/onnx/test_forward.py      | 106 ++++++++++++++++++
 .../frontend/tensorflow/test_forward.py       |  64 +++++++++++
 4 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index ed885dfcd874..22602d7483f0 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -577,6 +577,26 @@ def _impl_v1(cls, inputs, attr, params):
         attr = {'a_min':0, 'a_max':1}
         return AttrCvt(op_name='clip')([transformX], attr)
 
+class ArgMax(OnnxOpConverter):
+    """ Operator converter for ArgMax.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        axis = attr.get('axis', 0)
+        keepdims = attr.get('keepdims', True)
+        attr = {'axis':axis, 'keepdims':keepdims}
+        return AttrCvt(op_name='argmax')(inputs, attr)
+
+class ArgMin(OnnxOpConverter):
+    """ Operator converter for ArgMin.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        axis = attr.get('axis', 0)
+        keepdims = attr.get('keepdims', True)
+        attr = {'axis':axis, 'keepdims':keepdims}
+        return AttrCvt(op_name='argmin')(inputs, attr)
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -673,8 +693,8 @@ def _get_convert_map(opset):
         # 'ReduceMean'
         # 'ReduceProd'
         # 'ReduceLogSumExp'
-        # 'ArgMax'
-        # 'ArgMin'
+        'ArgMax': ArgMax.get_converter(opset),
+        'ArgMin': ArgMin.get_converter(opset),
 
         # defs/tensor
         'Cast': Cast.get_converter(opset),
diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index d9406601ded4..ab5664678fb6 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -650,6 +650,7 @@ def _impl(inputs, attr, params):
             ignores=['Tpaddings'],)(new_inputs, attr)
     return _impl
 
+
 def _transpose():
     def _impl(inputs, attr, params):
         # If perm is not specified, axes is left empty,
@@ -680,6 +681,19 @@ def _impl(inputs, attr, params):
         return _sym.Variable(name=name, shape=params[name].shape)
     return _impl
 
+def _elu():
+    def _impl(inputs, attr, params):
+        alpha = 1.0
+        return -alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0])
+    return _impl
+
+def _selu():
+    def _impl(inputs, attr, params):
+        alpha = 1.6732632423543772848170429916717
+        gamma = 1.0507009873554804934193349852946
+        return gamma * (-alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0]))
+    return _impl
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -695,12 +709,15 @@ def _impl(inputs, attr, params):
     'BatchNormWithGlobalNormalization'  : _batch_norm(),
     'BiasAdd'                           : _bias_add(),
     'Cast'                              : _cast(),
+    'Ceil'                              : AttrCvt('ceil'),
     'CheckNumerics'                     : _check_numerics(),
     'Concat'                            : _concat(),
     'ConcatV2'                          : _concatV2(),
     'Conv2D'                            : _conv('conv'),
     'DecodeJpeg'                        : _decode_image(),
+    'Elu'                               : _elu(),
     'ExpandDims'                        : _expand_dims(),
+    'Floor'                             : AttrCvt('floor'),
     'Identity'                          : _identity(),
     'MatMul'                            : _matmul(),
     'MaxPool'                           : _pooling('max_pool'),
@@ -712,9 +729,11 @@ def _impl(inputs, attr, params):
     'Sum'                               : _sum(),
     'Square'                            : _square(),
     'Pack'                              : _pack(),
+    'LeakyRelu'                         : AttrCvt('leaky_relu'),
     'Relu'                              : AttrCvt('relu'),
     'Reshape'                           : _reshape(),
     'ResizeBilinear'                    : _resize_bilinear(),
+    'Selu'                              : _selu(),
     'Softmax'                           : AttrCvt('softmax', {'axis': ('axis', 1)}),
     'Rsqrt'                             : _rsqrt(),
     'Squeeze'                           : _squeeze(),
@@ -732,6 +751,7 @@ def _impl(inputs, attr, params):
     'Range'                             : _range(),
     'Rank'                              : _rank(),
     'Transpose'                         : _transpose(),
+    'Tanh'                              : AttrCvt('tanh'),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 5e199b4526b0..7939796ae683 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -548,6 +548,111 @@ def test_forward_hardsigmoid():
     verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6)
     verify_hardsigmoid((20, 20), 0.3, 0.4)
 
+def verify_argmin(input_dim, axis=None, keepdims=None):
+    def _argmin_numpy(data, axis=0, keepdims=True):
+        result = np.argmin(data, axis=axis)
+        if (keepdims == 1):
+            result = np.expand_dims(result, axis)
+        return result.astype(data.dtype)
+
+    a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
+    if keepdims is None and axis is None:
+        b_np = _argmin_numpy(a_np1)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'])
+    elif axis is None:
+        b_np = _argmin_numpy(a_np1, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     keepdims=keepdims)
+    elif keepdims is None:
+        b_np = _argmin_numpy(a_np1, axis=axis)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis)
+    else:
+        b_np = _argmin_numpy(a_np1, axis=axis, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis,
+                                     keepdims=keepdims)
+    graph = helper.make_graph([node],
+                              "argmin_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.INT32, list(a_np1.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.INT32, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='argmin_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def verify_argmax(input_dim, axis=None, keepdims=None):
+    def _argmax_numpy(data, axis=0, keepdims=True):
+        result = np.argmax(data, axis=axis)
+        if (keepdims == 1):
+            result = np.expand_dims(result, axis)
+        return result.astype(data.dtype)
+
+    a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
+
+    if keepdims is None and axis is None:
+        b_np = _argmax_numpy(a_np1)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'])
+    elif axis is None:
+        b_np = _argmax_numpy(a_np1, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     keepdims=keepdims)
+    elif keepdims is None:
+        b_np = _argmax_numpy(a_np1, axis=axis)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis)
+    else:
+        b_np = _argmax_numpy(a_np1, axis=axis, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis,
+                                     keepdims=keepdims)
+
+    graph = helper.make_graph([node],
+                              "argmax_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.INT32, list(a_np1.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.INT32, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='argmax_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_arg_min_max():
+    '''Verify argmin and argmax'''
+    verify_argmin([3,4,4])
+    verify_argmax([3,4,4])
+    verify_argmin([3,4,4], axis=1)
+    verify_argmax([3,4,4], axis=0)
+    verify_argmin([3,4,4], keepdims=0)
+    verify_argmax([3,4,4], keepdims=1)
+    for axis in [0,1,2]:
+        for keepdims in [True,False]:
+            verify_argmin([3,4,4], axis, keepdims)
+            verify_argmax([3,4,4], axis, keepdims)
+
 if __name__ == '__main__':
     # verify_super_resolution_example()
     # verify_squeezenet1_1()
@@ -570,3 +675,4 @@ def test_forward_hardsigmoid():
     test_forward_max()
     test_forward_mean()
     test_forward_hardsigmoid()
+    test_forward_arg_min_max()
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index af69a0549ed6..616259504a67 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -877,6 +877,63 @@ def test_forward_transpose():
     _test_forward_transpose((2, 3, 4), (0, 1, 2))
     _test_forward_transpose((2, 3, 4, 5), (3, 0, 1, 2))
 
+
+def test_forward_ceil():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.ceil(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Ceil:0')
+
+def test_forward_floor():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.floor(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Floor:0')
+
+def test_forward_relu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.relu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Relu:0')
+
+def test_forward_leaky_relu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.leaky_relu(in1, alpha=0.4)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu:0')
+
+def test_forward_elu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.elu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Elu:0')
+
+def test_forward_selu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.selu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Selu:0')
+
+def test_forward_tanh():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.tanh(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Tanh:0')
+
 #######################################################################
 # Main
 # ----
@@ -905,3 +962,10 @@ def test_forward_transpose():
     test_forward_ptb()
     test_forward_lrn()
     test_forward_l2_normalize()
+    test_forward_ceil()
+    test_forward_floor()
+    test_forward_relu()
+    test_forward_leaky_relu()
+    test_forward_elu()
+    test_forward_selu()
+    test_forward_tanh()

From d7f82746ba977716dd924ba9feaf78d0620282ae Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 14 Sep 2018 09:17:55 -0700
Subject: [PATCH 091/529] [NNVM][KERAS] Fix keras model converter and improve
 tutorial (#1716)

---
 nnvm/python/nnvm/frontend/keras.py    |  5 ++-
 tutorials/autotvm/tune_conv2d_cuda.py |  6 ++--
 tutorials/nnvm/deploy_ssd.py          |  3 +-
 tutorials/nnvm/from_coreml.py         | 18 ++++++-----
 tutorials/nnvm/from_darknet.py        | 45 +++++++++------------------
 tutorials/nnvm/from_keras.py          | 25 +++++++--------
 tutorials/nnvm/from_mxnet.py          | 15 +++++----
 tutorials/nnvm/from_onnx.py           | 11 ++++---
 tutorials/nnvm/from_tensorflow.py     |  4 +--
 tutorials/nnvm_quick_start.py         | 10 +++---
 10 files changed, 67 insertions(+), 75 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index eb3bb0d01ea5..eb9bf4d3720d 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -180,7 +180,10 @@ def _convert_convolution(insym, keras_layer, symtab):
         in_w = keras_layer.input_shape[2]
         pad_t, pad_b = _get_pad_pair(in_h, kernel_h, stride_h)
         pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w)
-        insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
+        if pad_t == pad_b and pad_l == pad_r:
+            params['padding'] = (pad_t, pad_l)
+        else:
+            insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
     else:
         raise TypeError("Unsupported padding type : {}".format(keras_layer.padding))
     if is_deconv:
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index b541e4310df0..f2ee98a61f66 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -169,14 +169,16 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
                            target='cuda')
 print(task.config_space)
 
-# use local gpu, measure 10 times for every config to reduce variance
+# Use local gpu, measure 10 times for every config to reduce variance
 # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
 measure_option = autotvm.measure_option(
     builder=autotvm.LocalBuilder(),
     runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
 )
 
-# begin tuning, log records to file `conv2d.log`
+# Begin tuning, log records to file `conv2d.log`
+# During tuning we will also try many invalid configs, so you are expected to
+# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
 tuner = autotvm.tuner.XGBTuner(task)
 tuner.tune(n_trial=20,
            measure_option=measure_option,
diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py
index 58725a3c92d3..26591b86c692 100644
--- a/tutorials/nnvm/deploy_ssd.py
+++ b/tutorials/nnvm/deploy_ssd.py
@@ -94,8 +94,7 @@
 # execute
 m.run()
 # get outputs
-_, oshape = compiler.graph_util.infer_shape(graph, shape={"data": dshape})
-tvm_output = m.get_output(0, tvm.nd.empty(tuple(oshape[0]), dtype))
+tvm_output = m.get_output(0)
 
 
 ######################################################################
diff --git a/tutorials/nnvm/from_coreml.py b/tutorials/nnvm/from_coreml.py
index 3cf8babe6418..1c958746247b 100644
--- a/tutorials/nnvm/from_coreml.py
+++ b/tutorials/nnvm/from_coreml.py
@@ -8,9 +8,11 @@
 For us to begin with, coremltools module is required to be installed.
 
 A quick solution is to install via pip
-```bash
-pip install -U coremltools --user
-```
+
+.. code-block:: bash
+
+    pip install -U coremltools --user
+
 or please refer to official site
 https://github.com/apple/coremltools
 """
@@ -65,7 +67,8 @@ def download(url, path, overwrite=False):
 import nnvm.compiler
 target = 'cuda'
 shape_dict = {'image': x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
@@ -81,14 +84,13 @@ def download(url, path, overwrite=False):
 # execute
 m.run()
 # get outputs
-output_shape = (1000,)
-tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
-top1 = np.argmax(tvm_output)
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
 
 #####################################################################
 # Look up synset name
 # -------------------
-# Look up prdiction top 1 index in 1000 class synset.
+# Look up prediction top 1 index in 1000 class synset.
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index 2bd7f4a1748a..87ab60fc2850 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -21,14 +21,13 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import tvm
-import os
 import sys
 
 from ctypes import *
 from tvm.contrib.download import download
 from nnvm.testing.darknet import __darknetffi__
 
-#Model name
+# Model name
 MODEL_NAME = 'yolo'
 
 ######################################################################
@@ -79,30 +78,13 @@
 with nnvm.compiler.build_config(opt_level=2):
     graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype, params)
 
-#####################################################################
-# Save the JSON
-# -------------
-def save_lib():
-    #Save the graph, params and .so to the current directory
-    print("Saving the compiled output...")
-    path_name = 'nnvm_darknet_' + model_name
-    path_lib = path_name + '_deploy_lib.so'
-    lib.export_library(path_lib)
-    with open(path_name
-+ "deploy_graph.json", "w") as fo:
-        fo.write(graph.json())
-    with open(path_name
-+ "deploy_param.params", "wb") as fo:
-        fo.write(nnvm.compiler.save_param_dict(params))
-#save_lib()
-
 ######################################################################
 # Load a test image
 # --------------------------------------------------------------------
 test_image = 'dog.jpg'
 print("Loading the test image...")
 img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
-            test_image   +'?raw=true'
+          test_image + '?raw=true'
 download(img_url, test_image)
 
 data = nnvm.testing.darknet.load_image(test_image, net.w, net.h)
@@ -124,9 +106,9 @@ def save_lib():
 m.run()
 # get outputs
 out_shape = (net.outputs,)
-tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+tvm_out = m.get_output(0).asnumpy().flatten()
 
-#do the detection and bring up the bounding boxes
+# do the detection and bring up the bounding boxes
 thresh = 0.24
 hier_thresh = 0.5
 img = nnvm.testing.darknet.load_image_color(test_image)
@@ -134,16 +116,18 @@ def save_lib():
 probs = []
 boxes = []
 region_layer = net.layers[net.n - 1]
-boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h,
-                       thresh, probs, boxes, 1, tvm_out)
+boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(
+    region_layer, im_w, im_h, net.w, net.h,
+    thresh, probs, boxes, 1, tvm_out)
 
-boxes, probs = nnvm.testing.yolo2_detection.do_nms_sort(boxes, probs,
-                       region_layer.w*region_layer.h*region_layer.n, region_layer.classes, 0.3)
+boxes, probs = nnvm.testing.yolo2_detection.do_nms_sort(
+    boxes, probs,
+    region_layer.w*region_layer.h*region_layer.n, region_layer.classes, 0.3)
 
 coco_name = 'coco.names'
-coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name   +'?raw=true'
+coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true'
 font_name = 'arial.ttf'
-font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name   +'?raw=true'
+font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true'
 download(coco_url, coco_name)
 download(font_url, font_name)
 
@@ -152,7 +136,8 @@ def save_lib():
 
 names = [x.strip() for x in content]
 
-nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n,
-                 thresh, boxes, probs, names, region_layer.classes)
+nnvm.testing.yolo2_detection.draw_detections(
+    img, region_layer.w*region_layer.h*region_layer.n,
+    thresh, boxes, probs, names, region_layer.classes)
 plt.imshow(img.transpose(1, 2, 0))
 plt.show()
diff --git a/tutorials/nnvm/from_keras.py b/tutorials/nnvm/from_keras.py
index 402010b98634..5c13b8b1d30a 100644
--- a/tutorials/nnvm/from_keras.py
+++ b/tutorials/nnvm/from_keras.py
@@ -9,12 +9,12 @@
 Tensorflow is also required since it's used as the default backend of keras.
 
 A quick solution is to install via pip
-```
-pip install -U keras --user
-```
-```
-pip install -U tensorflow --user
-```
+
+.. code-block:: bash
+
+    pip install -U keras --user
+    pip install -U tensorflow --user
+
 or please refer to official site
 https://keras.io/#installation
 """
@@ -45,7 +45,7 @@ def download(url, path, overwrite=False):
 weights_file = 'resnet50_weights.h5'
 download(weights_url, weights_file)
 keras_resnet50 = keras.applications.resnet50.ResNet50(include_top=True, weights=None,
-	input_shape=(224,224,3), classes=1000)
+                                                      input_shape=(224, 224, 3), classes=1000)
 keras_resnet50.load_weights('resnet50_weights.h5')
 
 ######################################################################
@@ -75,8 +75,8 @@ def download(url, path, overwrite=False):
 # compile the model
 target = 'cuda'
 shape_dict = {'input_1': data.shape}
-with nnvm.compiler.build_config(opt_level=2):
-	graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
@@ -91,14 +91,13 @@ def download(url, path, overwrite=False):
 # execute
 m.run()
 # get outputs
-out_shape = (1000,)
-tvm_out = m.get_output(0, tvm.nd.empty(out_shape, 'float32')).asnumpy()
-top1_tvm = np.argmax(tvm_out)
+tvm_out = m.get_output(0)
+top1_tvm = np.argmax(tvm_out.asnumpy()[0])
 
 #####################################################################
 # Look up synset name
 # -------------------
-# Look up prdiction top 1 index in 1000 class synset.
+# Look up prediction top 1 index in 1000 class synset.
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
diff --git a/tutorials/nnvm/from_mxnet.py b/tutorials/nnvm/from_mxnet.py
index cce3bc37126a..78247dbe2b0a 100644
--- a/tutorials/nnvm/from_mxnet.py
+++ b/tutorials/nnvm/from_mxnet.py
@@ -10,9 +10,11 @@
 For us to begin with, mxnet module is required to be installed.
 
 A quick solution is
-```
-pip install mxnet --user
-```
+
+.. code-block:: bash
+
+    pip install mxnet --user
+
 or please refer to offical installation guide.
 https://mxnet.incubator.apache.org/versions/master/install/index.html
 """
@@ -70,7 +72,8 @@ def transform_image(image):
 import nnvm.compiler
 target = 'cuda'
 shape_dict = {'data': x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute the portable graph on TVM
@@ -86,8 +89,8 @@ def transform_image(image):
 # execute
 m.run()
 # get outputs
-tvm_output = m.get_output(0, tvm.nd.empty((1000,), dtype))
-top1 = np.argmax(tvm_output.asnumpy())
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
 print('TVM prediction top-1:', top1, synset[top1])
 
 ######################################################################
diff --git a/tutorials/nnvm/from_onnx.py b/tutorials/nnvm/from_onnx.py
index 8fb5a1048569..df8dee8272ce 100644
--- a/tutorials/nnvm/from_onnx.py
+++ b/tutorials/nnvm/from_onnx.py
@@ -8,9 +8,11 @@
 For us to begin with, onnx module is required to be installed.
 
 A quick solution is to install protobuf compiler, and
-```bash
-pip install onnx --user
-```
+
+.. code-block:: bash
+
+    pip install onnx --user
+
 or please refer to offical site.
 https://github.com/onnx/onnx
 """
@@ -69,7 +71,8 @@ def download(url, path, overwrite=False):
 # assume first input name is data
 input_name = sym.list_input_names()[0]
 shape_dict = {input_name: x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
diff --git a/tutorials/nnvm/from_tensorflow.py b/tutorials/nnvm/from_tensorflow.py
index ee025c5b09ff..f4af2381780c 100644
--- a/tutorials/nnvm/from_tensorflow.py
+++ b/tutorials/nnvm/from_tensorflow.py
@@ -5,9 +5,7 @@
 
 For us to begin with, tensorflow python module is required to be installed.
 
-A quick solution is to install tensorflow from
-
-https://www.tensorflow.org/install
+Please refer to https://www.tensorflow.org/install
 """
 
 # tvm and nnvm
diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py
index c171823604cd..e16184300e2f 100644
--- a/tutorials/nnvm_quick_start.py
+++ b/tutorials/nnvm_quick_start.py
@@ -49,8 +49,8 @@
 data_shape = (batch_size,) + image_shape
 out_shape = (batch_size, num_class)
 
-net, params = nnvm.testing.resnet.get_workload(layers=18,
-        batch_size=batch_size, image_shape=image_shape)
+net, params = nnvm.testing.resnet.get_workload(
+    layers=18, batch_size=batch_size, image_shape=image_shape)
 print(net.debug_str())
 
 ######################################################################
@@ -117,7 +117,7 @@
 from tvm.contrib import util
 
 temp = util.tempdir()
-path_lib = temp.relpath("deploy_lib.so")
+path_lib = temp.relpath("deploy_lib.tar")
 lib.export_library(path_lib)
 with open(temp.relpath("deploy_graph.json"), "w") as fo:
     fo.write(graph.json())
@@ -136,6 +136,4 @@
 module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
 module.load_params(loaded_params)
 module.run(data=input_data)
-
-out = module.get_output(0, out=tvm.nd.empty(out_shape))
-
+out = module.get_output(0).asnumpy()

From 25c216b8e50a9f8abfbed6fb04e3c0396a8277c1 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Fri, 14 Sep 2018 11:19:38 -0500
Subject: [PATCH 092/529] [TOPI] Add dp4a intrinsic to CUDA (#1707)

---
 topi/python/topi/cuda/tensor_intrin.py | 62 ++++++++++++++++++++++++++
 topi/recipe/gemm/gemm_int8.py          | 38 ++--------------
 2 files changed, 65 insertions(+), 35 deletions(-)
 create mode 100644 topi/python/topi/cuda/tensor_intrin.py

diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py
new file mode 100644
index 000000000000..26ae7587c5df
--- /dev/null
+++ b/topi/python/topi/cuda/tensor_intrin.py
@@ -0,0 +1,62 @@
+"""Tensor intrinsics on CUDA."""
+#pylint: disable=invalid-name
+import tvm
+
+
+def dp4a(x_scope='local', y_scope='local', z_scope='local'):
+    """
+    Int8 dot product reduced by every 4 elements using __dp4a
+
+    Parameters
+    ----------
+    x_scope : str, optional
+        The storage scope of buffer for lhs
+    y_scope : str, optional
+        The storage scope of buffer for rhs
+    z_scope : str, optional
+        The storage scope of buffer for result
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The dp4a TensorIntrin that can be used in tensorizing schedule.
+    """
+
+    n = 4  # dp4a requires operands packed by 4
+    x = tvm.placeholder((n,), name='x', dtype='int8')
+    y = tvm.placeholder((n,), name='y', dtype='int8')
+
+    k = tvm.reduce_axis((0, n), name='rc')
+
+    z = tvm.compute((1,), lambda i: tvm.sum(
+        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            xx, yy = ins
+            zz = outs[0]
+
+            if index == 1:
+                return zz.vstore(0, 0)
+
+            ib = tvm.ir_builder.create()
+
+            vec_x = xx.vload(0, dtype='int8x4')
+            vec_y = yy.vload(0, dtype='int8x4')
+            prev_z = 0 if index == 0 else zz.vload(0)
+
+            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
+            ib.emit(zz.vstore(0, new_z))
+
+            return ib.get()
+
+        return _instr(0), _instr(1), _instr(2) # body, reset, update
+
+    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
+        scopes = {x: x_scope, y: y_scope, z: z_scope}
+        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
+                                    data_alignment=cfg.data_alignment,
+                                    offset_factor=cfg.offset_factor,
+                                    scope=scopes[t]) for t in [x, y, z]}
+
+        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index 4cce2735c4a2..ed735dad9cd9 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -4,44 +4,12 @@
 import numpy as np
 import tvm
 from tvm import autotvm
+from topi.cuda.tensor_intrin import dp4a
 
 DO_TUNING = True
 PRETUNED_INDEX = 75333
 
-def intrin_dot():
-    n = 4  # dp4a requires operands packed by 4
-    x = tvm.placeholder((n,), name='x', dtype='int8')
-    y = tvm.placeholder((n,), name='y', dtype='int8')
-    k = tvm.reduce_axis((0, n), name='k')
-
-    z = tvm.compute(
-        (1,), lambda _: tvm.sum(
-            x[k].astype('int32') * y[k].astype('int32'), axis=k))
-
-    def intrin_func(ins, outs):
-        xx, yy = ins
-        zz = outs[0]
-        ib = tvm.ir_builder.create()
-
-        dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
-                                                 xx.vload(0, dtype='int8x4'),
-                                                 yy.vload(0, dtype='int8x4'),
-                                                 zz.vload(0)))
-        ib.emit(dp4a)
-
-        body = ib.get()
-        return body, zz.vstore(0, 0), body
-
-    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
-        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
-                                    data_alignment=cfg.data_alignment,
-                                    offset_factor=cfg.offset_factor,
-                                    scope='local') for t in [x, y, z]}
-        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
-
-
-dot = intrin_dot()
-
+intrin_dp4a = dp4a('local', 'local', 'local')
 
 @autotvm.template
 def gemm_int8(n, m, l):
@@ -70,7 +38,7 @@ def gemm_int8(n, m, l):
 
     ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
 
-    s[CC].tensorize(ki, dot)
+    s[CC].tensorize(ki, intrin_dp4a)
 
     block_x = tvm.thread_axis('blockIdx.x')
     block_y = tvm.thread_axis('blockIdx.y')

From fe7003135bac5ed6b96dbbbffd26cbc1f583b5c0 Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Fri, 14 Sep 2018 19:26:34 +0300
Subject: [PATCH 093/529] Add demo_android Dockerfile (#1646)

---
 apps/android_deploy/README.md               | 14 +++--
 apps/android_deploy/build.gradle            |  5 +-
 apps/android_deploy/dev_tools/sign_apk.sh   |  2 +-
 apps/android_rpc/build.gradle               |  5 +-
 apps/android_rpc/dev_tools/sign_apk.sh      |  2 +-
 docker/Dockerfile.demo_android              | 36 +++++++++++
 docker/bash.sh                              |  2 +-
 docker/build.sh                             |  2 +-
 docker/install/ubuntu_install_androidsdk.sh | 69 +++++++++++++++++++++
 docker/install/ubuntu_install_gradle.sh     | 17 +++++
 docker/install/ubuntu_install_java.sh       |  3 +
 11 files changed, 147 insertions(+), 10 deletions(-)
 create mode 100644 docker/Dockerfile.demo_android
 create mode 100644 docker/install/ubuntu_install_androidsdk.sh
 create mode 100644 docker/install/ubuntu_install_gradle.sh

diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
index 801ca8bdf95c..2c2951b5332d 100644
--- a/apps/android_deploy/README.md
+++ b/apps/android_deploy/README.md
@@ -2,14 +2,21 @@
 
 This folder contains Android Demo app that allows us to show how to deploy model using TVM runtime api on a Android phone.
 
-You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
+You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this. Make sure the `ANDROID_HOME` variable already points to your Android SDK folder or set it using `export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]`. We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
+
+Alternatively, you may execute Docker image we provide wich contains the required packages. Use the command below to build the image and enter interactive session. Note, that building with OpenCL was not tested from Docker.
+
+```bash
+./docker/build.sh demo_android -it bash
+(docker) $ echo $ANDROID_HOME
+(docker) /opt/android-sdk-linux
+```
+
 
 ## Build and Installation
 
 ### Build APK
 
-We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
-
 Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary.
 
 ```
@@ -48,7 +55,6 @@ USE_OPENCL = 0
 Now use Gradle to compile JNI, resolve Java dependencies and build the Android application together with tvm4j. Run following script to generate the apk file.
 
 ```bash
-export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
 cd apps/android_deploy
 gradle clean build
 ```
diff --git a/apps/android_deploy/build.gradle b/apps/android_deploy/build.gradle
index f7bbe2641c9d..1eeb9d686cfb 100644
--- a/apps/android_deploy/build.gradle
+++ b/apps/android_deploy/build.gradle
@@ -3,9 +3,12 @@
 buildscript {
     repositories {
         jcenter()
+        maven {
+            url 'https://maven.google.com'
+        }
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.3'
+        classpath 'com.android.tools.build:gradle:3.1.0'
         classpath 'org.apache.httpcomponents:httpclient:4.5.4'
 
         // NOTE: Do not place your application dependencies here; they belong
diff --git a/apps/android_deploy/dev_tools/sign_apk.sh b/apps/android_deploy/dev_tools/sign_apk.sh
index 314f82cdb76c..fd8cee6b927a 100644
--- a/apps/android_deploy/dev_tools/sign_apk.sh
+++ b/apps/android_deploy/dev_tools/sign_apk.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 CURR_DIR=$(cd `dirname $0`; pwd)
-APK_DIR=$CURR_DIR/../app/build/outputs/apk
+APK_DIR=$CURR_DIR/../app/build/outputs/apk/release
 UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
 SIGNED_APK=$APK_DIR/tvmdemo-release.apk
 jarsigner -verbose -keystore $CURR_DIR/tvmdemo.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmdemo'
diff --git a/apps/android_rpc/build.gradle b/apps/android_rpc/build.gradle
index f13b8fc9a728..08140708d5ef 100644
--- a/apps/android_rpc/build.gradle
+++ b/apps/android_rpc/build.gradle
@@ -3,9 +3,12 @@
 buildscript {
     repositories {
         jcenter()
+        maven {
+            url 'https://maven.google.com'
+        }
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.3'
+        classpath 'com.android.tools.build:gradle:3.1.0'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
diff --git a/apps/android_rpc/dev_tools/sign_apk.sh b/apps/android_rpc/dev_tools/sign_apk.sh
index f52faff4d074..7dc6480f4bca 100755
--- a/apps/android_rpc/dev_tools/sign_apk.sh
+++ b/apps/android_rpc/dev_tools/sign_apk.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 CURR_DIR=$(cd `dirname $0`; pwd)
-APK_DIR=$CURR_DIR/../app/build/outputs/apk
+APK_DIR=$CURR_DIR/../app/build/outputs/apk/release
 UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
 SIGNED_APK=$APK_DIR/tvmrpc-release.apk
 jarsigner -verbose -keystore $CURR_DIR/tvmrpc.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmrpc'
diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
new file mode 100644
index 000000000000..2adcdb42f4e4
--- /dev/null
+++ b/docker/Dockerfile.demo_android
@@ -0,0 +1,36 @@
+# Minimum docker image for demo purposes
+FROM ubuntu:16.04
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+
+COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
+RUN bash /install/ubuntu_install_python_package.sh
+
+COPY install/ubuntu_install_keras.sh /install/ubuntu_install_keras.sh
+RUN bash /install/ubuntu_install_keras.sh
+
+COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
+RUN bash /install/ubuntu_install_java.sh
+
+COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
+RUN bash /install/ubuntu_install_llvm.sh
+
+COPY install/ubuntu_install_gradle.sh /install/ubuntu_install_gradle.sh
+RUN bash /install/ubuntu_install_gradle.sh
+
+COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
+RUN bash /install/ubuntu_install_androidsdk.sh
+
+# Build TVM
+COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh
+RUN bash /install/install_tvm_cpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
+
diff --git a/docker/bash.sh b/docker/bash.sh
index ba935d7ed089..0813edd5527d 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -55,5 +55,5 @@ ${DOCKER_BINARY} run --rm --pid=host\
     -e "CI_BUILD_GID=$(id -g)" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMAGE_NAME}\
-    bash /docker/with_the_same_user \
+    bash --login /docker/with_the_same_user \
     ${COMMAND[@]}
diff --git a/docker/build.sh b/docker/build.sh
index 1d476e52e642..5b6c4450f6e4 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -127,5 +127,5 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GID=$(id -g)" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMG_NAME} \
-    bash docker/with_the_same_user \
+    bash --login docker/with_the_same_user \
     ${COMMAND[@]}
diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh
new file mode 100644
index 000000000000..a5c02e573b43
--- /dev/null
+++ b/docker/install/ubuntu_install_androidsdk.sh
@@ -0,0 +1,69 @@
+. /etc/profile
+
+set -o errexit -o nounset
+
+ANDROID_HOME=/opt/android-sdk-linux
+ASDKTOOLS_HOME=/opt/android-sdk-tools
+ASDKTOOLS_VERSION=3859397
+ASDKTOOLS_SHA256=444e22ce8ca0f67353bda4b85175ed3731cae3ffa695ca18119cbacef1c1bea0
+
+wget http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip
+echo "${ASDKTOOLS_SHA256} *sdk-tools-linux.zip" | sha256sum --check -
+unzip sdk-tools-linux.zip
+rm sdk-tools-linux.zip
+mv tools "${ASDKTOOLS_HOME}/"
+# The following popular fix makes sdkmanager honour $http_proxy variables
+mv ${ASDKTOOLS_HOME}/bin/sdkmanager ${ASDKTOOLS_HOME}/bin/sdkmanager-vanilla
+cat >${ASDKTOOLS_HOME}/bin/sdkmanager <<"EOF"
+#!/bin/sh
+if test -n "$http_proxy"; then
+  PROXY_HOST=`echo $http_proxy | sed 's@.*//\(.*\):.*@\1@'`
+  PROXY_PORT=`echo $http_proxy | sed 's@.*//.*:\(.*\)@\1@'`
+  PROXY="--proxy=http --proxy_host=$PROXY_HOST --proxy_port=$PROXY_PORT"
+else
+  PROXY=""
+fi
+exec "`dirname $0`/sdkmanager-vanilla" $PROXY "$@"
+EOF
+for f in ${ASDKTOOLS_HOME}/bin/* ; do
+  chmod +x "$f"
+  ln --symbolic "$f" "/usr/bin/`basename $f`"
+done
+
+
+cat >/install/package-list-minimal.txt <<EOF
+build-tools;26.0.3
+build-tools;27.0.3
+cmake;3.6.4111459
+emulator
+extras;android;gapid;1
+extras;android;gapid;3
+extras;android;m2repository
+extras;google;auto
+extras;google;google_play_services
+extras;google;instantapps
+extras;google;m2repository
+extras;google;market_apk_expansion
+extras;google;market_licensing
+extras;google;simulators
+extras;google;webdriver
+extras;m2repository;com;android;support;constraint;constraint-layout;1.0.2
+extras;m2repository;com;android;support;constraint;constraint-layout-solver;1.0.2
+lldb;2.3
+platforms;android-26
+platforms;android-27
+tools
+ndk-bundle
+EOF
+
+mkdir /root/.android 2>/dev/null || true
+touch /root/.android/repositories.cfg
+yes | sdkmanager --licenses --sdk_root="$ANDROID_HOME"
+sdkmanager --verbose --package_file=/install/package-list-minimal.txt --sdk_root="$ANDROID_HOME"
+test -d "${ANDROID_HOME}/build-tools/27.0.3"
+test -d "${ANDROID_HOME}/ndk-bundle"
+for f in ${ANDROID_HOME}/ndk-bundle/* ; do
+  ln --symbolic "$f" "/usr/bin/`basename $f`"
+done
+echo "export ANDROID_HOME=${ANDROID_HOME}" >> /etc/profile
+
diff --git a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh
new file mode 100644
index 000000000000..b1535c98cabb
--- /dev/null
+++ b/docker/install/ubuntu_install_gradle.sh
@@ -0,0 +1,17 @@
+. /etc/profile
+
+set -o errexit -o nounset
+
+GRADLE_HOME=/opt/gradle
+GRADLE_VERSION=4.10-rc-2
+GRADLE_SHA256=e90d3c32910e259814bcca82b3911172ecca1ff1ab5ed69b4de3c1df8b378b40
+
+echo "Downloading Gradle"
+wget --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip"
+echo "Checking Gradle hash"
+echo "${GRADLE_SHA256} *gradle.zip" | sha256sum --check -
+echo "Installing Gradle"
+unzip gradle.zip
+rm gradle.zip
+mv "gradle-${GRADLE_VERSION}" "${GRADLE_HOME}/"
+ln --symbolic "${GRADLE_HOME}/bin/gradle" /usr/bin/gradle
diff --git a/docker/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh
index ba07b2985efb..462edc491627 100644
--- a/docker/install/ubuntu_install_java.sh
+++ b/docker/install/ubuntu_install_java.sh
@@ -1 +1,4 @@
+set -o errexit -o nounset
 apt-get update && apt-get install -y openjdk-8-jdk maven
+test -d "/usr/lib/jvm/java-8-openjdk-amd64/jre"
+echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /etc/profile

From bd416ae5389420c7281cadae2664f6e23d6fe990 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Fri, 14 Sep 2018 22:26:11 +0530
Subject: [PATCH 094/529] [FRONTEND][TENSORFLOW] Helper function to add shapes
 into the graph. Use tmp folder for model files and clean it. (#1697)

---
 nnvm/python/nnvm/testing/tf.py    | 35 +++++++++++++++++++++++++++++--
 tutorials/nnvm/from_tensorflow.py |  4 ++--
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/nnvm/python/nnvm/testing/tf.py b/nnvm/python/nnvm/testing/tf.py
index 0372d7450586..f5b49b2280b4 100644
--- a/nnvm/python/nnvm/testing/tf.py
+++ b/nnvm/python/nnvm/testing/tf.py
@@ -8,6 +8,7 @@
 import os.path
 import collections
 import numpy as np
+from tvm.contrib import util
 
 # Tensorflow imports
 import tensorflow as tf
@@ -43,6 +44,31 @@ def ProcessGraphDefParam(graph_def):
             raise TypeError('graph_def must be a GraphDef proto.')
     return graph_def
 
+
+def AddShapesToGraphDef(out_node):
+    """ Add shapes attribute to nodes of the graph.
+        Input graph here is the default graph in context.
+
+    Parameters
+    ----------
+    out_node: String
+        Final output node of the graph.
+
+    Returns
+    -------
+    graph_def : Obj
+        tensorflow graph definition with shapes attribute added to nodes.
+
+    """
+
+    with tf.Session() as sess:
+        graph_def = tf.graph_util.convert_variables_to_constants(
+            sess,
+            sess.graph.as_graph_def(add_shapes=True),
+            [out_node],
+            )
+        return graph_def
+
 class NodeLookup(object):
     """Converts integer node ID's to human readable labels."""
 
@@ -128,13 +154,18 @@ def get_workload(model_path):
     model_url = os.path.join(repo_base, model_path)
 
     from mxnet.gluon.utils import download
-    download(model_url, model_name)
+
+    temp = util.tempdir()
+    path_model = temp.relpath(model_name)
+
+    download(model_url, path_model)
 
     # Creates graph from saved graph_def.pb.
-    with tf.gfile.FastGFile(os.path.join("./", model_name), 'rb') as f:
+    with tf.gfile.FastGFile(path_model, 'rb') as f:
         graph_def = tf.GraphDef()
         graph_def.ParseFromString(f.read())
         graph = tf.import_graph_def(graph_def, name='')
+        temp.remove()
         return graph_def
 
 #######################################################################
diff --git a/tutorials/nnvm/from_tensorflow.py b/tutorials/nnvm/from_tensorflow.py
index f4af2381780c..033cdd8a4cab 100644
--- a/tutorials/nnvm/from_tensorflow.py
+++ b/tutorials/nnvm/from_tensorflow.py
@@ -62,7 +62,6 @@
 download(map_proto_url, map_proto)
 download(lable_map_url, lable_map)
 
-
 ######################################################################
 # Import model
 # ------------
@@ -74,7 +73,8 @@
     graph = tf.import_graph_def(graph_def, name='')
     # Call the utility to import the graph definition into default graph.
     graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
-
+    # Add shapes to the graph.
+    graph_def = nnvm.testing.tf.AddShapesToGraphDef('softmax')
 
 ######################################################################
 # Decode image

From 8078eeb9fc55b438a1bde2fa635db2c65715d2e0 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 14 Sep 2018 10:59:22 -0700
Subject: [PATCH 095/529] [RUNTIME] Support TVMContext (#1720)

---
 include/tvm/runtime/packed_func.h             |  5 ++++
 python/tvm/_ffi/_ctypes/function.py           |  4 +--
 python/tvm/_ffi/_ctypes/types.py              | 26 ++++++++++++++++---
 src/api/api_test.cc                           | 10 +++++++
 .../unittest/test_runtime_packed_func.py      | 11 ++++++++
 5 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index b7351274a350..dc7475d3bff1 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -646,6 +646,11 @@ class TVMRetValue : public TVMPODValue_ {
     value_.v_int64 = value;
     return *this;
   }
+  TVMRetValue& operator=(TVMContext value) {
+    this->SwitchToPOD(kTVMContext);
+    value_.v_ctx = value;
+    return *this;
+  }
   TVMRetValue& operator=(TVMType t) {
     this->SwitchToPOD(kTVMType);
     value_.v_type = t;
diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py
index 61679f0018c0..3c2a7a5f8c9b 100644
--- a/python/tvm/_ffi/_ctypes/function.py
+++ b/python/tvm/_ffi/_ctypes/function.py
@@ -15,7 +15,7 @@
 from .ndarray import NDArrayBase, _make_array
 from .types import TVMValue, TypeCode
 from .types import TVMPackedCFunc, TVMCFuncFinalizer
-from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
+from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64
 from .node import NodeBase
 from . import node as _node
 
@@ -110,7 +110,7 @@ def _make_tvm_args(args, temp_args):
             values[i].v_str = c_str(str(arg))
             type_codes[i] = TypeCode.STR
         elif isinstance(arg, TVMContext):
-            values[i].v_ctx = arg
+            values[i].v_int64 = _ctx_to_int64(arg)
             type_codes[i] = TypeCode.TVM_CONTEXT
         elif isinstance(arg, bytearray):
             arr = TVMByteArray()
diff --git a/python/tvm/_ffi/_ctypes/types.py b/python/tvm/_ffi/_ctypes/types.py
index 08337b08b521..b3fcad9cfefb 100644
--- a/python/tvm/_ffi/_ctypes/types.py
+++ b/python/tvm/_ffi/_ctypes/types.py
@@ -3,8 +3,9 @@
 from __future__ import absolute_import as _abs
 
 import ctypes
+import struct
 from ..base import py_str, check_call, _LIB
-from ..runtime_ctypes import TVMByteArray, TypeCode
+from ..runtime_ctypes import TVMByteArray, TypeCode, TVMContext
 
 class TVMValue(ctypes.Union):
     """TVMValue in C API"""
@@ -36,7 +37,7 @@ def _return_handle(x):
     return handle
 
 def _return_bytes(x):
-    """return handle"""
+    """return bytes"""
     handle = x.v_handle
     if not isinstance(handle, ctypes.c_void_p):
         handle = ctypes.c_void_p(handle)
@@ -48,6 +49,15 @@ def _return_bytes(x):
         raise RuntimeError('memmove failed')
     return res
 
+def _return_context(value):
+    """return TVMContext"""
+    # use bit unpacking from int64 view
+    # We use this to get around ctypes issue on Union of Structure
+    data = struct.pack("=q", value.v_int64)
+    arr = struct.unpack("=ii", data)
+    return TVMContext(arr[0], arr[1])
+
+
 def _wrap_arg_func(return_f, type_code):
     tcode = ctypes.c_int(type_code)
     def _wrap_func(x):
@@ -55,13 +65,20 @@ def _wrap_func(x):
         return return_f(x)
     return _wrap_func
 
+def _ctx_to_int64(ctx):
+    """Pack context into int64 in native endian"""
+    data = struct.pack("=ii", ctx.device_type, ctx.device_id)
+    return struct.unpack("=q", data)[0]
+
+
 RETURN_SWITCH = {
     TypeCode.INT: lambda x: x.v_int64,
     TypeCode.FLOAT: lambda x: x.v_float64,
     TypeCode.HANDLE: _return_handle,
     TypeCode.NULL: lambda x: None,
     TypeCode.STR: lambda x: py_str(x.v_str),
-    TypeCode.BYTES: _return_bytes
+    TypeCode.BYTES: _return_bytes,
+    TypeCode.TVM_CONTEXT: _return_context
 }
 
 C_TO_PY_ARG_SWITCH = {
@@ -70,5 +87,6 @@ def _wrap_func(x):
     TypeCode.HANDLE: _return_handle,
     TypeCode.NULL: lambda x: None,
     TypeCode.STR: lambda x: py_str(x.v_str),
-    TypeCode.BYTES: _return_bytes
+    TypeCode.BYTES: _return_bytes,
+    TypeCode.TVM_CONTEXT: _return_context
 }
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
index 1744267fdcd7..7a2ae638a038 100644
--- a/src/api/api_test.cc
+++ b/src/api/api_test.cc
@@ -35,6 +35,16 @@ TVM_REGISTER_API("_nop")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
   });
 
+TVM_REGISTER_API("_context_test")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    DLContext ctx = args[0];
+    int dtype = args[1];
+    int did = args[2];
+    CHECK_EQ(static_cast<int>(ctx.device_type), dtype);
+    CHECK_EQ(static_cast<int>(ctx.device_id), did);
+    *ret = ctx;
+  });
+
 // internal fucntion used for debug and testing purposes
 TVM_REGISTER_API("_ndarray_use_count")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py
index 279172555d2a..2d7d0197640b 100644
--- a/tests/python/unittest/test_runtime_packed_func.py
+++ b/tests/python/unittest/test_runtime_packed_func.py
@@ -70,6 +70,16 @@ def myfunc(ss):
     tvm.convert(myfunc)(x)
 
 
+def test_ctx():
+    def test_ctx_func(ctx):
+        assert tvm.gpu(7) == ctx
+        return tvm.cpu(0)
+    x = test_ctx_func(tvm.gpu(7))
+    assert x == tvm.cpu(0)
+    x = tvm.opencl(10)
+    x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
+    assert x == tvm.opencl(10)
+
 if __name__ == "__main__":
     test_empty_array()
     test_get_global()
@@ -77,3 +87,4 @@ def myfunc(ss):
     test_convert()
     test_return_func()
     test_byte_array()
+    test_ctx()

From ebaafeba7988b5f35e0670e4def20a58ceaaf298 Mon Sep 17 00:00:00 2001
From: Zhennan Qin <zhennan.qin@intel.com>
Date: Sun, 16 Sep 2018 01:15:43 +0800
Subject: [PATCH 096/529] Allow inplace memory optimization for different data
 type (#1696)

---
 nnvm/src/pass/plan_memory.cc | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc
index 6c2fc0d087ea..e0788386e6ea 100644
--- a/nnvm/src/pass/plan_memory.cc
+++ b/nnvm/src/pass/plan_memory.cc
@@ -7,12 +7,37 @@
 #include <nnvm/pass.h>
 #include <nnvm/graph_attr_types.h>
 #include <nnvm/op_attr_types.h>
+#include <nnvm/top/tensor.h>
 #include <memory>
 #include "graph_algorithm.h"
 
 namespace nnvm {
 namespace pass {
 namespace {
+  using namespace nnvm::top;
+// Return bytes of data flag.
+static int GetDTypeSize(int type_flag) {
+  switch (type_flag) {
+    case kUint8:
+    case kInt8:
+      return 1;
+    case kFloat16:
+    case kInt16:
+    case kUint16:
+      return 2;
+    case kFloat32:
+    case kInt32:
+    case kUint32:
+      return 4;
+    case kFloat64:
+    case kInt64:
+    case kUint64:
+      return 8;
+    default:
+      LOG(FATAL) << "unknown type_flag=" << type_flag;
+      return -1;
+  }
+}
 
 // simple graph based allocator.
 class GraphAllocator {
@@ -199,7 +224,8 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx,
             ((storage_ref_count[sid_in] == 1 && !ignore_all_inputs) || identity[ipair]) &&
             entry_ref_count[eid_out] > 0 &&
             shape_vec[eid_out].Size() == shape_vec[eid_in].Size() &&
-            dtype_vec[eid_out] == dtype_vec[eid_in]) {
+             (dtype_vec[eid_out] == dtype_vec[eid_in] ||
+             GetDTypeSize(dtype_vec[eid_out]) == GetDTypeSize(dtype_vec[eid_in]))) {
           // inplace optimization
           taken[kv.first] = true;
           storage[eid_out] = sid_in;

From 8f4883e6a3730982a8a8ecadc19023b795ab4173 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 15 Sep 2018 13:32:30 -0700
Subject: [PATCH 097/529] [NODE] Enable EnvFunc to serialize global function as
 node (#1721)

---
 include/tvm/api_registry.h                    | 115 +++++++++++++++++-
 include/tvm/runtime/packed_func.h             |   8 ++
 python/tvm/api.py                             |  22 ++++
 python/tvm/container.py                       |  14 +++
 src/api/api_test.cc                           |   4 +
 src/lang/api_registry.cc                      |  50 ++++++++
 tests/python/unittest/test_lang_reflection.py |  29 +++++
 7 files changed, 240 insertions(+), 2 deletions(-)
 create mode 100644 src/lang/api_registry.cc

diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index e12ef423ed32..d6e9910ab1ee 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -1,16 +1,18 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * \file tvm/api_registry.h
- * \brief This files include necessary headers to
- *  be used to register an global API function.
+ * \brief This file contains utilities related to
+ *  the TVM's global function registry.
  */
 #ifndef TVM_API_REGISTRY_H_
 #define TVM_API_REGISTRY_H_
 
+#include <string>
 #include "base.h"
 #include "packed_func_ext.h"
 #include "runtime/registry.h"
 
+namespace tvm {
 /*!
  * \brief Register an API function globally.
  * It simply redirects to TVM_REGISTER_GLOBAL
@@ -24,4 +26,113 @@
  */
 #define TVM_REGISTER_API(OpName) TVM_REGISTER_GLOBAL(OpName)
 
+/*!
+ * \brief Node container of EnvFunc
+ * \sa EnvFunc
+ */
+class EnvFuncNode : public Node {
+ public:
+  /*! \brief Unique name of the global function */
+  std::string name;
+  /*! \brief The internal packed function */
+  PackedFunc func;
+  /*! \brief constructor */
+  EnvFuncNode() {}
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+  }
+
+  static constexpr const char* _type_key = "EnvFunc";
+  TVM_DECLARE_NODE_TYPE_INFO(EnvFuncNode, Node);
+};
+
+/*!
+ * \brief A serializable function backed by TVM's global environment.
+ *
+ * This is a wrapper to enable serializable global PackedFunc.
+ * An EnvFunc is saved by its name in the global registry
+ * under the assumption that the same function is registered during load.
+ */
+class EnvFunc : public NodeRef {
+ public:
+  EnvFunc() {}
+  explicit EnvFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  /*! \return The internal global function pointer */
+  const EnvFuncNode* operator->() const {
+    return static_cast<EnvFuncNode*>(node_.get());
+  }
+  /*!
+   * \brief Invoke the function.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  template<typename... Args>
+  runtime::TVMRetValue operator()(Args&&... args) const {
+    const EnvFuncNode* n = operator->();
+    CHECK(n != nullptr);
+    return n->func(std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief Get a global function based on the name.
+   * \param name The name of the global function.
+   * \return The created global function.
+   * \note The function can be unique
+   */
+  TVM_DLL static EnvFunc Get(const std::string& name);
+  /*! \brief specify container node */
+  using ContainerType = EnvFuncNode;
+};
+
+/*!
+ * \brief Please refer to \ref TypedEnvFuncAnchor "TypedEnvFunc<R(Args..)>"
+ */
+template<typename FType>
+class TypedEnvFunc;
+
+/*!
+ * \anchor TypedEnvFuncAnchor
+ * \brief A typed version of EnvFunc.
+ * It is backed by a GlobalFuncNode internally.
+ *
+ * \tparam R The return value of the function.
+ * \tparam Args The argument signature of the function.
+ * \sa EnvFunc
+ */
+template<typename R, typename... Args>
+class TypedEnvFunc<R(Args...)> : public NodeRef {
+ public:
+  /*! \brief short hand for this function type */
+  using TSelf = TypedEnvFunc<R(Args...)>;
+  TypedEnvFunc() {}
+  explicit TypedEnvFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief Assign global function to a TypedEnvFunc
+   * \param other Another global function.
+   * \return reference to self.
+   */
+  TSelf& operator=(const EnvFunc& other) {
+    this->node_ = other.node_;
+    return *this;
+  }
+  /*! \return The internal global function pointer */
+  const EnvFuncNode* operator->() const {
+    return static_cast<EnvFuncNode*>(node_.get());
+  }
+  /*!
+   * \brief Invoke the function.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  R operator()(Args... args) const {
+    const EnvFuncNode* n = operator->();
+    CHECK(n != nullptr);
+    return runtime::detail::typed_packed_call_dispatcher<R>
+        ::run(n->func, std::forward<Args>(args)...);
+  }
+  /*! \brief specify container node */
+  using ContainerType = EnvFuncNode;
+};
+
+}  // namespace tvm
 #endif  // TVM_API_REGISTRY_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index dc7475d3bff1..d1206a8a34f4 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -257,6 +257,14 @@ class TypedPackedFunc<R(Args...)> {
   const PackedFunc& packed() const {
     return packed_;
   }
+  /*! \return Whether the packed function is nullptr */
+  bool operator==(std::nullptr_t null) const {
+    return packed_ == nullptr;
+  }
+  /*! \return Whether the packed function is not nullptr */
+  bool operator!=(std::nullptr_t null) const {
+    return packed_ != nullptr;
+  }
 
  private:
   friend class TVMRetValue;
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 2bcb003ee7e5..223e73eeb596 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -45,6 +45,28 @@ def const(value, dtype=None):
     return _api_internal._const(value, dtype)
 
 
+def get_env_func(name):
+    """Get an EnvFunc by a global name.
+
+    Parameters
+    ----------
+    name: str
+        The name of the global function.
+
+    Returns
+    -------
+    env_func : EnvFunc
+        The result env function.
+
+    Note
+    ----
+    EnvFunc is a Node wrapper around
+    global function that can be serialized via its name.
+    This can be used to serialize function field in the language.
+    """
+    return _api_internal._EnvFuncGet(name)
+
+
 def convert(value):
     """Convert value to TVM node or function.
 
diff --git a/python/tvm/container.py b/python/tvm/container.py
index 27e533113926..eb1f17b0fc9d 100644
--- a/python/tvm/container.py
+++ b/python/tvm/container.py
@@ -27,6 +27,20 @@ def __len__(self):
         return _api_internal._ArraySize(self)
 
 
+@register_node
+class EnvFunc(NodeBase):
+    """Environment function.
+
+    This is a global function object that can be serialized by its name.
+    """
+    def __call__(self, *args):
+        return _api_internal._EnvFuncCall(self, *args)
+
+    @property
+    def func(self):
+        return _api_internal._EnvFuncGetPackedFunc(self)
+
+
 @register_node
 class Map(NodeBase):
     """Map container of TVM.
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
index 7a2ae638a038..181036acf82f 100644
--- a/src/api/api_test.cc
+++ b/src/api/api_test.cc
@@ -14,6 +14,7 @@ struct TestAttrs : public AttrsNode<TestAttrs> {
   int axis;
   std::string name;
   Array<Expr> padding;
+  TypedEnvFunc<int(int)> func;
 
   TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") {
     TVM_ATTR_FIELD(axis)
@@ -26,6 +27,9 @@ struct TestAttrs : public AttrsNode<TestAttrs> {
     TVM_ATTR_FIELD(padding)
         .describe("padding of input")
         .set_default(Array<Expr>({0, 0}));
+    TVM_ATTR_FIELD(func)
+        .describe("some random env function")
+        .set_default(TypedEnvFunc<int(int)>(nullptr));
   }
 };
 
diff --git a/src/lang/api_registry.cc b/src/lang/api_registry.cc
new file mode 100644
index 000000000000..466ee1d3dd68
--- /dev/null
+++ b/src/lang/api_registry.cc
@@ -0,0 +1,50 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file api_registry.cc
+ */
+#include <tvm/api_registry.h>
+
+namespace tvm {
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<EnvFuncNode>([](const EnvFuncNode *op, IRPrinter *p) {
+    p->stream << "EnvFunc(" << op->name << ")";
+});
+
+std::shared_ptr<EnvFuncNode> CreateEnvNode(const std::string& name) {
+  auto* f = runtime::Registry::Get(name);
+  CHECK(f != nullptr) << "Cannot find global function \'" << name << '\'';
+  std::shared_ptr<EnvFuncNode> n = std::make_shared<EnvFuncNode>();
+  n->func = *f;
+  n->name = name;
+  return n;
+}
+
+EnvFunc EnvFunc::Get(const std::string& name) {
+  return EnvFunc(CreateEnvNode(name));
+}
+
+TVM_REGISTER_API("_EnvFuncGet")
+.set_body_typed<EnvFunc(const std::string& name)>(EnvFunc::Get);
+
+TVM_REGISTER_API("_EnvFuncCall")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    EnvFunc env = args[0];
+    CHECK_GE(args.size(), 1);
+    env->func.CallPacked(TVMArgs(args.values + 1,
+                                 args.type_codes + 1,
+                                 args.size() - 1), rv);
+  });
+
+TVM_REGISTER_API("_EnvFuncGetPackedFunc")
+.set_body_typed<PackedFunc(const EnvFunc& n)>([](const EnvFunc&n) {
+    return n->func;
+  });
+
+TVM_REGISTER_NODE_TYPE(EnvFuncNode)
+.set_creator(CreateEnvNode)
+.set_global_key([](const Node* n) {
+    return static_cast<const EnvFuncNode*>(n)->name;
+  });
+
+}  // namespace tvm
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index 567c5ad93e93..83b440a2c1d4 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -56,11 +56,14 @@ def test_make_attrs():
     assert x.padding[1].value == 4
     assert x.axis == 10
 
+
     dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
     assert dattr.x.value == 1
     datrr = tvm.load_json(tvm.save_json(dattr))
     assert dattr.name.value == "xyz"
 
+
+
 def test_make_sum():
     A = tvm.placeholder((2, 10), name='A')
     k = tvm.reduce_axis((0,10), "k")
@@ -70,7 +73,33 @@ def test_make_sum():
     assert B.op.body[0].combiner is not None
     assert BB.op.body[0].combiner is not None
 
+
+def test_env_func():
+    @tvm.register_func("test.env_func")
+    def test(x):
+        return x + 1
+
+    f = tvm.get_global_func("test.env_func")
+    x = tvm.get_env_func("test.env_func")
+    assert x.name == "test.env_func"
+    json_str = tvm.save_json([x])
+    y = tvm.load_json(json_str)[0]
+    assert y.name == x.name
+    assert y(1) == 2
+    assert y.func(1) == 2
+
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4), func=y)
+    assert x.name == "xx"
+    assert x.padding[0].value == 3
+    assert x.padding[1].value == 4
+    assert x.axis == 10
+    x = tvm.load_json(tvm.save_json(x))
+    assert isinstance(x.func, tvm.container.EnvFunc)
+    assert x.func(10) == 11
+
+
 if __name__ == "__main__":
+    test_env_func()
     test_make_attrs()
     test_make_node()
     test_make_smap()

From 9cf04a29cdd0f136fb87590ff44db5f7a94a084c Mon Sep 17 00:00:00 2001
From: Yuwei Hu <huyuwei1995@gmail.com>
Date: Sat, 15 Sep 2018 17:46:40 -0400
Subject: [PATCH 098/529] [Keras] fix weight shape in dilated conv (#1715)

---
 nnvm/python/nnvm/frontend/keras.py            | 14 +++---
 .../python/frontend/keras/test_forward.py     | 47 +++++++++----------
 2 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index eb9bf4d3720d..4ff60e3b3fc3 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -58,8 +58,10 @@ def _convert_activation(insym, keras_layer, _):
         return _get_elu(insym, alpha)
     elif act_type == 'selu':
         # Alpha, Gamma values, obtained from  https://arxiv.org/abs/1706.02515
-        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1.6732
-        gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") else 1.0507
+        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") \
+            else 1.6732632423543772848170429916717
+        gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") \
+            else 1.0507009873554804934193349852946
         return gamma * _get_elu(insym, alpha)
     elif act_type == 'relu6':
         return _sym.clip(insym, a_min=0, a_max=6)
@@ -155,8 +157,8 @@ def _convert_convolution(insym, keras_layer, symtab):
         dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
     else:
         dilation = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-    kernel_h = (kernel_h - 1) * dilation[0] + 1
-    kernel_w = (kernel_w - 1) * dilation[1] + 1
+    dilated_kernel_h = (kernel_h - 1) * dilation[0] + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation[1] + 1
     stride_h, stride_w = keras_layer.strides
     params = {'weight': symtab.new_const(weight),
               'kernel_size': [kernel_h, kernel_w],
@@ -178,8 +180,8 @@ def _convert_convolution(insym, keras_layer, symtab):
     elif keras_layer.padding == 'same':
         in_h = keras_layer.input_shape[1]
         in_w = keras_layer.input_shape[2]
-        pad_t, pad_b = _get_pad_pair(in_h, kernel_h, stride_h)
-        pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w)
+        pad_t, pad_b = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
+        pad_l, pad_r = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
         if pad_t == pad_b and pad_l == pad_r:
             params['padding'] = (pad_t, pad_l)
         else:
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index a8623b8a3976..92410ee8dbb3 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -73,10 +73,10 @@ def test_forward_elemwise_add():
     keras_model = keras.models.Model(data, y)
     verify_keras_frontend(keras_model)
 
+
 def test_forward_dense():
-    data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.MaxPooling2D(pool_size=(2,2))(data)
-    x = keras.layers.Flatten()(x)
+    data = keras.layers.Input(shape=(32,32,1))
+    x = keras.layers.Flatten()(data)
     x = keras.layers.Dropout(0.5)(x)
     x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(x)
     keras_model = keras.models.Model(data, x)
@@ -84,7 +84,7 @@ def test_forward_dense():
 
 
 def test_forward_pool():
-    data = keras.layers.Input(shape=(2,2,1))
+    data = keras.layers.Input(shape=(32,32,1))
     # maxpool
     x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
     keras_model = keras.models.Model(data, x)
@@ -95,25 +95,20 @@ def test_forward_pool():
     verify_keras_frontend(keras_model)
 
 
-def test_forward_transpose_conv():
-    data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.Conv2D(filters=10, kernel_size=(3,3), strides=(2,2), padding='same')(data)
-    x = keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same')(x)
-    x = keras.layers.Conv2DTranspose(filters=64, kernel_size=(3,3), padding='valid')(x)
-    x = keras.layers.GlobalMaxPooling2D()(x)
-    keras_model = keras.models.Model(data, x)
-    verify_keras_frontend(keras_model)
-
-
-def test_forward_separable_conv():
+def test_forward_conv():
     data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3),
-        padding='same', activation='relu')(data)
-    x = keras.layers.BatchNormalization(scale=True, center=False,
-        beta_initializer='uniform', gamma_initializer='uniform')(x)
-    x = keras.layers.GlobalAveragePooling2D()(x)
-    keras_model = keras.models.Model(data, x)
-    verify_keras_frontend(keras_model)
+    conv_funcs = [keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      strides=(2,2), padding='same'),
+                  keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      dilation_rate=(2,2), padding='same'),
+                  keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same'),
+                  keras.layers.Conv2DTranspose(filters=10, kernel_size=(3,3), padding='valid'),
+                  keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')]
+    for conv_func in conv_funcs:
+        x = conv_func(data)
+        x = keras.layers.GlobalAveragePooling2D()(x)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
 
 
 def test_forward_upsample():
@@ -123,6 +118,7 @@ def test_forward_upsample():
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
+
 def test_forward_reshape():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.Reshape(target_shape=(32,32,3))(data)
@@ -168,6 +164,7 @@ def test_forward_mobilenet():
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
+
 def test_forward_activations():
     data = keras.layers.Input(shape=(32,32,3))
     weights = np.random.rand(1, 32, 32, 3)
@@ -187,10 +184,11 @@ def test_forward_activations():
                  keras.layers.Activation('linear')]
     for act_func in act_funcs:
         x = act_func(data)
-        x = keras.layers.GlobalMaxPooling2D()(x)
+        x = keras.layers.GlobalAveragePooling2D()(x)
         keras_model = keras.models.Model(data, x)
         verify_keras_frontend(keras_model)
 
+
 def test_forward_multi_inputs():
     data1 = keras.layers.Input(shape=(32,32,3))
     data2 = keras.layers.Input(shape=(32,32,3))
@@ -239,8 +237,7 @@ def test_forward_reuse_layers():
     test_forward_activations()
     test_forward_dense()
     test_forward_pool()
-    test_forward_transpose_conv()
-    test_forward_separable_conv()
+    test_forward_conv()
     test_forward_upsample()
     test_forward_reshape()
     test_forward_crop()

From 2b46d377c1a52c6b8f45fec7e582a6d7b78408d3 Mon Sep 17 00:00:00 2001
From: yuruofeifei <yuruofeifei@gmail.com>
Date: Mon, 17 Sep 2018 18:00:07 -0700
Subject: [PATCH 099/529] [NNVM][TOPI] Add mean and product operators (#1628)

* Add mean and product operators

* Fix typo

* Fix lint

* fix test

* Fix gpu schedule

* Update doc

* remove mean from topi

* Add nnvm test

* Fix cuda schedule

* Remove cuda schedule
---
 docs/api/python/topi.rst                      |   2 +
 docs/nnvm_top.rst                             |   4 +
 include/tvm/expr.h                            |   1 +
 include/tvm/ir_operator.h                     |   6 +
 nnvm/python/nnvm/top/reduction.py             |   8 +
 nnvm/src/top/tensor/reduce.cc                 |  64 +++++++
 nnvm/tests/python/compiler/test_top_level4.py |  10 ++
 src/lang/ir_operator.cc                       |   9 +
 topi/include/topi/reduction.h                 |  22 +++
 topi/python/topi/reduction.py                 | 164 ++++--------------
 topi/src/topi.cc                              |   5 +
 topi/tests/python/test_topi_reduce.py         |   1 +
 topi/tests/python_cpp/test_topi_reduce.py     |  38 ++--
 13 files changed, 189 insertions(+), 145 deletions(-)

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 7f150ddbf7cd..767dfe1ba844 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -49,6 +49,7 @@ List of operators
    topi.min
    topi.argmax
    topi.argmin
+   topi.prod
    topi.broadcast_to
    topi.add
    topi.subtract
@@ -107,6 +108,7 @@ topi
 .. autofunction:: topi.max
 .. autofunction:: topi.sum
 .. autofunction:: topi.min
+.. autofunction:: topi.prod
 .. autofunction:: topi.broadcast_to
 .. autofunction:: topi.add
 .. autofunction:: topi.subtract
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 663c85ac789e..be1077f664c3 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -114,6 +114,8 @@ This level enables typical convnet models.
    nnvm.symbol.sum
    nnvm.symbol.min
    nnvm.symbol.max
+   nnvm.symbol.mean
+   nnvm.symbol.prod
    nnvm.symbol.broadcast_add
    nnvm.symbol.broadcast_sub
    nnvm.symbol.broadcast_mul
@@ -228,6 +230,8 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.sum
 .. autofunction:: nnvm.symbol.min
 .. autofunction:: nnvm.symbol.max
+.. autofunction:: nnvm.symbol.mean
+.. autofunction:: nnvm.symbol.prod
 .. autofunction:: nnvm.symbol.broadcast_add
 .. autofunction:: nnvm.symbol.broadcast_sub
 .. autofunction:: nnvm.symbol.broadcast_mul
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index fe645bcf580a..fb2233dacb69 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -36,6 +36,7 @@ using HalideIR::Internal::Variable;
 
 using HalideIR::Internal::make_const;
 using HalideIR::Internal::make_zero;
+using HalideIR::Internal::make_one;
 using HalideIR::Internal::as_const_int;
 using HalideIR::Internal::as_const_uint;
 using HalideIR::Internal::const_true;
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index e809b06e49b5..39588a2228f9 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -41,6 +41,12 @@ TVM_DLL Expr max(Expr source, Array<IterVar> axis);
  */
 TVM_DLL Expr min(Expr source, Array<IterVar> axis);
 
+/*!
+ * \brief product of of source expression over axis
+ * \param source The source expression.
+ * \param axis List of iteration variables that will be used for reduction.
+ */
+TVM_DLL Expr prod(Expr source, Array<IterVar> axis);
 
 // Unary intrinsic operators
 #define TVM_DECLARE_INTRIN_UNARY(OpName)                                \
diff --git a/nnvm/python/nnvm/top/reduction.py b/nnvm/python/nnvm/top/reduction.py
index fd8e2f8df56e..aef6e1dcc4a8 100644
--- a/nnvm/python/nnvm/top/reduction.py
+++ b/nnvm/python/nnvm/top/reduction.py
@@ -49,3 +49,11 @@ def _compute(attrs, inputs, out_info):
 # argmin
 reg.register_pattern("argmin", OpPattern.COMM_REDUCE)
 reg.register_schedule("argmin", _fschedule_reduce)
+
+# mean
+reg.register_pattern("mean", OpPattern.COMM_REDUCE)
+reg.register_schedule("mean", _fschedule_reduce)
+
+# product
+reg.register_pattern("prod", OpPattern.COMM_REDUCE)
+reg.register_schedule("prod", _fschedule_reduce)
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index d8f426b4f4bc..10dd95742222 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -322,6 +322,70 @@ values over a given axis.
       topi::argmin(inputs[0], axis, param.keepdims) };
 });
 
+NNVM_REGISTER_REDUCE_OP(mean)
+  .describe(R"code(Computes the mean of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data)
+  [3.22]
+
+  mean(data, axis=[1,2])
+  [ 2.  3.16666667  4.5]
+
+)code" NNVM_ADD_FILELINE)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+    TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
+                                  param.axis, param.exclude);
+    if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
+    auto axis = ShapeToArray(r_axes);
+
+    Expr count = make_one(inputs[0]->dtype);
+    for (auto& i : r_axes) {
+      count *= inputs[0]->shape[i];
+    }
+
+    return Array<Tensor>{
+      topi::divide(topi::sum(inputs[0], axis, param.keepdims), count) };
+});
+
+NNVM_REGISTER_REDUCE_OP(prod)
+  .describe(R"code(Computes the products of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data, axis=1)
+  [35562240]
+
+  mean(data, axis=[1,2])
+  [ 36  480  2058]
+
+)code" NNVM_ADD_FILELINE)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+    TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
+                                  param.axis, param.exclude);
+    if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
+    auto axis = ShapeToArray(r_axes);
+    return Array<Tensor>{
+      topi::prod(inputs[0], axis, param.keepdims) };
+});
+
 
 }  // namespace top
 }  // namespace nnvm
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 6503d2d2292d..16b02f956ccc 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -31,6 +31,9 @@ def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float
     x = sym.Variable("x")
     y = fsym(x + 0, **kwargs)
     for target, ctx in ctx_list():
+        # TODO(yuruofei): remove when cuda reduce schedule is done
+        if target == 'cuda' and fsym == sym.mean:
+            continue
         graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
         m = graph_runtime.create(graph, lib, ctx)
         # set input
@@ -93,6 +96,13 @@ def wrapper(data, axis=None, keepdims=False):
     verify_reduce((4, 4, 3), np.min, sym.min, keepdims=True)
     verify_reduce((4, 4, 3), np.sum, sym.sum, axis=(0, 2))
     verify_reduce((4, 4, 3), np.sum, sym.sum)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1, 2), keepdims=True)
 
     data = np.array([[[1,2],[3,4]],[[3,44],[5,6]]], dtype=np.float32)
     verify_reduce_explicit([2,2,2], data, np.array([[1,1],[1,0]]), sym.argmax, otype='int32', axis=[0,2], exclude=True)
diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index ded27bbdce7e..50e598d13dc2 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -35,4 +35,13 @@ Expr min(Expr source, Array<IterVar> rdom) {
   return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
 }
 
+Expr prod(Expr source, Array<IterVar> rdom) {
+  Var x("x"), y("y");
+  Expr result = ir::Mul::make(x, y);
+  Expr identity_element = make_one(source.type());
+  ir::CommReducer combiner =
+    ir::CommReducerNode::make({x}, {y}, {result}, {identity_element});
+  return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
+}
+
 }  // namespace tvm
diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index f14187471faf..1ac3f2d6157c 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -12,6 +12,7 @@
 #include <vector>
 #include <iterator>
 
+#include "topi/broadcast.h"
 #include "topi/elemwise.h"
 #include "topi/tags.h"
 #include "topi/transform.h"
@@ -288,6 +289,11 @@ inline Expr MaxOp(Expr source, Array<IterVar> axis) {
   return tvm::max(source, axis);  // NOLINT(*)
 }
 
+/*! \brief Wrap tvm::prod to ensure we get the correct overload */
+inline Expr ProdOp(Expr source, Array<IterVar> axis) {
+  return tvm::prod(source, axis);  // NOLINT(*)
+}
+
 /*!
 * \brief Creates an operation that sums array elements over a given axis
 *
@@ -426,5 +432,21 @@ inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false
   return CommReduceIdx(data, axis, func, keepdims);
 }
 
+/*!
+* \brief Creates product operation over given axis.
+*
+* \param data The input tensor
+* \param axis The axis to do product over. If axis is empty, the
+* operation will do the product over all elements of the array.
+* \param keepdims If this is set to true, the axes which are reduced are
+* left in the result as dimensions with size one. This enables the result
+* to broadcast correctly against the input array.
+*
+* \return A Tensor whose op member is the prod operation
+*/
+inline Tensor prod(const Tensor& data, Array<Expr> axis, bool keepdims = false) {  // NOLINT(*)
+  return CommReduce(data, axis, ProdOp, keepdims);
+}
+
 }  // namespace topi
 #endif  // TOPI_REDUCTION_H_
diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py
index 9f88953bb770..52121a506f43 100644
--- a/topi/python/topi/reduction.py
+++ b/topi/python/topi/reduction.py
@@ -2,8 +2,8 @@
 """Reduce operators"""
 from __future__ import absolute_import as _abs
 import tvm
+from . import cpp
 from . import tag
-from .util import ravel_index
 
 def _get_real_axis(ndim, axis):
     if axis is None:
@@ -26,130 +26,6 @@ def _get_real_axis(ndim, axis):
     return real_axis
 
 
-def get_reduce_out_shape(src_shape, axis=None, keepdims=False):
-    """Get the output shape for the reduction OPs
-
-    Parameters
-    ----------
-    src_shape : tuple of int or tvm.expr.IntImm
-
-    axis : None or int or tuple of int
-
-    keepdims : bool
-
-    Returns
-    -------
-    dst_shape : tuple of int or tvm.expr.IntImm
-    """
-    real_axis = _get_real_axis(len(src_shape), axis)
-    if keepdims:
-        dst_shape = [src_shape[i] if i in real_axis else 1 for i in range(len(src_shape))]
-    else:
-        dst_shape = []
-        for i in range(len(src_shape)):
-            if i not in real_axis:
-                dst_shape.append(src_shape[i])
-    return dst_shape
-
-
-def _argmax_comp(lhs, rhs):
-    """Compare function of argmax"""
-    idx = tvm.make.Select((lhs[1] >= rhs[1]), lhs[0], rhs[0])
-    val = tvm.make.Select((lhs[1] >= rhs[1]), lhs[1], rhs[1])
-    return idx, val
-
-
-def _argmax_init(idx_typ, val_typ):
-    """Initial ind and val of argmax"""
-    return tvm.const(-1, idx_typ), tvm.min_value(val_typ)
-
-
-def _argmin_comp(lhs, rhs):
-    """Compare function of argmin"""
-    idx = tvm.make.Select((lhs[1] <= rhs[1]), lhs[0], rhs[0])
-    val = tvm.make.Select((lhs[1] <= rhs[1]), lhs[1], rhs[1])
-    return idx, val
-
-
-def _argmin_init(idx_typ, val_typ):
-    """Initial ind and val of argmax"""
-    return tvm.const(-1, idx_typ), tvm.max_value(val_typ)
-
-
-def _choose_idx(idx, _, *indices):
-    """Chose the idx from idx and val"""
-    return idx(*indices)
-
-
-def comm_reduce(data, axis=None, keepdims=False, func=tvm.sum, is_idx_reduce=False):
-    """Reducing the data
-
-    Parameters
-    ----------
-    data : tvm.Tensor
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a sum is performed.
-        The default, axis=None, will sum all of the elements of the input array.
-        If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-         with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    func : function
-        functions like tvm.sum, tvm.max, tvm.min
-
-    Returns
-    -------
-    ret : tvm.Tensor
-    """
-    ndim = len(data.shape)
-    assert ndim != 0, "Reduce a dim-0 input is not supported!"
-    real_axis = _get_real_axis(ndim, axis)
-    reduce_axes = [tvm.reduce_axis((0, data.shape[i]), "k%d" %i) for i in real_axis]
-    if keepdims:
-        target_shape = [1 if i in real_axis else data.shape[i] for i in range(ndim)]
-    else:
-        target_shape = []
-        for i in range(ndim):
-            if i not in real_axis:
-                target_shape.append(tvm.convert(data.shape[i]))
-    def _compute(*indices):
-        eval_range = []
-        eval_indices = []
-        if not keepdims:
-            arg_counter = 0
-        else:
-            arg_counter = None
-        red_counter = 0
-        for i in range(len(data.shape)):
-            if i in real_axis:
-                eval_range.append(reduce_axes[red_counter])
-                eval_indices.append(reduce_axes[red_counter].var)
-                red_counter += 1
-            else:
-                if not keepdims:
-                    eval_range.append(indices[arg_counter])
-                    arg_counter += 1
-                else:
-                    eval_range.append(indices[i])
-        if not is_idx_reduce:
-            return func(data[tuple(eval_range)], axis=reduce_axes)
-        idx = ravel_index(eval_indices, [data.shape[i] for i in real_axis])
-        return func((idx, data[tuple(eval_range)]), axis=reduce_axes)
-    if is_idx_reduce:
-        temp_idx, temp_val = tvm.compute(target_shape, _compute, name=data.name + "_red_temp")
-        out = tvm.compute(target_shape,
-                          lambda *indices: _choose_idx(temp_idx, temp_val, *indices),
-                          name=data.name + "_red")
-    else:
-        out = tvm.compute(target_shape, _compute, name=data.name + "_red")
-    return out
-
-
 @tvm.tag_scope(tag=tag.COMM_REDUCE)
 def sum(data, axis=None, keepdims=False):
     """Sum of array elements over a given axis or a list of axes
@@ -173,7 +49,7 @@ def sum(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.sum)
+    return cpp.sum(data, axis, keepdims)
 
 
 @tvm.tag_scope(tag=tag.COMM_REDUCE)
@@ -199,7 +75,7 @@ def max(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.max)
+    return cpp.max(data, axis, keepdims)
 
 
 @tvm.tag_scope(tag=tag.COMM_REDUCE)
@@ -225,7 +101,7 @@ def min(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.min)
+    return cpp.min(data, axis, keepdims)
 
 
 @tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
@@ -251,8 +127,7 @@ def argmax(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    _argmax = tvm.comm_reducer(fcombine=_argmax_comp, fidentity=_argmax_init, name='argmax')
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=_argmax, is_idx_reduce=True)
+    return cpp.argmax(data, axis, keepdims)
 
 
 @tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
@@ -278,5 +153,30 @@ def argmin(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    _argmin = tvm.comm_reducer(fcombine=_argmin_comp, fidentity=_argmin_init, name='argmin')
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=_argmin, is_idx_reduce=True)
+    return cpp.argmin(data, axis, keepdims)
+
+
+@tvm.tag_scope(tag=tag.COMM_REDUCE)
+def prod(data, axis=None, keepdims=False):
+    """Product of array elements over a given axis or a list of axes
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        The input tvm tensor
+
+    axis : None or int or tuple of int
+        Axis or axes along which a prod operation is performed.
+        The default, axis=None, will get the prod element over all of the elements of the
+        input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    Returns
+    -------
+    ret : tvm.Tensor
+    """
+    return cpp.prod(data, axis, keepdims)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 4cdab4401459..cac3545a75a2 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -230,6 +230,11 @@ TVM_REGISTER_GLOBAL("topi.argmax")
   *rv = topi::argmax(args[0], ArrayOrInt(args[1]), args[2]);
   });
 
+TVM_REGISTER_GLOBAL("topi.prod")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::prod(args[0], ArrayOrInt(args[1]), args[2]);
+  });
+
 /* Ops from transform.h */
 TVM_REGISTER_GLOBAL("topi.expand_dims")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 0be652948060..ceb2a4fe1bb1 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -72,6 +72,7 @@ def check_device(device):
             out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
         else:
             raise NotImplementedError
+        out_npy = np.atleast_1d(out_npy)
         data_tvm = tvm.nd.array(in_npy, ctx=ctx)
         out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
         for _ in range(1):
diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py
index 7bf369c7f1ff..ab4ac9372373 100644
--- a/topi/tests/python_cpp/test_topi_reduce.py
+++ b/topi/tests/python_cpp/test_topi_reduce.py
@@ -42,6 +42,8 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
     elif type == "argmin":
         B = topi.cpp.argmin(A1, axis, keepdims)
         out_dtype = "int32"
+    elif type == "prod":
+        B = topi.cpp.prod(A1, axis, keepdims)
     else:
         raise NotImplementedError
 
@@ -57,7 +59,7 @@ def check_device(device):
         else:
             s = topi.cpp.cuda.schedule_reduce(target, [B])
 
-        foo = tvm.build(s, [A, B], device, name="sum")
+        foo = tvm.build(s, [A, B], device, name=type)
         # Test
         in_npy = np.random.uniform(size=in_shape).astype(np.float32)
         in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
@@ -71,6 +73,8 @@ def check_device(device):
             out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
         elif type == "argmin":
             out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
+        elif type == "prod":
+            out_npy = in_npy_map.prod(axis=axis, keepdims=keepdims)
         else:
             raise NotImplementedError
         out_npy = np.atleast_1d(out_npy)
@@ -100,21 +104,29 @@ def check_device(device):
 
 def test_reduce_map():
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
-                        axis=(1, 2, 3),
-                        keepdims=True,
-                        type="sum")
+                          axis=(1, 2, 3),
+                          keepdims=True,
+                          type="sum")
     verify_reduce_map_ele(in_shape=(128, 24 * 128 * 24),
-                        axis=(1,),
-                        keepdims=False,
-                        type="max")
+                          axis=(1,),
+                          keepdims=False,
+                          type="max")
     verify_reduce_map_ele(in_shape=(32, 128, 24),
-                        axis=None,
-                        keepdims=True,
-                        type="sum")
+                          axis=None,
+                          keepdims=True,
+                          type="sum")
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
-                        axis=(0, 2),
-                        keepdims=False,
-                        type="min")
+                          axis=(0, 2),
+                          keepdims=False,
+                          type="min")
+    verify_reduce_map_ele(in_shape=(128, 4, 4, 128),
+                          axis=(1, ),
+                          keepdims=True,
+                          type="prod")
+    verify_reduce_map_ele(in_shape=(4, 4),
+                          axis=(0, 1),
+                          keepdims=False,
+                          type="prod")
     verify_reduce_map_ele(in_shape=(32, 128),
                           axis=1,
                           keepdims=True,

From 0bf8d1d417a5f33ce2e071ad16b4d0b447b81325 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 18 Sep 2018 09:29:49 -0700
Subject: [PATCH 100/529] [CODEGEN] Fix let expression (#1727)

---
 src/codegen/codegen_c.cc                 |  3 +--
 topi/tests/python/test_topi_transform.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index 09a6c7e6ab4c..c3b0d278c7ac 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -652,11 +652,10 @@ void CodeGenC::VisitStmt_(const Store* op) {
 }
 
 void CodeGenC::VisitExpr_(const Let* op, std::ostream& os) {  // NOLINT(*)
-  CHECK(print_ssa_form_)
-      << "LetExpr is only supported by print SSA form";
   std::string value = PrintExpr(op->value);
   CHECK(!var_idmap_.count(op->var.get()));
   var_idmap_[op->var.get()] = value;
+  os << PrintExpr(op->body);
 }
 
 void CodeGenC::VisitExpr_(const Ramp* op, std::ostream& os) {  // NOLINT(*)
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 123df331e174..ce2505e0d8f6 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -307,6 +307,21 @@ def test_squeeze():
     verify_squeeze((1, 1, 1, 4), (1, 2))
     verify_squeeze((1, 1, 1, 1), None)
 
+    # a special case to trigger inline let expression
+    A = tvm.placeholder((2,), 'float32', 'A')
+    E = topi.squeeze(A)
+    C = tvm.compute((1,), lambda i: E[(2 * A[0] - 1).astype('int32')])
+    for device in ['cuda', 'opencl']:
+        ctx = tvm.context(device, 0)
+        if ctx.exist:
+            with tvm.target.create(device):
+                s = topi.generic.schedule_injective(C)
+                func = tvm.build(s, [A, C])
+            a = tvm.nd.array(np.array((1, 2)).astype('float32'), ctx=ctx)
+            c = tvm.nd.empty((1,), dtype='float32', ctx=ctx)
+            func(a, c)
+            assert c.asnumpy()[0] == 2
+
 
 def test_concatenate():
     verify_concatenate([(2,), (2,), (2,)], 0)

From 9e4cae08edaac58bb76e9d3462536dea37f64a36 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 18 Sep 2018 22:08:28 +0530
Subject: [PATCH 101/529] [NNVM][KERAS]LSTMCell support (#1686)

---
 nnvm/python/nnvm/frontend/keras.py            | 70 ++++++++++++++++---
 .../python/frontend/keras/test_forward.py     | 43 ++++++++++--
 2 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 4ff60e3b3fc3..07f1ce5024e2 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -395,6 +395,40 @@ def _convert_reshape(insym, keras_layer, _):
     shape = (-1, ch) + keras_layer.target_shape[:-1]
     return _sym.reshape(insym, shape=shape)
 
+def _convert_lstm(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        c_sym = symtab.new_const(buffer)
+        h_sym = symtab.new_const(buffer)
+        insym = [insym, h_sym, c_sym]
+
+    in_data = insym[0]
+    in_state_h = insym[1]
+    in_state_c = insym[2]
+
+    weightList = keras_layer.get_weights()
+
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    ixh1 = _sym.dense(in_data, kernel_wt, use_bias=False, units=units)
+    ixh2 = _sym.dense(in_state_h, recurrent_wt, in_bias, use_bias=True, units=units)
+    gate = ixh1 + ixh2
+    gates = _sym.split(gate, indices_or_sections=4, axis=1)
+    in_gate = _sym.sigmoid(gates[0])
+    in_transform = _sym.sigmoid(gates[1])
+    next_c = in_transform * in_state_c + in_gate * _sym.tanh(gates[2])
+    out_gate = _sym.sigmoid(gates[3])
+    next_h = out_gate * _sym.tanh(next_c)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    out = _sym.reshape(next_h, shape=out_shape)
+    return [out, next_h, next_c]
 
 def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     """Layers that can be skipped because they are train time only."""
@@ -442,7 +476,7 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     # 'Conv1D'                 : _convert_convolution1d,
 
     # 'GRU'                    : _convert_gru,
-    # 'LSTM'                   : _convert_lstm,
+    'LSTM'                     : _convert_lstm,
     # 'SimpleRNN'              : _convert_simple_rnn,
     # 'Bidirectional'          : _convert_bidirectional,
     # 'TimeDistributed'        : _default_skip,
@@ -466,6 +500,11 @@ def _check_unsupported_layers(model):
         if type(layer).__name__ not in _convert_map:
             raise ValueError("Keras layer {} not supported.".format(type(layer).__name__))
 
+def _as_list(arr):
+    """Force being a list, ignore if already is."""
+    if isinstance(arr, list):
+        return arr
+    return [arr]
 
 def keras_op_to_nnvm(insym, keras_layer, outname, symtab):
     """Convert keras layer to nnvm symbol, and update symtab.
@@ -486,9 +525,12 @@ def keras_op_to_nnvm(insym, keras_layer, outname, symtab):
     """
     if type(keras_layer).__name__ not in _convert_map:
         raise NotImplementedError("{} is not supported".format((type(keras_layer).__name__)))
-    ret = _convert_map[type(keras_layer).__name__](insym, keras_layer, symtab)
-    symtab.set_var(outname, ret)
+    outs = _convert_map[type(keras_layer).__name__](insym, keras_layer, symtab)
+    outs = _as_list(outs)
 
+    for t_idx, out in enumerate(outs):
+        name = outname + ":" + str(t_idx)
+        symtab.set_var(name, out)
 
 def from_keras(model):
     """Convert keras model to NNVM format.
@@ -529,7 +571,7 @@ def from_keras(model):
             if inbound_nodes is None:
                 raise TypeError("Unknown layer type or unsupported Keras version : {}"
                                 .format(keras_layer))
-            for my_idx, node in enumerate(inbound_nodes):
+            for node_idx, node in enumerate(inbound_nodes):
                 insym = []
 
                 # Since Keras allows creating multiple layers from the same name instance,
@@ -537,17 +579,25 @@ def from_keras(model):
                 # The one exception is InputLayer.  Changing input variable names after conversion
                 # would confuse users, so we should keep them as far as possible.  Fortunately,
                 # they are named uniquely to input_1, input_2, input_3 ... by default.
-                for pred_idx, pred in zip(node.node_indices, node.inbound_layers):
-                    if isinstance(pred, keras.engine.InputLayer):
-                        sym = symtab.get_var(pred.name, must_contain=True)
+                zip_node = zip(node.node_indices, node.tensor_indices, node.inbound_layers)
+                for n_idx, t_idx, layer in zip_node:
+                    if isinstance(layer, keras.engine.InputLayer):
+                        sym = symtab.get_var(layer.name, must_contain=True)
                     else:
-                        sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
+                        sym_name = layer.name + ':' + str(n_idx) + ':' + str(t_idx)
+                        sym = symtab.get_var(sym_name, must_contain=True)
                     insym.append(sym)
 
                 if len(insym) == 1:
                     insym = insym[0]
-                keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(my_idx), symtab)
+                keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(node_idx), symtab)
+
+    #model._output_coordinates contains out_node(oc[0]), node_index(oc[1]) and tensor index(oc[2])
+    #Get all output nodes in symtab using the name made from above values. The out symbols
+    #were added to symtab in keras_op_to_nnvm using this name. For multiple outputs, make a list
+    #with these output symbols and Group them.
+    outsym = [symtab.get_var(oc[0].name + ":" + str(oc[1]) + ":" + str(oc[2]))
+              for oc in model._output_coordinates]
 
-    outsym = [symtab.get_var(layer.name + ':0') for layer in model._output_layers]
     tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()}
     return _sym.Group(outsym), tvmparams
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 92410ee8dbb3..b1c3730820d4 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -13,16 +13,22 @@
 set_session(tf.Session(config=config))
 
 
-def verify_keras_frontend(keras_model):
+def verify_keras_frontend(keras_model, need_transpose=True):
     # Keras frontend currently supports tensorflow backend only.
     assert(keras.backend.backend() == 'tensorflow')
 
     in_shapes = []
     for layer in keras_model._input_layers:
         in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
+
+    #keras_model._output_coordinates contains the output_node, node_index and tensor_index
+    #get the outshapes from combining output node and tensor index
     out_shapes = []
-    for layer in keras_model._output_layers:
-        out_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.output.shape))
+    for layer, node_index, tensor_index in keras_model._output_coordinates:
+        layer_out = layer.output
+        if isinstance(layer.output, list):#if multiple outputs are there
+            layer_out = layer.output[tensor_index]
+        out_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer_out.shape))
 
     def get_keras_output(xs, dtype='float32'):
         return keras_model.predict(xs)
@@ -46,14 +52,13 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
     keras_out = get_keras_output(xs)
 
     for target, ctx in ctx_list():
-        tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs], target, ctx)
+        tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs ] if need_transpose else xs, target, ctx)
         if isinstance (keras_out, list):
             for kout, tout in zip(keras_out, tvm_out):
                 np.testing.assert_allclose(kout, tout.reshape(kout.shape), rtol=1e-5, atol=1e-5)
         else:
             np.testing.assert_allclose(keras_out, tvm_out.reshape(keras_out.shape), rtol=1e-5, atol=1e-5)
 
-
 def test_forward_elemwise_add():
     r = []
     data = keras.layers.Input(shape=(32,32,3))
@@ -231,6 +236,33 @@ def test_forward_reuse_layers():
     keras_model = keras.models.Model(data, z)
     verify_keras_frontend(keras_model)
 
+def _test_LSTM(inputs, hidden, return_state=True):
+    data = keras.layers.Input(shape=(1, inputs))
+    lstm_out = keras.layers.LSTM(hidden,
+                                 return_state=return_state,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    x = lstm_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_LSTM_MultiLayer(inputs, hidden):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.LSTM(hidden, return_state=True, return_sequences=True,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.LSTM(hidden, recurrent_activation='sigmoid',
+                               activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+
+def test_forward_LSTM():
+    _test_LSTM(8, 8, return_state=True)
+    _test_LSTM(4, 4, return_state=False)
+    _test_LSTM_MultiLayer(4, 4)
 
 if __name__ == '__main__':
     test_forward_elemwise_add()
@@ -249,3 +281,4 @@ def test_forward_reuse_layers():
     test_forward_multi_inputs()
     test_forward_multi_outputs()
     test_forward_reuse_layers()
+    test_forward_LSTM()

From 4bc910bba5262c99c244b10d7245ca19b5942568 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 18 Sep 2018 22:50:54 +0530
Subject: [PATCH 102/529] [COMPILER][BUG] Fix out of bound access. (#1723)

* [COMPILER][BUG] Fix out of bound access.

* 	* Review comments.
---
 nnvm/src/top/tensor/reduce.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 10dd95742222..527a6a5abd74 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -67,10 +67,11 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
   if (r_axes.ndim() == indim)
     return TShape(keepdims ? indim : 1);
 
+  CHECK(r_axes.ndim() < indim);
   if (keepdims) {
     TShape oshape(ishape);
     for (unsigned i = 0, j = 0; i < indim; ++i) {
-      if (i != r_axes[j]) continue;
+      if (j >= r_axes.ndim() || i != r_axes[j]) continue;
       oshape[i] = 1;
       ++j;
     }
@@ -79,7 +80,7 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
 
   TShape oshape(indim - r_axes.ndim());
   for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) {
-    if (i == r_axes[j]) {
+    if (j < r_axes.ndim() && i == r_axes[j]) {
       ++j;
       continue;
     }
@@ -95,7 +96,7 @@ inline bool ReduceShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
   if ((*in_attrs)[0].ndim() == 0) return false;
   const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
-  NNVM_ASSIGN_INPUT_SHAPE(
+  NNVM_ASSIGN_OUTPUT_SHAPE(
       attrs, *out_attrs, 0,
       ReduceShapeImpl((*in_attrs)[0], param.axis,
                       param.keepdims, param.exclude));

From d575ec991c7a11f60f5a98945af683270c8795a5 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 18 Sep 2018 23:28:12 -0700
Subject: [PATCH 103/529] [SUBMODULE] update submodule to latest (#1728)

---
 HalideIR                            | 2 +-
 dlpack                              | 2 +-
 include/tvm/runtime/c_runtime_api.h | 4 ----
 src/codegen/build_module.cc         | 2 +-
 src/runtime/c_runtime_api.cc        | 2 +-
 vta/src/device_api.cc               | 3 +--
 6 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/HalideIR b/HalideIR
index 6f64f7866747..f519848d972c 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit 6f64f7866747a2a81bec84aea9bde0479c5b72c1
+Subproject commit f519848d972c67971b4cbf8c34070d5a5e3ede0d
diff --git a/dlpack b/dlpack
index 10892ac964f1..bee4d1dd8dc1 160000
--- a/dlpack
+++ b/dlpack
@@ -1 +1 @@
-Subproject commit 10892ac964f1af7c81aae145cd3fab78bbccd297
+Subproject commit bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 52499fb9186f..75e936d8f502 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -62,11 +62,7 @@ typedef int64_t tvm_index_t;
 typedef enum {
   kDLAOCL = 5,
   kDLSDAccel = 6,
-  kDLVulkan = 7,
   kOpenGL = 11,
-  // Extension DRAM type, used for quickly test extension device
-  // The device api can differ depending on the xpu driver registered.
-  kExtDev = 12,
   // AddExtraTVMType which is not in DLPack here
 } TVMDeviceExtType;
 
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index fef5a28b1d21..f35b09d1dfe6 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -102,7 +102,7 @@ Target CreateTarget(const std::string& target_name,
   } else if (target_name == "stackvm") {
     t->device_type = kDLCPU;
   } else if (target_name == "ext_dev") {
-    t->device_type = kExtDev;
+    t->device_type = kDLExtDev;
   } else {
     LOG(ERROR) << "Unknown target name " << target_name;
     return target::stackvm();
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index a081a4c1df11..b566b9a3f608 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -38,7 +38,7 @@ inline std::string DeviceName(int type) {
     case kDLVPI: return "vpi";
     case kDLROCM: return "rocm";
     case kOpenGL: return "opengl";
-    case kExtDev: return "ext_dev";
+    case kDLExtDev: return "ext_dev";
     default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
   }
 }
diff --git a/vta/src/device_api.cc b/vta/src/device_api.cc
index 88990e1b1331..a2729baa2591 100644
--- a/vta/src/device_api.cc
+++ b/vta/src/device_api.cc
@@ -72,8 +72,7 @@ class VTADeviceAPI final : public DeviceAPI {
 
 struct VTAWorkspacePool : public WorkspacePool {
   VTAWorkspacePool() :
-      WorkspacePool(static_cast<DLDeviceType>(kExtDev),
-                    VTADeviceAPI::Global()) {}
+      WorkspacePool(kDLExtDev, VTADeviceAPI::Global()) {}
 };
 
 void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {

From 3f1779acf01b379859b9db7e21d86f480d4cd010 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Sep 2018 08:18:52 -0700
Subject: [PATCH 104/529] Fix VTA Tutorial for more strict graphrt check
 (#1737)

---
 vta/tutorials/resnet.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py
index 8ca5eb7375b3..8bdb53d15583 100644
--- a/vta/tutorials/resnet.py
+++ b/vta/tutorials/resnet.py
@@ -61,8 +61,8 @@ def classify(m, image):
     m.set_input('data', image)
     timer = m.module.time_evaluator("run", ctx, number=1)
     tcost = timer()
-    tvm_output = m.get_output(0, tvm.nd.empty((1000,), "float32", remote.cpu(0)))
-    top = np.argmax(tvm_output.asnumpy())
+    tvm_output = m.get_output(0)
+    top = np.argmax(tvm_output.asnumpy()[0])
     tcost = "t={0:.2f}s".format(tcost.mean)
     return tcost + " {}".format(synset[top])
 
@@ -237,8 +237,8 @@ def generate_graph(graph_fn, params_fn, device="vta"):
 tcost = timer()
 
 # Get classification results
-tvm_output = m.get_output(0, tvm.nd.empty((1000,), "float32", remote.cpu(0)))
-top_categories = np.argsort(tvm_output.asnumpy())
+tvm_output = m.get_output(0)
+top_categories = np.argsort(tvm_output.asnumpy()[0])
 
 # Report top-5 classification results
 print("ResNet-18 Prediction #1:", synset[top_categories[-1]])

From c8fdefcd94e4d6d38b8df38941493093031e1224 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Sep 2018 10:10:26 -0700
Subject: [PATCH 105/529] [TOPI] Fix reduce behavior to be consistent to numpy
 (#1738)

[TOPI] Fix reduce behavior to be consistent with numpy
---
 topi/include/topi/reduction.h             | 3 ---
 topi/tests/python/test_topi_reduce.py     | 1 -
 topi/tests/python_cpp/test_topi_reduce.py | 1 -
 3 files changed, 5 deletions(-)

diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index 1ac3f2d6157c..ccc85e96643e 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -95,9 +95,6 @@ inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
         target_shape.push_back(data->shape[i]);
       }
     }
-    if (target_shape.size() == 0) {
-      target_shape.push_back(1);
-    }
   }
   return target_shape;
 }
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index ceb2a4fe1bb1..0be652948060 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -72,7 +72,6 @@ def check_device(device):
             out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
         else:
             raise NotImplementedError
-        out_npy = np.atleast_1d(out_npy)
         data_tvm = tvm.nd.array(in_npy, ctx=ctx)
         out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
         for _ in range(1):
diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py
index ab4ac9372373..b17176938d82 100644
--- a/topi/tests/python_cpp/test_topi_reduce.py
+++ b/topi/tests/python_cpp/test_topi_reduce.py
@@ -77,7 +77,6 @@ def check_device(device):
             out_npy = in_npy_map.prod(axis=axis, keepdims=keepdims)
         else:
             raise NotImplementedError
-        out_npy = np.atleast_1d(out_npy)
         data_tvm = tvm.nd.array(in_npy, ctx=ctx)
         out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
         for _ in range(1):

From 999da0c498c51537192a6bd9e347b707a85a2a64 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Sep 2018 12:49:20 -0700
Subject: [PATCH 106/529] [NNVM] Recover reduction behavir, fix CI (#1740)

---
 nnvm/src/top/tensor/reduce.cc |  5 +++++
 topi/include/topi/reduction.h | 11 +++++++++++
 topi/src/topi.cc              |  2 ++
 3 files changed, 18 insertions(+)

diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 527a6a5abd74..91d2ea7202b8 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -3,6 +3,9 @@
  * \file reduce.cc
  * \brief reduce operator.
  */
+// Enforce TOPI to use old behavior that reduces to at least 1d
+#define TOPI_REDUCE_ATLEAST1D 1
+
 #include <nnvm/op.h>
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
@@ -17,6 +20,8 @@
 #include "topi/reduction.h"
 #include "topi/transform.h"
 
+static_assert(TOPI_REDUCE_ATLEAST1D, "need to use legacy reduce behavior");
+
 namespace nnvm {
 namespace top {
 using namespace tvm;
diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index ccc85e96643e..d68b9b390419 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -20,6 +20,14 @@
 #include "topi/detail/constant_utils.h"
 #include "tvm/tvm.h"
 
+/*!
+ * \brief macro flag to enable some legacy behavior which requires
+ * reduction result to be at least 1d.
+ */
+#ifndef TOPI_REDUCE_ATLEAST1D
+#define TOPI_REDUCE_ATLEAST1D 0
+#endif
+
 namespace topi {
 using namespace tvm;
 
@@ -96,6 +104,9 @@ inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
       }
     }
   }
+  if (target_shape.size() == 0 && TOPI_REDUCE_ATLEAST1D) {
+    target_shape.push_back(1);
+  }
   return target_shape;
 }
 
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index cac3545a75a2..fef2487e6770 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -3,6 +3,8 @@
 * \brief Registration of TVM operators and schedules
 * \file topi.cc
 */
+#define TOPI_REDUCE_ATLEAST1D 0
+
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>

From 0801889971663ddbcfcd7d200fb974e1154191b5 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Wed, 19 Sep 2018 15:55:12 -0700
Subject: [PATCH 107/529] [High level OPT][RFC] NNVMv2 IR - Relay (#1672)

---
 CMakeLists.txt                                |   6 +
 docs/conf.py                                  |   2 +-
 include/tvm/relay/base.h                      | 203 ++++++
 include/tvm/relay/environment.h               | 121 ++++
 include/tvm/relay/error.h                     |  36 +
 include/tvm/relay/expr.h                      | 378 +++++++++++
 include/tvm/relay/expr_functor.h              | 170 +++++
 include/tvm/relay/logging.h                   |  33 +
 include/tvm/relay/op.h                        | 469 +++++++++++++
 include/tvm/relay/pass.h                      |  85 +++
 include/tvm/relay/type.h                      | 276 ++++++++
 python/tvm/relay/__init__.py                  |  34 +
 python/tvm/relay/_env.py                      |   5 +
 python/tvm/relay/_env.pyi                     |   5 +
 python/tvm/relay/_ir_pass.py                  |   5 +
 python/tvm/relay/_ir_pass.pyi                 |   6 +
 python/tvm/relay/_make.py                     |   9 +
 python/tvm/relay/base.py                      |  26 +
 python/tvm/relay/env.py                       |  84 +++
 python/tvm/relay/expr.py                      | 115 ++++
 python/tvm/relay/expr.pyi                     | 114 ++++
 python/tvm/relay/ir_builder.py                | 394 +++++++++++
 python/tvm/relay/ir_pass.py                   |  12 +
 python/tvm/relay/op/__init__.py               |  12 +
 python/tvm/relay/op/_make.py                  |   4 +
 python/tvm/relay/op/_tensor.py                |   2 +
 python/tvm/relay/op/op.py                     |  77 +++
 python/tvm/relay/op/tensor.py                 | 114 ++++
 python/tvm/relay/ty.py                        | 138 ++++
 python/tvm/relay/ty.pyi                       | 139 ++++
 python/tvm/tensor.py                          |   5 +
 src/relay/ir/base.cc                          |  77 +++
 src/relay/ir/environment.cc                   | 147 ++++
 src/relay/ir/expr.cc                          | 201 ++++++
 src/relay/ir/expr_functor.cc                  | 205 ++++++
 src/relay/ir/op.cc                            | 155 +++++
 src/relay/ir/type.cc                          | 121 ++++
 src/relay/op/tensor/elemwise.cc               | 137 ++++
 src/relay/op/type_relations.cc                | 206 ++++++
 src/relay/op/type_relations.h                 |  67 ++
 src/relay/pass/alpha_eq.cc                    | 258 +++++++
 src/relay/pass/incomplete_type.h              |  38 ++
 src/relay/pass/kind_check.cc                  |  42 ++
 src/relay/pass/resolve.cc                     | 100 +++
 src/relay/pass/resolve.h                      |  47 ++
 src/relay/pass/type_functor.h                 |  93 +++
 src/relay/pass/type_infer.cc                  | 629 ++++++++++++++++++
 src/relay/pass/type_subst.cc                  |  39 ++
 src/relay/pass/type_subst.h                   |  19 +
 src/relay/pass/type_visitor.h                 | 120 ++++
 src/relay/pass/unifier.cc                     | 324 +++++++++
 src/relay/pass/unifier.h                      | 141 ++++
 tests/python/relay/test_ir_builder.py         |  20 +
 tests/python/relay/test_ir_nodes.py           | 159 +++++
 tests/python/relay/test_relay_op.py           |  27 +
 .../relay/test_tyck_eval_integration.py       | 162 +++++
 tests/scripts/task_python_integration.sh      |   2 +
 57 files changed, 6614 insertions(+), 1 deletion(-)
 create mode 100644 include/tvm/relay/base.h
 create mode 100644 include/tvm/relay/environment.h
 create mode 100644 include/tvm/relay/error.h
 create mode 100644 include/tvm/relay/expr.h
 create mode 100644 include/tvm/relay/expr_functor.h
 create mode 100644 include/tvm/relay/logging.h
 create mode 100644 include/tvm/relay/op.h
 create mode 100644 include/tvm/relay/pass.h
 create mode 100644 include/tvm/relay/type.h
 create mode 100644 python/tvm/relay/__init__.py
 create mode 100644 python/tvm/relay/_env.py
 create mode 100644 python/tvm/relay/_env.pyi
 create mode 100644 python/tvm/relay/_ir_pass.py
 create mode 100644 python/tvm/relay/_ir_pass.pyi
 create mode 100644 python/tvm/relay/_make.py
 create mode 100644 python/tvm/relay/base.py
 create mode 100644 python/tvm/relay/env.py
 create mode 100644 python/tvm/relay/expr.py
 create mode 100644 python/tvm/relay/expr.pyi
 create mode 100644 python/tvm/relay/ir_builder.py
 create mode 100644 python/tvm/relay/ir_pass.py
 create mode 100644 python/tvm/relay/op/__init__.py
 create mode 100644 python/tvm/relay/op/_make.py
 create mode 100644 python/tvm/relay/op/_tensor.py
 create mode 100644 python/tvm/relay/op/op.py
 create mode 100644 python/tvm/relay/op/tensor.py
 create mode 100644 python/tvm/relay/ty.py
 create mode 100644 python/tvm/relay/ty.pyi
 create mode 100644 src/relay/ir/base.cc
 create mode 100644 src/relay/ir/environment.cc
 create mode 100644 src/relay/ir/expr.cc
 create mode 100644 src/relay/ir/expr_functor.cc
 create mode 100644 src/relay/ir/op.cc
 create mode 100644 src/relay/ir/type.cc
 create mode 100644 src/relay/op/tensor/elemwise.cc
 create mode 100644 src/relay/op/type_relations.cc
 create mode 100644 src/relay/op/type_relations.h
 create mode 100644 src/relay/pass/alpha_eq.cc
 create mode 100644 src/relay/pass/incomplete_type.h
 create mode 100644 src/relay/pass/kind_check.cc
 create mode 100644 src/relay/pass/resolve.cc
 create mode 100644 src/relay/pass/resolve.h
 create mode 100644 src/relay/pass/type_functor.h
 create mode 100644 src/relay/pass/type_infer.cc
 create mode 100644 src/relay/pass/type_subst.cc
 create mode 100644 src/relay/pass/type_subst.h
 create mode 100644 src/relay/pass/type_visitor.h
 create mode 100644 src/relay/pass/unifier.cc
 create mode 100644 src/relay/pass/unifier.h
 create mode 100644 tests/python/relay/test_ir_builder.py
 create mode 100644 tests/python/relay/test_ir_nodes.py
 create mode 100644 tests/python/relay/test_relay_op.py
 create mode 100644 tests/python/relay/test_tyck_eval_integration.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 572f4aef1432..65a7d9e36e2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,12 @@ file(GLOB COMPILER_SRCS
     src/schedule/*.cc
     )
 
+file(GLOB_RECURSE RELAY_SRCS
+    src/relay/*.cc
+    )
+list(APPEND COMPILER_SRCS ${RELAY_SRCS})
+
+
 if(NOT MSVC)
   file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc)
   list(APPEND COMPILER_SRCS ${COMPILER_VERILOG_SRCS})
diff --git a/docs/conf.py b/docs/conf.py
index e3f7f6a82c24..717003824703 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,7 +33,7 @@
 # General information about the project.
 project = u'tvm'
 author = u'%s developers' % project
-copyright = u'2017, %s' % author
+copyright = u'2018, %s' % author
 github_doc_root = 'https://github.com/tqchen/tvm/tree/master/docs/'
 
 # add markdown parser
diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
new file mode 100644
index 000000000000..7c66d2c2de43
--- /dev/null
+++ b/include/tvm/relay/base.h
@@ -0,0 +1,203 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/base.h
+ * \brief Base classes for the Relay IR.
+ */
+#ifndef TVM_RELAY_BASE_H_
+#define TVM_RELAY_BASE_H_
+
+#include <tvm/api_registry.h>
+#include <tvm/ir.h>
+#include <tvm/node.h>
+#include <string>
+#include <vector>
+
+namespace tvm {
+/*!
+ * \brief Relay: a high level functional IR for TVM.
+ *
+ * This namespace contains the abstract syntax tree, and other
+ * essential data structures for the Relay IR.
+ *
+ * You can find more about Relay by reading the language reference.
+ */
+namespace relay {
+/*!
+ * \brief we always used NodeRef for referencing nodes.
+ *
+ *  By default, NodeRef is a std::shared_ptr of node
+ */
+using NodeRef = tvm::NodeRef;
+
+/*!
+ * \brief Content data type.
+ */
+using DataType = ::tvm::Type;
+
+/*!
+ * \brief Symbolic expression for tensor shape.
+ */
+using ShapeExpr = ::tvm::Expr;
+
+/*!
+ * \brief Hash function for nodes.
+ * e.g. std::unordered_map<Expr, Value, NodeHash, NodeEqual>
+ */
+using NodeHash = ::tvm::NodeHash;
+/*!
+ * \brief Equality check function for nodes.
+ */
+using NodeEqual = ::tvm::NodeEqual;
+
+/*!
+ * \brief Macro to make it easy to define node ref type given node
+ * \param TypeName The name of the reference type.
+ * \param NodeName The internal container name.
+ * \param NodeRefBase The base type.
+ */
+#define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase)            \
+  class TypeName : public NodeRefBase {                                   \
+   public:                                                                \
+    TypeName() {}                                                         \
+    explicit TypeName(std::shared_ptr<::tvm::Node> n) : NodeRefBase(n) {} \
+    const NodeName* operator->() const {                                  \
+      return static_cast<const NodeName*>(node_.get());                   \
+    }                                                                     \
+    operator bool() { return this->defined(); }                           \
+    using ContainerType = NodeName;                                       \
+  };
+
+/*!
+ * \brief The source name in the Span
+ * \sa SourceNameNode, Span
+ */
+class SourceName;
+/*!
+ * \brief The name of a source fragment.
+ */
+class SourceNameNode : public Node {
+ public:
+  /*! \brief The source name. */
+  std::string name;
+  // override attr visitor
+  void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); }
+
+  TVM_DLL static SourceName make(std::string name);
+
+  static constexpr const char* _type_key = "relay.SourceName";
+  TVM_DECLARE_NODE_TYPE_INFO(SourceNameNode, Node);
+};
+
+/*!
+ * \brief The source name of a file span.
+ * \sa SourceNameNode, Span
+ */
+class SourceName : public NodeRef {
+ public:
+  /*! \brief default constructor  */
+  SourceName() {}
+
+  /*! \brief constructor from node pointer */
+  explicit SourceName(std::shared_ptr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const SourceNameNode* operator->() const;
+
+  /*!
+   * \brief Get an SourceName for a given operator name.
+   *  Will raise an error if the source name has not been registered.
+   * \param name Name of the operator.
+   * \return Reference to a SourceName valid throughout program lifetime.
+   */
+  TVM_DLL static const SourceName& Get(const std::string& name);
+
+  /*! \brief specify container node */
+  using ContainerType = SourceNameNode;
+};
+
+/*!
+ * \brief Span information for debugging purposes
+ */
+class Span;
+/*!
+ * \brief Stores locations in frontend source that generated a node.
+ */
+class SpanNode : public Node {
+ public:
+  /*! \brief The source name */
+  SourceName source;
+  /*! \brief Line number */
+  int lineno;
+  /*! \brief column offset */
+  int col_offset;
+  // override attr visitor
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("source", &source);
+    v->Visit("lineno", &lineno);
+    v->Visit("col_offset", &col_offset);
+  }
+
+  TVM_DLL static Span make(SourceName source, int lineno, int col_offset);
+
+  static constexpr const char* _type_key = "relay.Span";
+  TVM_DECLARE_NODE_TYPE_INFO(SpanNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(Span, SpanNode, NodeRef);
+
+/*!
+ * \brief This is the base node container of all relay structures.
+ */
+class RelayNode : public Node {
+ public:
+  /*! \brief The location of the program in a SourceFragment can be null,
+   * check with span.defined() */
+  mutable Span span;
+
+  static constexpr const char* _type_key = "relay.Node";
+  TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
+};
+
+/*!
+ * \brief Get a reference type from a Node ptr type
+ *
+ *  It is always important to get a reference type
+ *  if we want to return a value as reference or keep
+ *  the node alive beyond the scope of the function.
+ *
+ * \param ptr The node pointer
+ * \tparam RefType The reference type
+ * \tparam NodeType The node type
+ * \return The corresponding RefType
+ */
+template <typename RefType, typename NodeType>
+RefType GetRef(const NodeType* ptr) {
+  static_assert(std::is_same<typename RefType::ContainerType, NodeType>::value,
+                "Can only cast to the ref of same container type");
+  return RefType(const_cast<NodeType*>(ptr)->shared_from_this());
+}
+
+// TODO(@tqchen, @jroesch): can we move these semantics to HalideIR
+template <typename T>
+inline const T* As(const NodeRef& node) {
+  const Node* ptr = static_cast<const Node*>(node.get());
+  if (ptr && (ptr->is_type<T>() || ptr->derived_from<T>())) {
+    return static_cast<const T*>(ptr);
+  }
+  return nullptr;
+}
+
+template <typename SubRef, typename BaseRef>
+SubRef Downcast(BaseRef ref) {
+  CHECK(ref->template is_type<typename SubRef::ContainerType>())
+      << "Downcast from " << ref->type_key() << " to "
+      << SubRef::ContainerType::_type_key << " failed.";
+  return SubRef(ref.node_);
+}
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BASE_H_
diff --git a/include/tvm/relay/environment.h b/include/tvm/relay/environment.h
new file mode 100644
index 000000000000..7e07dc01eab4
--- /dev/null
+++ b/include/tvm/relay/environment.h
@@ -0,0 +1,121 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/environment.h
+ * \brief The global environment: contains information needed to
+ * compile & optimize Relay programs.
+ */
+#ifndef TVM_RELAY_ENVIRONMENT_H_
+#define TVM_RELAY_ENVIRONMENT_H_
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/type.h>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+struct Environment;
+
+/*! \brief The global environment of Relay programs.
+ *
+ *  The global environment contains the global
+ *  information needed to compile a Relay program.
+ *
+ *  It contains all global functions, and configuration
+ *  options.
+ *
+ *  Many operations require access to the global
+ *  Environment. We pass the Environment by value
+ *  in a functional style as an explicit argument,
+ *  but we mutate the Environment while optimizing
+ *  Relay programs.
+ *
+ *  The functional style allows users to construct custom
+ *  environments easily, for example each thread can store
+ *  an Environment while auto-tuning.
+ * */
+
+class EnvironmentNode : public RelayNode {
+ public:
+  /*! \brief A map from ids to all global functions. */
+  tvm::Map<GlobalVar, Function> functions;
+
+  EnvironmentNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("functions", &functions);
+    v->Visit("global_map_", &global_map_);
+  }
+
+  TVM_DLL static Environment make(tvm::Map<GlobalVar, Function> global_funcs);
+
+  /*! \brief Add a function to the global environment.
+   * \param var The name of the global function.
+   * \param func The function.
+   * \param update Controls whether you can replace a definition in the
+   * environment.
+   */
+  void Add(const GlobalVar& var, const Function& func, bool update = false);
+
+  /*! \brief Update a function in the global environment.
+   * \param var The name of the global function to update.
+   * \param func The new function.
+   */
+  void Update(const GlobalVar& var, const Function& func);
+
+  /*! \brief Remove a function from the global environment.
+   * \param var The name of the global function to update.
+   */
+  void Remove(const GlobalVar& var);
+
+  /*! \brief Lookup a global function by its variable.
+   * \param str The unique string specifying the global variable.
+   * \returns The global variable.
+   */
+  GlobalVar GetGlobalVar(const std::string& str);
+
+  /*! \brief Lookup a global function by its variable.
+   * \param var The global var to lookup.
+   * \returns The function named by the variable argument.
+   */
+  Function Lookup(const GlobalVar& var);
+
+  /*! \brief Lookup a global function by its string name
+   * \param name The name of the function.
+   * \returns The function named by the argument.
+   */
+  Function Lookup(const std::string& name);
+
+  /*! \brief Combine with another Environment.
+   * \param other The other environment.
+   */
+  void Merge(const Environment& other);
+
+  static constexpr const char* _type_key = "relay.Environment";
+  TVM_DECLARE_NODE_TYPE_INFO(EnvironmentNode, Node);
+
+ private:
+  /*! \brief A map from string names to global variables that 
+   * ensures global uniqueness. 
+   */
+  tvm::Map<std::string, GlobalVar> global_map_;
+};
+
+struct Environment : public NodeRef {
+  Environment() {}
+  explicit Environment(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+
+  inline EnvironmentNode* operator->() const {
+    return static_cast<EnvironmentNode*>(node_.get());
+  }
+
+  using ContainerType = EnvironmentNode;
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_ENVIRONMENT_H_
diff --git a/include/tvm/relay/error.h b/include/tvm/relay/error.h
new file mode 100644
index 000000000000..8ce73a027ca0
--- /dev/null
+++ b/include/tvm/relay/error.h
@@ -0,0 +1,36 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file error.h
+ * \brief The set of errors raised by Relay.
+ */
+#ifndef TVM_RELAY_ERROR_H_
+#define TVM_RELAY_ERROR_H_
+
+#include <string>
+#include "./base.h"
+
+namespace tvm {
+namespace relay {
+
+struct Error : dmlc::Error {
+  explicit Error(const std::string &msg) : dmlc::Error(msg) {}
+};
+
+struct InternalError : Error {
+  explicit InternalError(const std::string &msg) : Error(msg) {}
+};
+
+// TODO(@jroesch): we should change spanned errors to report
+// errors against the Environment, inverting control to error definition.
+struct FatalTypeError : dmlc::Error {
+  explicit FatalTypeError(const std::string &s) : dmlc::Error(s) {}
+};
+
+struct TypecheckerError : public dmlc::Error {
+  explicit TypecheckerError(const std::string &msg) : Error(msg) {}
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_ERROR_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
new file mode 100644
index 000000000000..6388e8367bf6
--- /dev/null
+++ b/include/tvm/relay/expr.h
@@ -0,0 +1,378 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/expr.h
+ * \brief Relay expression language.
+ */
+#ifndef TVM_RELAY_EXPR_H_
+#define TVM_RELAY_EXPR_H_
+
+#include <tvm/attrs.h>
+#include <string>
+#include "./base.h"
+#include "./type.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A Relay expression.
+ */
+class Expr;
+/*!
+ * \brief Base type of the Relay expression hiearchy.
+ */
+class ExprNode : public RelayNode {
+ public:
+  /*!
+   * \brief Stores the result of type inference(type checking).
+   *
+   * \note This can be undefined before type inference.
+   *       This value is discarded during serialization.
+   */
+  mutable Type checked_type_ = Type(nullptr);
+  /*!
+   * \return The checked_type
+   */
+  const Type& checked_type() const {
+    CHECK(checked_type_.defined()) << "internal error: the type checker has "
+                                      "not populated the checked_type "
+                                      "field for this node";
+    return this->checked_type_;
+  }
+
+  static constexpr const char* _type_key = "relay.Expr";
+  TVM_DECLARE_BASE_NODE_INFO(ExprNode, RelayNode);
+};
+
+RELAY_DEFINE_NODE_REF(Expr, ExprNode, NodeRef);
+
+/*!
+ * \brief Constant tensor, backed by an NDArray on the cpu(0) device.
+ *
+ * \note Scalar constants are represented by rank-0 const tensor.
+ *  Constant folding are handled uniformly via Tensor types.
+ */
+class Constant;
+/*!
+ * \brief Constant tensor type.
+ */
+class ConstantNode : public ExprNode {
+ public:
+  /*! \brief The data of the tensor */
+  runtime::NDArray data;
+
+  /*! \return The corresponding tensor type of the data */
+  TensorType tensor_type() const;
+
+  /*! \return Whether it is scalar(rank-0 tensor) */
+  bool is_scalar() const { return data->ndim == 0; }
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("data", &data);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Constant make(runtime::NDArray data);
+
+  static constexpr const char* _type_key = "relay.Constant";
+  TVM_DECLARE_NODE_TYPE_INFO(ConstantNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Constant, ConstantNode, Expr);
+
+/*! \brief Tuple of multiple Exprs */
+class Tuple;
+/*! \brief Tuple container */
+class TupleNode : public ExprNode {
+ public:
+  /*! \brief the fields of the tuple */
+  tvm::Array<relay::Expr> fields;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("fields", &fields);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Tuple make(tvm::Array<relay::Expr> fields);
+
+  static constexpr const char* _type_key = "relay.Tuple";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Tuple, TupleNode, Expr);
+
+/*!
+ * \brief Local variables used in the let expression.
+ *
+ * Its semantics are similar to tvm.Var node used in TVM's low level
+ * tensor expression language.
+ *
+ * \note Each Var is bind only once and is immutable/
+ */
+class Var;
+/*! \brief Container for Var */
+class VarNode : public ExprNode {
+ public:
+  /*! \brief The name of the variable, this only acts as a hint to the user,
+   * and is not used for equality.
+   */
+  std::string name_hint;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name_hint", &name_hint);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Var make(std::string name_hint);
+
+  static constexpr const char* _type_key = "relay.Var";
+  TVM_DECLARE_NODE_TYPE_INFO(VarNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Var, VarNode, Expr);
+
+/*!
+ * \brief Global variable that leaves in the top-level environment.
+ * This is used to enable recursive calls between function.
+ *
+ * \note A GlobalVar may only point to functions.
+ */
+class GlobalVar;
+/*! \brief A GlobalId from the node's current type to target type. */
+class GlobalVarNode : public ExprNode {
+ public:
+  /*! \brief The name of the variable, this only acts as a hint. */
+  std::string name_hint;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name_hint", &name_hint);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static GlobalVar make(std::string name_hint);
+
+  static constexpr const char* _type_key = "relay.GlobalVar";
+  TVM_DECLARE_NODE_TYPE_INFO(GlobalVarNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(GlobalVar, GlobalVarNode, Expr);
+
+/*!
+ * \brief Function parameter declaration.
+ */
+class Param;
+/*! \brief A parameter. */
+class ParamNode : public ExprNode {
+ public:
+  /*! \brief The variable */
+  Var var;
+  /*! \brief The type of the parameter */
+  Type type;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("var", &var);
+    v->Visit("type", &type);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static Param make(Var var, Type type);
+
+  static constexpr const char* _type_key = "relay.Param";
+  TVM_DECLARE_NODE_TYPE_INFO(ParamNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Param, ParamNode, Expr);
+
+/*!
+ * \brief Function (subgraph in computational graph)
+ */
+class Function;
+/*! \brief Function container */
+class FunctionNode : public ExprNode {
+ public:
+  /*! \brief Function parameters */
+  tvm::Array<Param> params;
+  /*! \brief User annotated return type of the function. */
+  Type ret_type;
+  /*!
+   * \brief
+   * The expression which represents the computation of the function,
+   * the expression may reference the parameters, and the type of it
+   * or sub-expressions may reference the type variables.
+   */
+  Expr body;
+  /*!
+   * \brief Type parameters of the function.
+   *  Enables the function to vary its type based on these.
+   *  This corresponds to template paramaters in c++'s terminology.
+   *
+   * \note This can be usually empty for non-polymorphic functions.
+   */
+  tvm::Array<TypeParam> type_params;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("params", &params);
+    v->Visit("ret_type", &ret_type);
+    v->Visit("body", &body);
+    v->Visit("type_params", &type_params);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  Type fn_type() const;
+
+  TVM_DLL static Function make(tvm::Array<Param> params, Type ret_type,
+                               Expr body, tvm::Array<TypeParam> ty_params);
+
+  static constexpr const char* _type_key = "relay.Function";
+  TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Function, FunctionNode, Expr);
+
+/*!
+ * \brief Call corresponds to operator invocation.
+ *  Corresponds to the operator in computational graph terminology.
+ */
+class Call;
+/*! \brief Call container. */
+class CallNode : public ExprNode {
+ public:
+  /*!
+   * \brief The operator(function) being invoked
+   *
+   *  - It can be relay::Op which corresponds to the primitive operators.
+   *  - It can also be user defined functions (Function, GlobalVar, Var).
+   */
+  Expr op;
+
+  /*! \brief The arguments(inputs) of the call */
+  tvm::Array<relay::Expr> args;
+
+  /*! \brief The additional attributes */
+  Attrs attrs;
+
+  /*!
+   * \brief The type arguments passed to polymorphic(template) function.
+   *
+   * This is the advance feature that is only used when the function is
+   * polymorphic. It is safe to be ignored in most cases. For example, in the
+   * following code, the type_args of addone call is [int].
+   *
+   * \code
+   *
+   * template<typename T>
+   * T addone(T a) { return a + 1; }
+   *
+   * void main() {
+   *   int x = addone<int>(10);
+   * }
+   *
+   * \endcode
+   */
+  tvm::Array<Type> type_args;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("op", &op);
+    v->Visit("args", &args);
+    v->Visit("attrs", &attrs);
+    v->Visit("type_args", &type_args);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Call make(Expr op, Array<Expr> args, Attrs attrs = Attrs(),
+                           Array<Type> ty_args = Array<Type>());
+
+  static constexpr const char* _type_key = "relay.Call";
+  TVM_DECLARE_NODE_TYPE_INFO(CallNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Call, CallNode, Expr);
+
+/*!
+ * \brief Let binding that binds a local var and optionally a type annotation.
+ *
+ * \note Let is useful to transform the program to be A-normal form.
+ *  where each of the expression corresponds to a let binding.
+ *
+ *  For developers who are familar with the computational graph.
+ *  Each of the let can be viewed as a operator node in the computational graph.
+ *  Traversing the list of let bindings is similar to running
+ * PostDFS-order(topo-order) traversal on the computational graph.
+ */
+class Let;
+/*! \brief A binding of a sub-network. */
+class LetNode : public ExprNode {
+ public:
+  /*! \brief The variable we bind to */
+  Var var;
+  /*! \brief The value we bind var to */
+  Expr value;
+  /*! \brief The body of the let binding */
+  Expr body;
+  /*! \brief Type annotation of value, this can be null */
+  Type value_type;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("var", &var);
+    v->Visit("value", &value);
+    v->Visit("body", &body);
+    v->Visit("value_type", &value_type);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Let make(Var var, Expr value, Expr body, Type value_type);
+
+  static constexpr const char* _type_key = "relay.Let";
+  TVM_DECLARE_NODE_TYPE_INFO(LetNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Let, LetNode, Expr);
+
+/*!
+ * \brief Condition expression
+ *
+ * Unlike traditional statement `if`s, the if evalutes
+ * to the result of the branch taken.
+ *
+ * let x = if (true) { 1 } else { 0 }; // x is 1
+ * let y = if (false) { 1 } else { 0 }; // y is 0
+ * 
+ * \note This is similar to C's ternary operator.
+ */
+class If;
+/*! \brief container of If */
+class IfNode : public ExprNode {
+ public:
+  /*! \brief The condition */
+  Expr cond;
+  /*! \brief The expression evaluated when condition is true. */
+  Expr true_branch;
+  /*! \brief The expression evaluated when condition is false */
+  Expr false_branch;
+
+  IfNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("cond", &cond);
+    v->Visit("true_branch", &true_branch);
+    v->Visit("false_branch", &false_branch);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static If make(Expr cond, Expr true_branch, Expr false_branch);
+
+  static constexpr const char* _type_key = "relay.If";
+  TVM_DECLARE_NODE_TYPE_INFO(IfNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(If, IfNode, Expr);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
new file mode 100644
index 000000000000..8ad0537ad68b
--- /dev/null
+++ b/include/tvm/relay/expr_functor.h
@@ -0,0 +1,170 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/expr_functor.h
+ * \brief A more powerful visitor which enables defining arbitrary function
+ * signatures with type based dispatch on first argument.
+ */
+#ifndef TVM_RELAY_EXPR_FUNCTOR_H_
+#define TVM_RELAY_EXPR_FUNCTOR_H_
+
+#include <tvm/ir_functor.h>
+#include <string>
+#include "./expr.h"
+#include "./op.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A dynamical functor that dispatches on in the first Expr argument.
+ *  You can use this as a more powerful Visitor, since it allows you to
+ *  define function signatures of Visit Function.
+ * 
+ * \sa tvm/ir_functor.h
+ *
+ * \tparam FType function signiture
+ *  This type is only defined for FType with function signature R(const Expr&,
+ * Args...)
+ */
+template <typename FType>
+class ExprFunctor;
+
+// functions to be overriden.
+#define EXPR_FUNCTOR_DEFAULT \
+  { return VisitExprDefault_(op, std::forward<Args>(args)...); }
+
+#define RELAY_EXPR_FUNCTOR_DISPATCH(OP)                                \
+  vtable.template set_dispatch<OP>(                                    \
+      [](const NodeRef& n, TSelf* self, Args... args) {                \
+        return self->VisitExpr_(static_cast<const OP*>(n.node_.get()), \
+                                std::forward<Args>(args)...);          \
+      });
+
+template <typename R, typename... Args>
+class ExprFunctor<R(const Expr& n, Args...)> {
+ private:
+  using TSelf = ExprFunctor<R(const Expr& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*! \brief virtual destructor */
+  virtual ~ExprFunctor() {}
+  /*!
+   * \brief Same as call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  R operator()(const Expr& n, Args... args) {
+    return VisitExpr(n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitExpr(const Expr& n, Args... args) {
+    static FType vtable = InitVTable();
+    return vtable(n, this, std::forward<Args>(args)...);
+  }
+  // Functions that can be overriden by subclass
+  virtual R VisitExpr_(const ConstantNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const TupleNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const VarNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const GlobalVarNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const ParamNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const FunctionNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const CallNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const LetNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const IfNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const OpNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExprDefault_(const Node* op, Args...) {
+    throw dmlc::Error(std::string("Do not have a default for ") + op->type_key());
+  }
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    FType vtable;
+    // Set dispatch
+    RELAY_EXPR_FUNCTOR_DISPATCH(ConstantNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(TupleNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(VarNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(GlobalVarNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(ParamNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(FunctionNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(CallNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(LetNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(IfNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(OpNode);
+    return vtable;
+  }
+};
+
+/*! \brief A simple visitor wrapper around ExprFunctor.
+ *
+ * Exposes two visitors with default traversal strategies, one
+ * which doesn't compute a result but can mutate internal state,
+ * and another which functionally builds a new Expr.
+ */
+
+class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
+ public:
+  void VisitExpr_(const VarNode* op) override;
+  void VisitExpr_(const GlobalVarNode* op) override;
+  void VisitExpr_(const ConstantNode* op) override;
+  void VisitExpr_(const TupleNode* op) override;
+  void VisitExpr_(const ParamNode* op) override;
+  void VisitExpr_(const FunctionNode* op) override;
+  void VisitExpr_(const CallNode* op) override;
+  void VisitExpr_(const LetNode* op) override;
+  void VisitExpr_(const IfNode* op) override;
+  void VisitExpr_(const OpNode* op) override;
+  virtual void VisitType(const Type& t);
+};
+
+/*! \brief A wrapper around ExprFunctor which functionally updates the AST.
+*
+* ExprMutator uses memoization and self return in order to amortize
+* the cost of using functional updates.
+*/
+class ExprMutator
+    : public ::tvm::relay::ExprFunctor<Expr(const Expr&, const Expr&)> {
+ public:
+  Expr Mutate(const Expr& expr);
+  Expr VisitExpr_(const VarNode* op, const Expr& e) override;
+  Expr VisitExpr_(const ConstantNode* op, const Expr& e) override;
+  Expr VisitExpr_(const GlobalVarNode* op, const Expr& e) override;
+  Expr VisitExpr_(const OpNode* op, const Expr& expr) override;
+  Expr VisitExpr_(const TupleNode* op, const Expr& e) override;
+  Expr VisitExpr_(const ParamNode* op, const Expr& e) override;
+  Expr VisitExpr_(const FunctionNode* op, const Expr& e) override;
+  Expr VisitExpr_(const CallNode* call_node, const Expr& e) override;
+  Expr VisitExpr_(const LetNode* op, const Expr& e) override;
+  Expr VisitExpr_(const IfNode* op, const Expr& e) override;
+  /*! \brief Used to visit the types inside of expressions. 
+   *  
+   * Can be overloaded to transform the types in arbitrary
+   * ways, one way would be to define a sub-class of type
+   * visitor for types which transform them appropriately.
+   */ 
+  virtual Type VisitType(const Type& t);
+
+ private:
+  /*! \brief Internal map used for memoization. */
+  tvm::Map<Expr, Expr> memo_;
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_EXPR_FUNCTOR_H_
diff --git a/include/tvm/relay/logging.h b/include/tvm/relay/logging.h
new file mode 100644
index 000000000000..c53cd15ee72e
--- /dev/null
+++ b/include/tvm/relay/logging.h
@@ -0,0 +1,33 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/logging.h
+ * \brief A wrapper around dmlc-core/logging.h which adds the ability
+ * to toggle logging via an environment variable.
+ */
+
+#ifndef TVM_RELAY_LOGGING_H_
+#define TVM_RELAY_LOGGING_H_
+
+#include <dmlc/logging.h>
+#include <string>
+#include <cstdlib>
+#include <iostream>
+
+namespace tvm {
+namespace relay {
+
+static bool logging_enabled() {
+  if (auto var = std::getenv("RELAY_LOG")) {
+    std::string is_on(var);
+    return is_on == "1";
+  } else {
+      return false;
+  }
+}
+
+#define RELAY_LOG(severity) LOG_IF(severity, logging_enabled())
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_LOGGING_H_
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
new file mode 100644
index 000000000000..49661fec5731
--- /dev/null
+++ b/include/tvm/relay/op.h
@@ -0,0 +1,469 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/op.h
+ * \brief Primitive operator definition.
+ */
+#ifndef TVM_RELAY_OP_H_
+#define TVM_RELAY_OP_H_
+
+#include <functional>
+#include <limits>
+#include <string>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
+#include "../attrs.h"
+#include "./base.h"
+#include "./expr.h"
+#include "./type.h"
+
+namespace tvm {
+namespace relay {
+
+// forward declare name.
+template <typename ValueType>
+class OpMap;
+class GenericOpMap;
+class OpRegistry;
+
+/*!
+ * \brief Node container of operator structure.
+ */
+class OpNode : public relay::ExprNode {
+ public:
+  /*! \brief name of the operator */
+  std::string name;
+  /*! \brief the type of the operator */
+  mutable FuncType op_type;
+  /*!
+   * \brief detailed description of the operator
+   *  This can be used to generate docstring automatically for the operator.
+   */
+  std::string description;
+  /* \brief Information of input arguments to the operator */
+  Array<AttrFieldInfo> arguments;
+  /*!
+   * \brief The type key of the attribute field
+   *  This can be empty, in which case it defaults to
+   */
+  std::string attrs_type_key;
+  /*!
+   * \brief number of input arguments to the operator,
+   * -1 means it is variable length
+   */
+  int32_t num_inputs = -1;
+  /*!
+   * \brief support level of the operator,
+   *  The lower the more priority it contains.
+   *  This is in analogies to BLAS levels.
+   */
+  int32_t support_level = 10;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("op_type", &op_type);
+    v->Visit("description", &description);
+    v->Visit("arguments", &arguments);
+    v->Visit("attrs_type_key", &attrs_type_key);
+    v->Visit("num_inputs", &num_inputs);
+    v->Visit("support_level", &support_level);
+  }
+
+  static constexpr const char* _type_key = "relay.Op";
+  TVM_DECLARE_NODE_TYPE_INFO(OpNode, ExprNode);
+
+ private:
+  // friend class
+  friend class GenericOpMap;
+  friend class OpRegistry;
+  // Program internal unique index of operator.
+  // Used to help index the program.
+  uint32_t index_{0};
+};
+
+/*!
+ * \brief Operator reference class.
+ */
+class Op : public relay::Expr {
+ public:
+  /*! \brief default constructor  */
+  Op() {}
+  /*! \brief constructor from node pointer */
+  explicit Op(std::shared_ptr<Node> n) : Expr(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const OpNode* operator->() const;
+  /*!
+   * \brief Get additional registered attribute about operators.
+   *  If nothing has been registered, an empty OpMap will be returned.
+   * \param attr_name The name of the attribute.
+   * \return An OpMap of specified attr_name.
+   * \tparam ValueType The type of the attribute.
+   */
+  template <typename ValueType>
+  inline static OpMap<ValueType> GetAttr(const std::string& attr_name);
+  /*!
+   * \brief Get an Op for a given operator name.
+   *  Will raise an error if the op has not been registered.
+   * \param op_name Name of the operator.
+   * \return Pointer to a Op, valid throughout program lifetime.
+   */
+  TVM_DLL static const Op& Get(const std::string& op_name);
+
+  /*! \brief specify container node */
+  using ContainerType = OpNode;
+
+ private:
+  /*!
+   * \brief Get generic attrmap given attr name
+   * \param key The attribute key
+   * \return reference to GenericOpMap
+   */
+  TVM_DLL static const GenericOpMap& GetGenericAttr(const std::string& key);
+};
+
+/*! \brief Helper structure to register operators */
+class OpRegistry {
+ public:
+  /*! \return the operator */
+  const Op& op() const { return op_; }
+  /*!
+   * \brief setter function during registration
+   *  Set the description of operator
+   * \param descr the description string.
+   * \return reference to self.
+   */
+  inline OpRegistry& describe(const std::string& descr);  // NOLINT(*)
+  /*!
+   * \brief Add argument information to the function.
+   * \param name Name of the argument.
+   * \param type Type of the argument.
+   * \param description Description of the argument.
+   * \return reference to self.
+   */
+  inline OpRegistry& add_argument(const std::string& name,
+                                  const std::string& type,
+                                  const std::string& description);
+  /*!
+   * \brief Attach the type function corresponding to the return type.
+   * \param rel_name The type relation name to register. 
+   * \param type_rel_func The backing relation function which can solve an arbitrary
+   * relation on variables. 
+   * \return reference to self.
+   */
+  inline OpRegistry& add_type_rel(
+      const std::string& rel_name,
+      std::function<Array<Type>(const Array<Type>&, int)> type_rel_func);
+
+  /*!
+   * \brief Set the type key of attributes.
+   * \param type_key The type of of the attrs field.x
+   * \return reference to self.
+   */
+  inline OpRegistry& set_attrs_type_key(const std::string& type_key);
+  /*!
+   * \brief Set the num_inputs
+   * \param n The number of inputs to be set.
+   * \return reference to self.
+   */
+  inline OpRegistry& set_num_inputs(int32_t n);  // NOLINT(*)
+  /*!
+   * \brief Set the support level of op.
+   * \param level The support level.
+   * \return reference to self.
+   */
+  inline OpRegistry& set_support_level(int32_t level);  // NOLINT(*)
+  /*!
+   * \brief Register additional attributes to operator.
+   * \param attr_name The name of the attribute.
+   * \param value The value to be set.
+   * \param plevel The priority level of this set,
+   *  an higher priority level attribute
+   *  will replace lower priority level attribute.
+   *  Must be bigger than 0.
+   *
+   *  Cannot set with same plevel twice in the code.
+   *
+   * \tparam ValueType The type of the value to be set.
+   */
+  template <typename ValueType>
+  inline OpRegistry& set_attr(const std::string& attr_name,  // NOLINT(*)
+                              const ValueType& value, int plevel = 10);
+
+  // set the name of the op to be the same as registry
+  inline OpRegistry& set_name() {  // NOLINT(*)
+    if (get()->name.length() == 0) {
+      get()->name = name;
+    }
+    return *this;
+  }
+  /*! \return The global single registry */
+  TVM_DLL static ::dmlc::Registry<OpRegistry>* Registry();
+
+ private:
+  friend class ::dmlc::Registry<OpRegistry>;
+  // the name
+  std::string name;
+  /*! \brief The operator */
+  Op op_;
+  // private constructor
+  OpRegistry();
+  // return internal pointer to op.
+  inline OpNode* get();
+  // update the attribute OpMap
+  TVM_DLL void UpdateAttr(const std::string& key, TVMRetValue value,
+                          int plevel);
+};
+
+/*!
+ * \brief Generic map to store additional information of Op.
+ */
+class GenericOpMap {
+ public:
+  /*!
+   * \brief Check if the map has op as key.
+   * \param op The key to the map
+   * \return 1 if op is contained in map, 0 otherwise.
+   */
+  inline int count(const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op
+   * \param op The key to the map
+   * \return the const reference to the content value.
+   */
+  inline const TVMRetValue& operator[](const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param op The key to the map
+   * \param def_value The default value when the key does not exist.
+   * \return the const reference to the content value.
+   * \tparam ValueType The content value type.
+   */
+  template <typename ValueType>
+  inline ValueType get(const Op& op, ValueType def_value) const;
+
+ private:
+  friend class OpRegistry;
+  // the attribute field.
+  std::string attr_name_;
+  // internal data
+  std::vector<std::pair<TVMRetValue, int> > data_;
+  // The value
+  GenericOpMap() = default;
+};
+
+/*!
+ * \brief Map<Op,ValueType> used to store meta-information about Op.
+ * \tparam ValueType The type of the value stored in map.
+ */
+template <typename ValueType>
+class OpMap {
+ public:
+  /*!
+   * \brief Check if the map has op as key.
+   * \param op The key to the map
+   * \return 1 if op is contained in map, 0 otherwise.
+   */
+  inline int count(const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op
+   * \param op The key to the map
+   * \return the const reference to the content value.
+   */
+  inline ValueType operator[](const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param op The key to the map
+   * \param def_value The default value when the key does not exist.
+   * \return the const reference to the content value.
+   */
+  inline ValueType get(const Op& op, ValueType def_value) const;
+
+ private:
+  friend class Op;
+  // constructor
+  explicit OpMap(const GenericOpMap& map) : map_(map) {}
+  /*! \brief The internal map field */
+  const GenericOpMap& map_;
+};
+
+// internal macros to make
+#define RELAY_REGISTER_VAR_DEF \
+  static DMLC_ATTRIBUTE_UNUSED ::tvm::relay::OpRegistry& __make_##RelayOp
+
+/*!
+ * \def RELAY_REGISTER_OP
+ * \brief Register a new operator, or set attribute of the corresponding op.
+ *
+ * \param OpName The name of registry
+ *
+ * \code
+ *
+ *  RELAY_REGISTER_OP("add")
+ *  .describe("add two inputs together")
+ *  .set_num_inputs(2)
+ *  .set_attr<OpKernel>("gpu_kernel", AddKernel);
+ *
+ * \endcode
+ */
+#define RELAY_REGISTER_OP(OpName)                        \
+  DMLC_STR_CONCAT(RELAY_REGISTER_VAR_DEF, __COUNTER__) = \
+      ::tvm::relay::OpRegistry::Registry()               \
+          ->__REGISTER_OR_GET__(OpName)                  \
+          .set_name()
+
+// implementations
+inline const OpNode* Op::operator->() const {
+  return static_cast<const OpNode*>(node_.get());
+}
+
+template <typename ValueType>
+inline OpMap<ValueType> Op::GetAttr(const std::string& key) {
+  return OpMap<ValueType>(Op::GetGenericAttr(key));
+}
+
+inline OpNode* OpRegistry::get() {
+  return const_cast<OpNode*>(op_.operator->());
+}
+
+inline OpRegistry& OpRegistry::describe(
+    const std::string& descr) {  // NOLINT(*)
+  get()->description = descr;
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::add_argument(const std::string& name,
+                                            const std::string& type,
+                                            const std::string& description) {
+  std::shared_ptr<AttrFieldInfoNode> n = std::make_shared<AttrFieldInfoNode>();
+  n->name = name;
+  n->type_info = type;
+  n->description = description;
+  get()->arguments.push_back(AttrFieldInfo(n));
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::add_type_rel(
+    const std::string& rel_name,
+    std::function<Array<Type>(const Array<Type>&, int)> type_rel_func) {
+  auto func_name = std::string("tvm.relay.type_relation.") + rel_name;
+
+  TypedEnvFunc<Array<Type>(const Array<Type>&, int)> env_type_rel_func;
+
+  if (runtime::Registry::Get(func_name)) {
+    auto env_func = EnvFunc::Get(func_name);
+    env_type_rel_func = env_func;
+  } else {
+    runtime::Registry::Register(func_name)
+        .set_body_typed<Array<Type>(const Array<Type>&, int)>(type_rel_func);
+    auto env_func = EnvFunc::Get(func_name);
+    env_type_rel_func = env_func;
+  }
+
+  std::vector<TypeParam> type_params;
+  std::vector<Type> arg_types;
+
+  // Add inputs.
+  std::string input_name_prefix = "in";
+  for (int i = 0; i < get()->num_inputs; i++) {
+    auto name = input_name_prefix + std::to_string(i);
+    auto param = TypeParamNode::make(name, TypeParamNode::Kind::kType);
+    type_params.push_back(param);
+    arg_types.push_back(param);
+  }
+
+  auto ty_call_args = Array<Type>(arg_types);
+
+  // Add output type.
+  auto out_param = TypeParamNode::make("out", TypeParamNode::Kind::kType);
+  type_params.push_back(out_param);
+  ty_call_args.push_back(out_param);
+
+  TypeConstraint type_rel =
+      TypeRelationNode::make(rel_name, env_type_rel_func, ty_call_args);
+
+  auto func_type =
+      FuncTypeNode::make(arg_types, out_param, type_params, {type_rel});
+
+  get()->op_type = func_type;
+
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_num_inputs(int32_t n) {  // NOLINT(*)
+  get()->num_inputs = n;
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_attrs_type_key(  // NOLINT(*)
+    const std::string& type_key) {
+  get()->attrs_type_key = type_key;
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_support_level(int32_t n) {  // NOLINT(*)
+  get()->support_level = n;
+  return *this;
+}
+
+template <typename ValueType>
+inline OpRegistry& OpRegistry::set_attr(  // NOLINT(*)
+    const std::string& attr_name, const ValueType& value, int plevel) {
+  CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
+  TVMRetValue rv;
+  rv = value;
+  UpdateAttr(attr_name, rv, plevel);
+  return *this;
+}
+
+// member functions of OpMap
+inline int GenericOpMap::count(const Op& op) const {
+  if (op.defined()) {
+    const uint32_t idx = op->index_;
+    return idx < data_.size() ? (data_[idx].second != 0) : 0;
+  } else {
+    return 0;
+  }
+}
+
+inline const TVMRetValue& GenericOpMap::operator[](const Op& op) const {
+  CHECK(op.defined());
+  const uint32_t idx = op->index_;
+  CHECK(idx < data_.size() && data_[idx].second != 0)
+      << "Attribute " << attr_name_ << " has not been registered for Operator "
+      << op->name;
+  return data_[idx].first;
+}
+
+template <typename ValueType>
+inline ValueType GenericOpMap::get(const Op& op, ValueType value) const {
+  CHECK(op.defined());
+  const uint32_t idx = op->index_;
+  if (idx < data_.size() && data_[idx].second != 0) {
+    return data_[idx].first;
+  } else {
+    return value;
+  }
+}
+
+template <typename ValueType>
+inline int OpMap<ValueType>::count(const Op& op) const {
+  return map_.count(op);
+}
+
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::operator[](const Op& op) const {
+  return map_[op];
+}
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::get(const Op& op,
+                                       ValueType def_value) const {
+  return map_.get<ValueType>(op, def_value);
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_H_
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
new file mode 100644
index 000000000000..e956097780bb
--- /dev/null
+++ b/include/tvm/relay/pass.h
@@ -0,0 +1,85 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/pass.h
+ * \brief The set of Relay passes written in C++.
+ */
+#ifndef TVM_RELAY_PASS_H_
+#define TVM_RELAY_PASS_H_
+
+#include <tvm/relay/environment.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Infer the type of an expression with the provided environment.
+ *
+ * The result of type checking is a new expression with unambigous
+ * type information filled in, as well as it's checked type field
+ * populated with the result type.
+ *
+ * \param env The environment used for global settings and referencing
+ * global functions.
+ *
+ * \param e The expression to type check.
+ *
+ * \return A type checked expression with its checked_type field populated.
+ */
+Expr InferType(const Environment& env, const Expr& e);
+Expr InferType(const Environment& env, const GlobalVar& v, const Function& e);
+
+/*!
+ * \brief Check that types are well formed by applying "kinding rules".
+ *
+ * This pass ensures we do not do things that violate the design of the
+ * type system when writing down types.
+ *
+ * For example tensors are not allowed to contain functions in Relay.
+ *
+ * We check this by ensuring the `dtype` field of a Tensor always contains
+ * a data type such as `int`, `float`, `uint`.
+ *
+ * \param env The global environment.
+ * \param t The type to check.
+ * \return true if the rules are satisified otherwise false
+ */
+bool KindCheck(const Environment& env, const Type& t);
+
+/*! \brief Compare two expressions for structural equivalence.
+ *
+ * This comparison operator respects scoping and compares
+ * expressions without regard to variable choice.
+ *
+ * For example: `let x = 1 in x` is equal to `let y = 1 in y`.
+ *
+ *   See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence
+ *   for more details.
+ *
+ *   \param e1 The left hand expression.
+ *   \param e2 The right hand expression.
+ *
+ *   \return true if equal, otherwise false
+ */
+bool AlphaEqual(const Expr& e1, const Expr& e2);
+
+/*! \brief Compare two types for structural equivalence.
+ *
+ * This comparison operator respects scoping and compares
+ * expressions without regard to variable choice.
+ *
+ * For example: `forall s, Tensor[f32, s]` is equal to
+ * `forall w, Tensor[f32, w]`.
+ * 
+ * See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence
+ * for more details.
+ *
+ * \param t1 The left hand type.
+ * \param t2 The right hand type.
+ *
+ * \return true if equal, otherwise false
+ */
+bool AlphaEqual(const Type& t1, const Type& t2);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_H_
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
new file mode 100644
index 000000000000..44030ad8d97f
--- /dev/null
+++ b/include/tvm/relay/type.h
@@ -0,0 +1,276 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/type.h
+ * \brief Relay typed AST nodes.
+ */
+#ifndef TVM_RELAY_TYPE_H_
+#define TVM_RELAY_TYPE_H_
+
+#include <tvm/api_registry.h>
+#include <tvm/ir.h>
+#include <tvm/node.h>
+#include <string>
+
+#include "./base.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Base type of the Relay type hiearchy. */
+class TypeNode : public RelayNode {
+ public:
+  static constexpr const char* _type_key = "relay.Type";
+  TVM_DECLARE_BASE_NODE_INFO(TypeNode, Node);
+};
+
+/*!
+ * \brief Type is the base type of relay type hiearchy.
+ *
+ * Relay's type system contains following two key concepts:
+ *
+ * - TensorType: type of certain Tensor values in the expression.
+ * - FunctionType: the type of the function.
+ *
+ * There are also advanced types to support generic(polymorphic types),
+ * which can be ignored when first reading the code base.
+ */
+class Type : public NodeRef {
+ public:
+  Type() {}
+  explicit Type(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+
+  using ContainerType = TypeNode;
+};
+
+/*!
+ * \brief Base of all Tensor types
+ *  This container can hold TensorType or GenericTensorType.
+ */
+class BaseTensorTypeNode : public TypeNode {
+ public:
+  static constexpr const char* _type_key = "relay.BaseTensorType";
+  TVM_DECLARE_BASE_NODE_INFO(BaseTensorTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(BaseTensorType, BaseTensorTypeNode, Type);
+
+/*!
+ * \brief This is the most commonly used type in relay.
+ *  TensorType have a fixed dimension, data type.
+ *
+ *  The elements of shape can be either IntImm(constant integer),
+ *  or any symbolic integer expression.
+ *  The symbolic integer allows generic shape inference in certain cases.
+ * \sa TensorTypeNode The container class of TensorType.
+ */
+class TensorType;
+/*! \brief TensorType container node */
+class TensorTypeNode : public BaseTensorTypeNode {
+ public:
+  /*!
+   * \brief The shape of the tensor,
+   *  represented by ShapeExpr(tvm::Expr).
+   */
+  Array<ShapeExpr> shape;
+  /*! \brief The content data type */
+  DataType dtype;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("shape", &shape);
+    v->Visit("dtype", &dtype);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static TensorType make(Array<ShapeExpr> shape, DataType dtype);
+
+  /*! \brief Construct an scalar containing elements of dtype.  */
+  TVM_DLL static TensorType Scalar(DataType dtype);
+
+  static constexpr const char* _type_key = "relay.TensorType";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorTypeNode, BaseTensorTypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TensorType, TensorTypeNode, Type);
+
+/*!
+ * \brief Type parameter in the function.
+ *  This can be viewed as template parameter in c++ template function.
+ *
+ * For example, in the following pesudo code,
+ * the TypeParam of f is TypeParam(kind=kShapeVar, var=n).
+ * This function can take in a Tensor with shape=(3, 3) and
+ * returns a Tensor with shape=(9,)
+ *
+ * \code
+ *
+ *  template<i32 n>
+ *  f(x : Tensor[i32, (n, n)]) -> Tensor[i32, (n * n)]
+ *
+ * \endcode
+ * \sa TypeParamNode The actual container class of TypeParam
+ */
+class TypeParam;
+/*! \brief TypeParam container node */
+class TypeParamNode : public TypeNode {
+ public:
+  /*! \brief possible kinds of TypeParam */
+  enum Kind : int {
+    /*! \brief template variable in shape expression */
+    kShapeVar = 0,
+    kShape = 1,
+    kBaseType = 2,
+    kType = 3
+  };
+  /*!
+   * \brief The variable itself is only meaningful when
+   *  kind is ShapeVar, otherwise, we only use the name.
+   */
+  tvm::Var var;
+  /*! \brief The kind of type parameter */
+  Kind kind;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("var", &var);
+    v->Visit("kind", &kind);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static TypeParam make(std::string name, Kind kind);
+
+  static constexpr const char* _type_key = "relay.TypeParam";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeParamNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeParam, TypeParamNode, Type);
+
+/*!
+ * \brief Potential Constraints in the type.
+ * \note This is reserved for future use.
+ */
+class TypeConstraint;
+/*! \brief TypeConstraint container node. */
+class TypeConstraintNode : public TypeNode {
+ public:
+  static constexpr const char* _type_key = "relay.TypeConstraint";
+  TVM_DECLARE_BASE_NODE_INFO(TypeConstraintNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeConstraint, TypeConstraintNode, Type);
+
+class FuncType;
+/*!
+ * \brief Function type in Relay.
+ *
+ * Relay support polymorphic function type.
+ * This can be roughly viewed as template function in C++.
+ *
+ * \sa TypeParam, TypeConstraint
+ */
+class FuncTypeNode : public TypeNode {
+ public:
+  /*! \brief type type of arguments */
+  tvm::Array<Type> arg_types;
+  /*! \brief The type of return value. */
+  Type ret_type;
+  // The following fields are used in polymorphic(template) functions
+  // For normal functions, the following two fields will be empty.
+  /*! \brief The type parameters of the function */
+  tvm::Array<TypeParam> type_params;
+  /*!
+   * \brief potential constraint the type need to obey
+   * \note this field is reserved for futher purposes.
+   */
+  tvm::Array<TypeConstraint> type_constraints;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("arg_types", &arg_types);
+    v->Visit("ret_type", &ret_type);
+    v->Visit("type_params", &type_params);
+    v->Visit("type_constraints", &type_constraints);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static FuncType make(tvm::Array<Type> arg_types, Type ret_type,
+                               tvm::Array<TypeParam> type_params,
+                               tvm::Array<TypeConstraint> type_constraints);
+
+  static constexpr const char* _type_key = "relay.FuncType";
+  TVM_DECLARE_NODE_TYPE_INFO(FuncTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(FuncType, FuncTypeNode, Type);
+
+using TypeRelationFn =
+    TypedEnvFunc<Array<Type>(const Array<Type>&, int)>;
+
+/*!
+ * \brief Opaque type relation, is an input-output relation on types.
+ */
+class TypeRelation;
+/*!
+ * \brief TypeRelation container.
+ * \note This node is not directly serializable.
+ * The type function need to be lookedup in the environment.
+ */
+class TypeRelationNode : public TypeConstraintNode {
+ public:
+  /*! \brief The name of the function */
+  std::string name;
+
+  /*!
+   * \brief The function on input and output variables which
+   *  this is not directly serializable,
+   *  need to be looked-up in the environment.
+   */
+  TypeRelationFn func_;
+
+  /*! \brief The type arguments to the type function. */
+  tvm::Array<Type> args;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name", &name);
+  }
+
+  TVM_DLL static TypeRelation make(std::string name, TypeRelationFn func_, Array<Type> args);
+
+  static constexpr const char* _type_key = "relay.TypeRelation";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeRelationNode, TypeConstraintNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeRelation, TypeRelationNode, TypeConstraint);
+
+/*!
+ * \brief The type of tuple values.
+ */
+class TupleType;
+/*!
+ * \brief TupleType container.
+ */
+class TupleTypeNode : public TypeNode {
+ public:
+  /*! \brief The type of each field in the tuple. */
+  tvm::Array<Type> fields;
+
+  TupleTypeNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
+
+  TVM_DLL static TupleType make(tvm::Array<Type> fields);
+
+  static constexpr const char* _type_key = "relay.TypeTuple";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleType, TupleTypeNode, Type);
+
+// The following fields contains advanced typing
+// Only keep the class name and reserved for future usage.
+class GenericTensorType;
+// stores a DataType.
+class GenericDataType;
+// stores a DataType.
+class GenericShape;
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_TYPE_H_
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
new file mode 100644
index 000000000000..18a53be92815
--- /dev/null
+++ b/python/tvm/relay/__init__.py
@@ -0,0 +1,34 @@
+# pylint: disable=wildcard-import
+"""The Relay IR namespace containing the IR definition and compiler."""
+from . import base
+from . import ty
+from . import expr
+from . import env
+from . import ir_pass
+from . import ir_builder
+# Operators
+from .op import Op
+from .op.tensor import *
+
+# Span
+Span = base.Span
+
+# Type
+Type = ty.Type
+TensorType = ty.TensorType
+Kind = ty.Kind
+TypeParam = ty.TypeParam
+TypeConstraint = ty.TypeConstraint
+FuncType = ty.FuncType
+
+# Expr
+Constant = expr.Constant
+Tuple = expr.Tuple
+Var = expr.Var
+GlobalVar = expr.GlobalVar
+Param = expr.Param
+Function = expr.Function
+Call = expr.Call
+Let = expr.Let
+If = expr.If
+Var = Var
diff --git a/python/tvm/relay/_env.py b/python/tvm/relay/_env.py
new file mode 100644
index 000000000000..25b8715a7816
--- /dev/null
+++ b/python/tvm/relay/_env.py
@@ -0,0 +1,5 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
+"""The interface to the Environment exposed from C++."""
+from tvm._ffi.function import _init_api
+
+_init_api("relay._env", __name__)
diff --git a/python/tvm/relay/_env.pyi b/python/tvm/relay/_env.pyi
new file mode 100644
index 000000000000..c6b5d0f6c4bd
--- /dev/null
+++ b/python/tvm/relay/_env.pyi
@@ -0,0 +1,5 @@
+from typing import Union, Tuple, Dict, List
+from relay.ir import GlobalId, OperatorId, Item, NodeBase, Span, FileId
+from relay.ir import ShapeExtension, Operator, Defn
+
+class Environment(NodeBase): ...
\ No newline at end of file
diff --git a/python/tvm/relay/_ir_pass.py b/python/tvm/relay/_ir_pass.py
new file mode 100644
index 000000000000..61fdcfa38c2f
--- /dev/null
+++ b/python/tvm/relay/_ir_pass.py
@@ -0,0 +1,5 @@
+"""FFI exposing the Relay type inference and checking."""
+
+from tvm._ffi.function import _init_api
+
+_init_api("relay._ir_pass", __name__)
diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi
new file mode 100644
index 000000000000..1bb42ab854c2
--- /dev/null
+++ b/python/tvm/relay/_ir_pass.pyi
@@ -0,0 +1,6 @@
+from .env import Environment
+from . import ir
+
+def check_expr(env: Environment, expr: ir.Expr) -> ir.Type: ...
+def generalize(env: Environment, expr: ir.Expr) -> ir.Expr: ...
+def _get_checked_type(expr: ir.Expr) -> ir.Type: ...
diff --git a/python/tvm/relay/_make.py b/python/tvm/relay/_make.py
new file mode 100644
index 000000000000..20a582e76d6a
--- /dev/null
+++ b/python/tvm/relay/_make.py
@@ -0,0 +1,9 @@
+"""
+The constructors for all Relay AST nodes exposed from C++.
+
+This module includes MyPy type signatures for all of the
+exposed modules.
+"""
+from .._ffi.function import _init_api
+
+_init_api("relay._make", __name__)
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
new file mode 100644
index 000000000000..d683c96739cd
--- /dev/null
+++ b/python/tvm/relay/base.py
@@ -0,0 +1,26 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck
+"""The base node types for the Relay language."""
+from __future__ import absolute_import as _abs
+from .._ffi.node import NodeBase, register_node as _register_tvm_node
+from . import _make
+
+NodeBase = NodeBase
+
+def register_relay_node(type_key=None):
+    """register relay node type
+
+    Parameters
+    ----------
+    type_key : str or cls
+        The type key of the node
+    """
+    if not isinstance(type_key, str):
+        return _register_tvm_node(
+            "relay." + type_key.__name__)(type_key)
+    return _register_tvm_node(type_key)
+
+
+@register_relay_node
+class Span(NodeBase):
+    def __init__(self, source, lineno, col_offset):
+        self.__init_handle_by_constructor__(_make.Span, source, lineno, col_offset)
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/env.py
new file mode 100644
index 000000000000..62afef76425a
--- /dev/null
+++ b/python/tvm/relay/env.py
@@ -0,0 +1,84 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, wildcard-import
+"""A global environment storing everything needed to interpret or compile a Relay program."""
+from .base import register_relay_node, NodeBase
+from . import _make
+from . import _env
+
+
+@register_relay_node
+class Environment(NodeBase):
+    """The global Relay environment containing functions,
+    options and more.
+    """
+
+    def __init__(self, funcs):
+        """Construct an environment.
+
+        Parameters
+        ------
+        funcs: list of relay.Function
+
+        Returns
+        ------
+        env: A new environment containing :py:class:`~relay.env.Environment`.
+        """
+        self.__init_handle_by_constructor__(_make.Environment, funcs)
+
+    def add(self, var, func):
+        """Add a function to the environment.
+
+        Parameters
+        ---------
+        var: GlobalVar
+            The global variable which names the function.
+
+        func: Function
+            The function.
+        """
+        if isinstance(var, str):
+            var = _env.Environment_GetGlobalVar(self, var)
+
+        _env.Environment_Add(self, var, func)
+
+    def merge(self, other):
+        """Merge two environments.
+
+        Parameters
+        ----------
+        other: Environment
+            The environment to merge into the current Environment.
+        """
+        return _env.Environment_Merge(self, other)
+
+    def global_var(self, name):
+        """Get a global variable by name.
+
+        Parameters
+        ----------
+        name: str
+            The name of the global variable.
+
+        Returns
+        -------
+            global_var: GlobalVar
+                The global variable mapped to :code:`name`.
+        """
+        return _env.Environment_GetGlobalVar(self, name)
+
+    def __getitem__(self, var):
+        """Lookup a global function by name or by variable.
+
+        Parameters
+        ----------
+        var: str or GlobalVar
+            The name or global variable.
+
+        Returns
+        -------
+            func: Function
+                The function referenced by :code:`var`.
+        """
+        if isinstance(var, str):
+            return _env.Environment_Lookup_str(self, var)
+        else:
+            return _env.Environment_Lookup(self, var)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
new file mode 100644
index 000000000000..3bddbc89b56e
--- /dev/null
+++ b/python/tvm/relay/expr.py
@@ -0,0 +1,115 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The expression nodes of Relay."""
+from __future__ import absolute_import
+from .base import NodeBase, register_relay_node
+from ._ir_pass import _get_checked_type
+from . import _make
+from .. import convert
+
+
+class Expr(NodeBase):
+    """The base type for all Relay expressions."""
+
+    def checked_type(self):
+        return _get_checked_type(self)
+
+    def __call__(self, *args):
+        converted_args = []
+        for arg in args:
+            if isinstance(arg, Param):
+                converted_args.append(arg.var)
+            else:
+                converted_args.append(arg)
+
+        return Call(self, args, None, None)
+
+
+@register_relay_node
+class Constant(Expr):
+    """A constant tensor in Relay, see tvm/relay/type.h for more details.
+    """
+
+    def __init__(self, data):
+        self.__init_handle_by_constructor__(_make.Constant, data)
+
+
+@register_relay_node
+class Tuple(Expr):
+    """A hetereogenous sequence of values.
+       see tvm/relay/type.h for more details.
+    """
+
+    def __init__(self, fields):
+        self.__init_handle_by_constructor__(_make.Tuple, fields)
+
+
+@register_relay_node
+class Var(Expr):
+    """A local variable in Relay."""
+
+    def __init__(self, name_hint):
+        self.__init_handle_by_constructor__(_make.Var, name_hint)
+
+
+@register_relay_node
+class GlobalVar(Expr):
+    """A global variable in Relay."""
+
+    def __init__(self, name_hint):
+        self.__init_handle_by_constructor__(_make.GlobalVar, name_hint)
+
+
+@register_relay_node
+class Param(Expr):
+    """A function type in Relay, see tvm/relay/type.h for more details.
+    """
+
+    def __init__(self, var, ty):
+        self.__init_handle_by_constructor__(_make.Param, var, ty)
+
+
+@register_relay_node
+class Function(Expr):
+    """A function in Relay, see tvm/relay/expr.h for more details."""
+
+    def __init__(self,
+                 params,
+                 ret_type,
+                 body,
+                 type_params=None
+                ):
+        if type_params is None:
+            type_params = convert([])
+
+        self.__init_handle_by_constructor__(
+            _make.Function, params, ret_type, body, type_params)
+
+
+@register_relay_node
+class Call(Expr):
+    """A function call in Relay, see tvm/relay/expr.h for more details."""
+
+    def __init__(self, op, args, attrs, ty_args=None):
+        if not ty_args:
+            ty_args = []
+
+        self.__init_handle_by_constructor__(
+            _make.Call, op, args, attrs, ty_args)
+
+
+@register_relay_node
+class Let(Expr):
+    """A variable bindings in Relay, see tvm/relay/expr.h for more details."""
+
+    def __init__(self, var, value, body, value_type):
+        self.__init_handle_by_constructor__(
+            _make.Let, var, value, body, value_type)
+
+
+@register_relay_node
+class If(Expr):
+    """A conditional expression in Relay, see tvm/relay/expr.h for more details."""
+
+    def __init__(self, cond, true_value, false_value):
+        self.__init_handle_by_constructor__(
+            _make.If, cond, true_value, false_value)
diff --git a/python/tvm/relay/expr.pyi b/python/tvm/relay/expr.pyi
new file mode 100644
index 000000000000..fd30e3ed25cf
--- /dev/null
+++ b/python/tvm/relay/expr.pyi
@@ -0,0 +1,114 @@
+from typing import List
+import tvm
+from .base import Span, NodeBase
+from .ty import Type, TypeParam
+from ._ir_pass import _get_checked_type
+
+
+class Expr(NodeBase):
+    def checked_type(self):
+        ...
+
+    def __call__(self, *args):
+        ...
+
+
+class Constant(Expr):
+    data = ...  # type: tvm.nd.NDArray
+
+    def __init__(self, data):
+        # type: (tvm.nd.NDArray) -> None
+        ...
+
+
+class Tuple(Expr):
+    fields = ..  # type: List[Expr]
+
+    def __init__(self, fields):
+        # type: (List[Expr]) -> None
+        ...
+
+
+class Var(Expr):
+    """A local variable in Relay."""
+    name_hint = ...  # type: str
+
+    def __init__(self, name_hint):
+        # type: (str) -> None
+        ...
+
+
+class GlobalVar(Expr):
+    name_hint = ...  # type: str
+
+    def __init__(self, name_hint):
+        # type: (str) -> None
+        ...
+
+
+class Param(Expr):
+    var = ...  # type: Var
+    type = ...  # type: Type
+
+    def __init__(self, var, ty):
+        # type: (Var, Type) -> None
+        ...
+
+
+class Function(Expr):
+    """A function in Relay, see tvm/relay/expr.h for more details."""
+    type_params = ...  # type: List[TypeParam]
+    params = ...  # type: List[Param]
+    ret_type = ...  # type: Type
+    body = ...  # type: Expr
+
+    def __init__(self,
+                 params,  # type: List[Param],
+                 ret_type,  # type: Type,
+                 body,  # type: Expr,
+                 type_params=None,  # type: List[TypeParam]
+                 ):
+        # type: (...) -> None
+        ...
+
+
+@register_relay_node
+class Call(Expr):
+    """A function call in Relay, see tvm/relay/expr.h for more details."""
+    op = ...  # type: Expr
+    args = ...  # type: List[Expr]
+    # todo(@jroesch): add attrs
+
+    def __init__(self, op, args, attrs, ty_args=None):
+        # type: (Expr, List[Expr], Optional[List[Type]]) -> None
+        if not ty_args:
+            ty_args = []
+
+        self.__init_handle_by_constructor__(
+            _make.Call, op, args, attrs, ty_args)
+
+
+@register_relay_node
+class Let(Expr):
+    """A variable bindings in Relay, see tvm/relay/expr.h for more details."""
+    var = ...  # type: Var
+    value = ...  # type: Expr
+    body = ...  # type: Expr
+    value_type = ...  # type: Type
+
+    def __init__(self, var, value, body, value_type):
+        # type: (Var, Expr, Expr, Type) -> None
+        ...
+
+
+@register_relay_node
+class If(Expr):
+    """A conditional expression in Relay, see tvm/relay/expr.h for more details."""
+    cond = ...  # type: Expr
+    true_value = ...  # type: Expr
+    false_value = ...  # type: Expr
+    span = ...  # type: Span
+
+    def __init__(self, cond, true_value, false_value):
+        # type: (Expr, Expr, Expr) -> None
+        ...
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
new file mode 100644
index 000000000000..6e52f209d0c6
--- /dev/null
+++ b/python/tvm/relay/ir_builder.py
@@ -0,0 +1,394 @@
+# pylint: disable=no-else-return
+"""IR builder for the Relay IR.
+
+Enables users to construct Relay programs with a Python API.
+"""
+from collections import OrderedDict
+import numpy as np
+import tvm
+from .ty import Type, FuncType, TensorType
+from .expr import Expr, Constant, Let, Var, Param, Function, If
+from .env import Environment
+
+
+def _convert_to_value(arg, ctxt=tvm.cpu(0)):
+    # type: (Any, tvm.Context) -> tvm.nd.NDArray
+    """Convert Python values into the appropriate types
+       for the Relay evaluator.
+    """
+    if isinstance(arg, int):
+        return tvm.nd.array(np.array(arg, dtype='int32'), ctxt)
+    elif isinstance(arg, float):
+        return tvm.nd.array(arg, ctxt)
+    elif isinstance(arg, bool):
+        return tvm.nd.array(np.array(arg, dtype='float32'), ctxt)
+    elif isinstance(arg, np.ndarray):
+        return tvm.nd.array(arg, ctxt)
+    elif isinstance(arg, tvm.ndarray.NDArray):
+        return arg
+    else:
+        # raise Exception(f"can't convert {type(arg)} to a Relay AST")
+        raise Exception("unsupported argument type {0}".format(type(arg)))
+
+
+def _convert_type(rtype):
+    if isinstance(rtype, str):
+        return scalar_type(rtype)
+    elif isinstance(rtype, Type):
+        return rtype
+    else:
+        raise Exception(
+            "unsupported conversion to Relay type {0}".format(type(rtype)))
+
+
+def convert(arg):
+    # type: (Any) -> Expr
+    """Convert some Python objects into a Relay AST fragment.
+
+    Parameters
+    ----------
+    arg: Any
+        The Python object
+
+    Returns
+    -------
+    expr: relay.Expr
+        The converted expression.
+    """
+    if isinstance(arg, Expr):
+        return arg
+    elif isinstance(arg, tuple):
+        return relay.Tuple([convert(el) for el in arg])
+    elif isinstance(arg, PartialFunc):
+        return arg.to_func()
+    else:
+        value = _convert_to_value(arg)
+        return Constant(value)
+
+
+class WithScope(object):
+    """A wrapper for builder methods which introduce scoping."""
+
+    def __init__(self, enter_value, exit_cb):
+        self._enter_value = enter_value
+        self._exit_cb = exit_cb
+
+    def __enter__(self):
+        return self._enter_value
+
+    def __exit__(self, ptype, value, trace):
+        if value:
+            raise value
+        else:
+            self._exit_cb()
+
+
+class PartialFunc(object):
+    """A wrapper around functions while they are being built.
+
+      Used by the builder as a user is building up a function,
+      allows Function nodes which contain partially initialized
+      state.
+    """
+
+    def __init__(self, params, ret_type, body, type_params):
+        self.params = params
+        self.ret_type = ret_type
+        self.body = body
+        self.type_params = type_params
+
+    def param_ids(self):
+        return [p.var for p in self.params]
+
+    def to_func(self):
+        """Converts a PartialFunc into a :py:class:`~relay.Function`."""
+        return Function(
+            self.params,
+            self.ret_type,
+            self.body,
+            self.type_params)
+
+#pylint: disable=invalid-name
+
+
+def _mk_let(bindings, ret_value):
+    let_expr = ret_value
+    for var, (value, ty) in reversed(list(bindings.items())):
+        let_expr = Let(var, value, let_expr, ty)
+
+    return let_expr
+
+
+class IRBuilder(object):
+    """The IRBuilder class.
+
+    Enables users to build up a Relay environment and program.
+
+    Examples
+    --------
+
+    Program:
+       fn (x : Tensor[f32, (10, 10)]) {
+         let t1 = log(x);
+         let t2 = add(t1, x);
+         return t1;
+       }
+
+    ..code-block: python
+        b = IRBuilder()
+        with b.function(('x', tensor_type(10, 10))) as func:
+            x, = func.param_ids()
+            t1 = b.let('t1', log(x))
+            t2 = b.let('t2', add(t1, x))
+            b.ret(t2)
+    """
+
+    def __init__(self):
+        self.bindings = [OrderedDict({})]
+        self.scopes = [OrderedDict({})]
+        self.params = []
+        self.ret_values = [None]
+        self.env = Environment({})
+
+    def enter_scope(self, params=None):
+        if not params:
+            params = []
+
+        self.bindings.append(OrderedDict({}))
+        self.scopes.append(OrderedDict({}))
+        self.params.append(params)
+        self.ret_values.append(None)
+
+    def exit_scope(self):
+        bindings = self.bindings.pop()
+        scopes = self.scopes.pop()
+        params = self.params.pop()
+        ret_value = self.ret_values.pop()
+        return bindings, scopes, params, ret_value
+
+    #pylint: disable=invalid-name
+    def bind(self, name, value, ty):
+        lv = Var(name)
+        self.scopes[-1][name] = lv
+        self.bindings[-1][lv] = (value, ty)
+        return lv
+
+    def let(self, name, value, value_type=None):
+        if isinstance(value, Param):
+            value = value.var
+
+        if not isinstance(value, Expr):
+            value = convert(value)
+
+        return self.bind(name, value, value_type)
+
+    def _convert_params(self, raw_params):
+        relay_params = []
+        for raw_param in raw_params:
+            if isinstance(raw_param, Param):
+                var = raw_param.var
+                param = raw_param
+            elif isinstance(raw_param, tuple):
+                var, ty = raw_param
+                if isinstance(var, str):
+                    var = Var(var)
+                ty = _convert_type(ty)
+                param = Param(var, ty)
+            elif isinstance(param, str):
+                var = Var(raw_param)
+                ty = None
+                param = Param(var, ty)
+            else:
+                raise Exception("unknown parameter type")
+
+            self.scopes[-1][var.name_hint] = var
+            relay_params.append(param)
+
+        return relay_params
+
+    def function(self, *params):
+        """Construct a Relay function."""
+
+        relay_params = self._convert_params(params)
+
+        self.enter_scope()
+
+        pfunc = PartialFunc(relay_params, None, None, [])
+
+        def _on_exit():
+            bindings, _, _, ret_value = self.exit_scope()
+            body = _mk_let(bindings, ret_value)
+            pfunc.body = body
+
+        return WithScope(pfunc, _on_exit)
+
+    def ret(self, x):
+        """Set `x` to be the return value of the current function."""
+        if not self.ret_values[-1]:
+            self.ret_values[-1] = convert(x)
+        else:
+            raise Exception(
+                "return value already set, a function can only have one return value")
+
+    def if_scope(self, cond):
+        """Construct the if branch an if expression with scoping."""
+        self.enter_scope()
+
+        def _on_exit():
+            bindings, _, _, ret_value = self.exit_scope()
+            assert self.ret_values[-1] is None
+            true_branch = _mk_let(bindings, ret_value)
+            self.ret_values[-1] = If(cond, true_branch, None)
+
+        return WithScope(10, _on_exit)
+
+    def else_scope(self):
+        """Construct the else branch of an if expression with scoping."""
+        self.enter_scope()
+
+        def _on_exit():
+            bindings, _, _, ret_value = self.exit_scope()
+            partial_if = self.ret_values[-1]
+            assert isinstance(
+                partial_if, If) and partial_if.false_branch is None
+            false_branch = _mk_let(bindings, ret_value)
+            self.ret_values[-1] = If(
+                partial_if.cond,
+                partial_if.true_branch,
+                false_branch)
+
+        return WithScope(10, _on_exit)
+
+    def param(self, name, ty=None):
+        if not ty:
+            ty = scalar_type('float32')
+        else:
+            ty = _convert_type(ty)
+
+        return Param(Var(name), ty)
+
+    def global_var(self, name):
+        # type: (str) -> GlobalVar
+        """Construct a global var with `name` as its name hint.
+
+        Parameters
+        ----------
+        name: str
+            The name of the global variable.
+
+        Returns
+        -------
+        global_var: relay.GlobalVar
+            The global variable with `name`.
+
+        """
+        return self.env.global_var(name)
+
+    def decl(self, name, *params, **kwargs):
+        """Create a global function.
+
+        Parameters
+        ----------
+        name: str or GlobalVar
+            The name of the function.
+        params: params
+            The parameters of the function.
+
+        Returns
+        -------
+        with_scope: Scope for the function.
+        """
+
+        ret_type = kwargs.get('ret_type', None)
+
+        self.enter_scope()
+
+        def _on_exit():
+            bindings, _, _, ret_value = self.exit_scope()
+            exp = _mk_let(bindings, ret_value)
+            self.env.add(name, Function(params, ret_type, exp))
+
+        return WithScope(10, _on_exit)
+
+    def get(self):
+        """Get the full program.
+
+        Returns
+        ----------
+        (prog, env) : (relay.Expr, relay.Environment)
+            A pair of the partial program, and the modified environment.
+        """
+        bindings = self.bindings.pop()
+        scope = self.scopes.pop()
+
+        if self.bindings:
+            raise Exception("IRBuilder: binding error")
+
+        if self.scopes:
+            raise Exception("IRBuilder: scoping error")
+
+        if bindings and scope and not self.ret_values:
+            raise Exception("IRBuilder: no return value set")
+
+        return _mk_let(bindings, self.ret_values[-1]), self.env
+
+
+def scalar_type(dtype):
+    """Construct a Relay scalar type.
+
+    Parameters
+    ----------
+    dtype: dtype
+        The dtype of the scalar type.
+
+    Returns:
+    scalar_type: relay.Type
+        The scalar type.
+    """
+    return TensorType(tvm.convert([]), dtype)
+
+
+def tensor_type(*shape, **kwargs):
+    """Construct a Relay Tensor type.
+
+    Parameters
+    ----------
+    shape: list of tvm.Expr
+        The shape of the Tensor type.
+    dtype: dtype
+        The dtype of the Tensor type.
+
+    Returns
+    -------
+    tensor_type: relay.Type
+        The resulting tensor types.
+    """
+    dtype = kwargs.get('dtype', 'float32')
+
+    return TensorType(tvm.convert(shape), dtype)
+
+
+def func_type(args, ret_type, type_params=None):
+    """Construct a Relay function type.
+
+    Parameters
+    ----------
+    args: list of relay.Type
+        The argument types.
+
+    ret_type: relay.Type
+        The return type.
+
+    type_params: list of relay.TypeParam
+        The type parameters.
+
+    Returns
+    -------
+    func_type: The function type.
+    """
+    if not type_params:
+        type_params = []
+
+    args = [_convert_type(arg) for arg in args]
+    ret_type = _convert_type(ret_type)
+    return FuncType(args, ret_type, type_params, [])
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
new file mode 100644
index 000000000000..bbc294b59f5b
--- /dev/null
+++ b/python/tvm/relay/ir_pass.py
@@ -0,0 +1,12 @@
+# pylint: disable=no-else-return,
+# pylint: disable=unidiomatic-typecheck
+"""The set of passes for Relay.
+
+Exposes an interface for configuring the passes and scripting
+them in Python.
+"""
+from . import _ir_pass
+
+# Expose checking expression, should rename to infer_type.
+# pylint: disable=invalid-name
+check_expr = _ir_pass.check_expr
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
new file mode 100644
index 000000000000..0646a8326db6
--- /dev/null
+++ b/python/tvm/relay/op/__init__.py
@@ -0,0 +1,12 @@
+#pylint: disable=wildcard-import
+"""Relay core operators."""
+# operator defs
+from .op import get, register, Op
+
+# Operators
+from .tensor import *
+
+# operator registry
+from . import _tensor
+from ..expr import Expr
+from ..base import register_relay_node
diff --git a/python/tvm/relay/op/_make.py b/python/tvm/relay/op/_make.py
new file mode 100644
index 000000000000..79c86cbb0254
--- /dev/null
+++ b/python/tvm/relay/op/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ..._ffi.function import _init_api
+
+_init_api("relay.op._make", __name__)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
new file mode 100644
index 000000000000..0bc2054cebdf
--- /dev/null
+++ b/python/tvm/relay/op/_tensor.py
@@ -0,0 +1,2 @@
+#pylint: disable=invalid-name
+"""Backend compiler related feature registration"""
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
new file mode 100644
index 000000000000..f1130b52e7ce
--- /dev/null
+++ b/python/tvm/relay/op/op.py
@@ -0,0 +1,77 @@
+"""The base node types for the Relay language."""
+from ..._ffi.function import _init_api
+
+from ..base import register_relay_node
+from ..expr import Expr
+
+
+@register_relay_node
+class Op(Expr):
+    """A Relay operator definition."""
+
+    def __init__(self):
+        raise RuntimeError("Cannot create op, use get instead")
+
+    def get_attr(self, attr_name):
+        """Get additional attribute about the operator.
+
+        Parameters
+        ----------
+        attr_name : str
+            The attribute name.
+
+        Returns
+        -------
+        value : object
+            The attribute value
+        """
+        return _OpGetAttr(self, attr_name)
+
+
+def get(op_name):
+    """Get the Op for a given name
+
+    Parameters
+    ----------
+    op_name : str
+        The operator name
+
+    Returns
+    -------
+    op : Op
+        The op of the corresponding name
+    """
+    return _GetOp(op_name)
+
+
+def register(op_name, attr_key, value=None, level=10):
+    """Register an operator property of an operator.
+
+
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+
+    attr_key : str
+        The attribute name.
+
+    value : object, optional
+        The value to set
+
+    level : int, optional
+        The priority level
+
+    Returns
+    -------
+    fregister : function
+        Register function if value is not specified.
+    """
+    def _register(v):
+        """internal register function"""
+        _Register(op_name, attr_key, v, level)
+        return v
+    return _register(value) if value else _register
+
+
+_init_api("relay.op", __name__)
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
new file mode 100644
index 000000000000..fa54d8b53dd8
--- /dev/null
+++ b/python/tvm/relay/op/tensor.py
@@ -0,0 +1,114 @@
+"""Basic tensor operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+from ..expr import Tuple
+
+# We create a wrapper function for each operator in the
+# python side to call into the positional _make.OpName function.
+#
+# We make this decision so that we can:
+# - Have declare python docstring for each function
+# - Enable keyword arguments easily
+# - Not put too much burden on FFI to support complicated features
+#   like default value and keyword arguments
+
+
+def log(data):
+    """Take log of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.log(data)
+
+
+def exp(data):
+    """Take exp of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.exp(data)
+
+
+def sqrt(data):
+    """Take sqrt of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.sqrt(data)
+
+
+def add(lhs, rhs):
+    """Elementwise addition.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.add(lhs, rhs)
+
+
+def subtract(lhs, rhs):
+    """Elementwise subtraction.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.add(lhs, rhs)
+
+def equal(lhs, rhs):
+    return _make.equal(lhs, rhs)
+
+def concat(*args):
+    """Concatenate the input tensors along the zero axis.
+
+    Parameters
+    ----------
+    args: list of Tensor
+
+    Returns
+    -------
+    tensor: The concatenated tensor.
+    """
+    tup = Tuple(list(args))
+    return _make.concat(tup)
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
new file mode 100644
index 000000000000..10e267a53977
--- /dev/null
+++ b/python/tvm/relay/ty.py
@@ -0,0 +1,138 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The type nodes of the Relay language."""
+from enum import IntEnum
+from .base import NodeBase, register_relay_node
+from . import _make
+
+
+class Type(NodeBase):
+    """The base type for all Relay types."""
+
+    def __eq__(self, other):
+        """Compare two Relay types for structural equivalence using
+           alpha equivalence.
+        """
+        return bool(_make._type_alpha_eq(self, other))
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def same_as(self, other):
+        """Compares two Relay types by referential equality."""
+        return super().__eq__(other)
+
+
+@register_relay_node
+class TensorType(Type):
+    """A concrete TensorType in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to tensor's with a known dype and shape. For
+    example a tensor of `float32` and `(5, 5)`.
+    """
+
+    def __init__(self, shape, dtype):
+        """Construct a tensor type.
+
+        Parameters
+        ----------
+        shape: list of tvm.Expr
+        dtype: str
+
+        Returns
+        -------
+        tensor_type: The TensorType
+        """
+        self.__init_handle_by_constructor__(_make.TensorType, shape, dtype)
+
+
+class Kind(IntEnum):
+    """The kind of a type parameter, represents a variable shape,
+       base type, type, or dimension.
+
+       This controls what a type parameter is allowed to be instantiated
+       with. For example one's of kind BaseType can only be `float32`, `int32`,
+       and so on.
+    """
+    ShapeVar = 0
+    Shape = 1
+    BaseType = 2
+    Type = 3
+
+
+@register_relay_node
+class TypeParam(Type):
+    """A type parameter used for generic types in Relay,
+    see tvm/relay/type.h for more details.
+
+    A type parameter represents a type placeholder which will
+    be filled in later on. This allows the user to write
+    functions which are generic over types.
+    """
+
+    def __init__(self, var, kind):
+        """Construct a TypeParam.
+
+        Parameters
+        ----------
+        var: tvm.expr.Var
+            The tvm.Var which backs the type parameter.
+
+        kind: Kind
+            The kind of the type parameter.
+
+        Returns
+        -------
+        type_param: TypeParam
+            The type parameter.
+        """
+        self.__init_handle_by_constructor__(_make.TypeParam, var, kind)
+
+
+@register_relay_node
+class TypeConstraint(Type):
+    """Abstract class representing a type constraint."""
+    pass
+
+
+@register_relay_node
+class FuncType(Type):
+    """A function type in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to functions in Relay. They consist of
+    a list of type parameters which enable the definition of generic
+    fucntions, a set of type constraints which we omit for the time
+    being, a sequence of argument types, and a return type.
+
+    We informally write them as:
+    `forall (type_params), (arg_types) -> ret_type where type_constraints`
+    """
+    def __init__(self,
+                 arg_types,
+                 ret_type,
+                 type_params,
+                 type_constraints
+                ):
+        """Construct a function type.
+
+        Parameters
+        ----------
+        arg_types:  list of Type
+        ret_type: Type
+        type_params: list of TypeParam
+        type_constraints: list of TypeConstraint
+
+        Returns
+        -------
+        func_type: FuncType
+            The function type.
+        """
+        self.__init_handle_by_constructor__(
+            _make.FuncType, arg_types, ret_type, type_params, type_constraints)
+
+
+@register_relay_node
+class IncompleteType(Type):
+    """An incomplete type."""
+
+    def __init__(self, kind):
+        self.__init_handle_by_constructor__(_make.IncompleteType, kind)
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
new file mode 100644
index 000000000000..0581847598d4
--- /dev/null
+++ b/python/tvm/relay/ty.pyi
@@ -0,0 +1,139 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The type nodes of the Relay language."""
+from enum import IntEnum
+from .base import NodeBase, register_relay_node
+from . import _make
+
+
+class Type(NodeBase):
+    """The base type for all Relay types."""
+
+    def __eq__(self, other):
+        """Compare two Relay types for structural equivalence using
+           alpha equivalence.
+        """
+        return bool(_make._type_alpha_eq(self, other))
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def same_as(self, other):
+        """Compares two Relay types by referential equality."""
+        return super().__eq__(other)
+
+
+@register_relay_node
+class TensorType(Type):
+    """A concrete TensorType in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to tensor's with a known dype and shape. For
+    example a tensor of `float32` and `(5, 5)`.
+    """
+
+    def __init__(self, shape, dtype):
+        """Construct a tensor type.
+
+        Parameters
+        ----------
+        shape: list of tvm.Expr
+        dtype: str
+
+        Returns
+        -------
+        tensor_type: The TensorType
+        """
+        self.__init_handle_by_constructor__(_make.TensorType, shape, dtype)
+
+
+class Kind(IntEnum):
+    """The kind of a type parameter, represents a variable shape,
+       base type, type, or dimension.
+
+       This controls what a type parameter is allowed to be instantiated
+       with. For example one's of kind BaseType can only be `float32`, `int32`,
+       and so on.
+    """
+    ShapeVar = 0
+    Shape = 1
+    BaseType = 2
+    Type = 3
+
+
+@register_relay_node
+class TypeParam(Type):
+    """A type parameter used for generic types in Relay,
+    see tvm/relay/type.h for more details.
+
+    A type parameter represents a type placeholder which will
+    be filled in later on. This allows the user to write
+    functions which are generic over types.
+    """
+
+    def __init__(self, var, kind):
+        """Construct a TypeParam.
+
+        Parameters
+        ----------
+        var: tvm.expr.Var
+            The tvm.Var which backs the type parameter.
+
+        kind: Kind
+            The kind of the type parameter.
+
+        Returns
+        -------
+        type_param: TypeParam
+            The type parameter.
+        """
+        self.__init_handle_by_constructor__(_make.TypeParam, var, kind)
+
+
+@register_relay_node
+class TypeConstraint(Type):
+    """Abstract class representing a type constraint."""
+    pass
+
+
+@register_relay_node
+class FuncType(Type):
+    """A function type in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to functions in Relay. They consist of
+    a list of type parameters which enable the definition of generic
+    fucntions, a set of type constraints which we omit for the time
+    being, a sequence of argument types, and a return type.
+
+    We informally write them as:
+    `forall (type_params), (arg_types) -> ret_type where type_constraints`
+    """
+
+    def __init__(self,
+                 arg_types,
+                 ret_type,
+                 type_params,
+                 type_constraints,
+                 ):
+        """Construct a function type.
+
+        Parameters
+        ----------
+        arg_types:  list of Type
+        ret_type: Type
+        type_params: list of TypeParam
+        type_constraints: list of TypeConstraint
+
+        Returns
+        -------
+        func_type: FuncType
+            The function type.
+        """
+        self.__init_handle_by_constructor__(
+            _make.FuncType, arg_types, ret_type, type_params, type_constraints)
+
+
+@register_relay_node
+class IncompleteType(Type):
+    """An incomplete type."""
+
+    def __init__(self, kind):
+        self.__init_handle_by_constructor__(_make.IncompleteType, kind)
diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py
index f169ff1b64ac..f0d60f514a37 100644
--- a/python/tvm/tensor.py
+++ b/python/tvm/tensor.py
@@ -6,8 +6,10 @@
 from . import make as _make
 from . import expr as _expr
 
+
 class TensorSlice(NodeGeneric, _expr.ExprOp):
     """Auxiliary data structure for enable slicing syntax from tensor."""
+
     def __init__(self, tensor, indices):
         if not isinstance(indices, tuple):
             indices = (indices,)
@@ -31,9 +33,11 @@ def dtype(self):
 
 itervar_cls = None
 
+
 @register_node
 class Tensor(NodeBase, _expr.ExprOp):
     """Tensor object, to construct, see function.Tensor"""
+
     def __call__(self, *indices):
         ndim = self.ndim
         if len(indices) != ndim:
@@ -104,6 +108,7 @@ def name(self):
 
 class Operation(NodeBase):
     """Represent an operation that generate a tensor"""
+
     def output(self, index):
         """Get the index-th output of the operation
 
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
new file mode 100644
index 000000000000..7e7fb71f6d6c
--- /dev/null
+++ b/src/relay/ir/base.cc
@@ -0,0 +1,77 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file base.cc
+ * \brief The core base types for Relay.
+ */
+#include <tvm/api_registry.h>
+#include <tvm/relay/base.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+SourceName SourceNameNode::make(std::string name) {
+  std::shared_ptr<SourceNameNode> n = std::make_shared<SourceNameNode>();
+  n->name = std::move(name);
+  return SourceName(n);
+}
+
+std::shared_ptr<SourceNameNode> CreateSourceName(const std::string& name) {
+  SourceName sn = SourceName::Get(name);
+  CHECK(!sn.defined()) << "Cannot find source name \'" << name << '\'';
+  std::shared_ptr<Node> node = sn.node_;
+  return std::dynamic_pointer_cast<SourceNameNode>(node);
+}
+
+const SourceName& SourceName::Get(const std::string& name) {
+  static std::unordered_map<std::string, SourceName> source_map;
+
+  auto sn = source_map.find(name);
+  if (sn == source_map.end()) {
+    auto source_name = SourceNameNode::make(name);
+    source_map.insert({name, source_name});
+    return source_map.at(name);
+  } else {
+    return sn->second;
+  }
+}
+
+TVM_REGISTER_API("relay._make.SourceName")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue *ret) {
+      *ret = SourceNameNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<SourceNameNode>([](const SourceNameNode *node, tvm::IRPrinter *p) {
+      p->stream << "SourceNameNode(" << node->name << ", " << node << ")";
+    });
+
+TVM_REGISTER_NODE_TYPE(SourceNameNode)
+.set_creator(CreateSourceName)
+.set_global_key([](const Node* n) {
+    return static_cast<const SourceNameNode*>(n)->name;
+  });
+
+Span SpanNode::make(SourceName source, int lineno, int col_offset) {
+  std::shared_ptr<SpanNode> n = std::make_shared<SpanNode>();
+  n->source = std::move(source);
+  n->lineno = lineno;
+  n->col_offset = col_offset;
+  return Span(n);
+}
+
+TVM_REGISTER_API("relay._make.Span")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = SpanNode::make(args[0], args[1], args[2]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<SpanNode>([](const SpanNode *node, tvm::IRPrinter *p) {
+    p->stream << "SpanNode(" << node->source << ", " << node->lineno << ", "
+              << node->col_offset << ")";
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
new file mode 100644
index 000000000000..47c9789ab5ae
--- /dev/null
+++ b/src/relay/ir/environment.cc
@@ -0,0 +1,147 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file  environment.cc
+ * \brief The global environment in Relay.
+ */
+#include <tvm/relay/environment.h>
+#include <tvm/relay/pass.h>
+#include <sstream>
+#include "./../pass/resolve.h"
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace runtime;
+
+Environment EnvironmentNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
+  std::shared_ptr<EnvironmentNode> n = std::make_shared<EnvironmentNode>();
+  n->functions = std::move(global_funcs);
+  return Environment(n);
+}
+
+GlobalVar EnvironmentNode::GetGlobalVar(const std::string &str) {
+  auto global_id = global_map_.find(str);
+  if (global_id != global_map_.end()) {
+    return (*global_id).second;
+  } else {
+    auto id = GlobalVarNode::make(str);
+    this->global_map_.Set(str, id);
+    return id;
+  }
+}
+
+/*! \brief Add a new item to the global environment
+ * \note if the update flag is not set adding a duplicate
+ * definition will trigger an exception, otherwise we will
+ * update the definition if and only if it is type compatible.
+ */
+void EnvironmentNode::Add(const GlobalVar &var, const Function &func,
+                          bool update) {
+  // Type check the item before we add it to the environment.
+  auto env = GetRef<Environment>(this);
+
+  Expr checked_expr = InferType(env, var, func);
+
+  if (const FunctionNode *func_node = checked_expr.as<FunctionNode>()) {
+    auto checked_func = GetRef<Function>(func_node);
+    auto type = checked_func->checked_type();
+
+    CHECK(IsFullyResolved(type));
+
+    if (functions.find(var) != functions.end()) {
+      if (!update) {
+        throw dmlc::Error("already have definition for XXXX.");
+      }
+
+      auto old_type = functions[var].as<FunctionNode>()->checked_type();
+
+      if (!AlphaEqual(type, old_type)) {
+        throw dmlc::Error(
+            "Environment#update changes type, not possible in this mode.");
+      }
+
+      this->functions.Set(var, checked_func);
+    } else {
+      this->functions.Set(var, checked_func);
+    }
+  } else {
+    throw Error("internal error: unknown item type, unreachable code");
+  }
+}
+
+void EnvironmentNode::Update(const GlobalVar &var, const Function &func) {
+  this->Add(var, func, true);
+}
+
+void EnvironmentNode::Remove(const GlobalVar & var) {
+  auto functions_node = this->functions.CopyOnWrite();
+  functions_node->data.erase(var.node_);
+}
+
+Function EnvironmentNode::Lookup(const GlobalVar &var) {
+  auto func = functions.find(var);
+  if (func != functions.end()) {
+    return (*func).second;
+  } else {
+    throw Error(std::string("there is no definition of ") + var->name_hint);
+  }
+}
+
+Function EnvironmentNode::Lookup(const std::string &str) {
+  GlobalVar id = this->GetGlobalVar(str);
+  return this->Lookup(id);
+}
+
+void EnvironmentNode::Merge(const Environment &env) {
+  for (auto pair : env->functions) {
+    this->functions.Set(pair.first, pair.second);
+  }
+}
+
+TVM_REGISTER_API("relay._make.Environment")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      *ret = EnvironmentNode::make(args[0]);
+    });
+
+TVM_REGISTER_API("relay._env.Environment_Add")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Environment env = args[0];
+      env->Add(args[1], args[2], false);
+    });
+
+TVM_REGISTER_API("relay._env.Environment_GetGlobalVar")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Environment env = args[0];
+      *ret = env->GetGlobalVar(args[1]);
+    });
+
+TVM_REGISTER_API("relay._env.Environment_Lookup")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Environment env = args[0];
+      GlobalVar var = args[1];
+      *ret = env->Lookup(var);
+    });
+
+TVM_REGISTER_API("relay._env.Environment_Lookup_str")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Environment env = args[0];
+      std::string var_name = args[1];
+      auto var = env->GetGlobalVar(var_name);
+      *ret = env->Lookup(var);
+    });
+
+TVM_REGISTER_API("relay._env.Environment_Merge")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Environment env = args[0];
+      env->Merge(args[1]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<EnvironmentNode>([](const EnvironmentNode *node,
+                                      tvm::IRPrinter *p) {
+      p->stream << "EnvironmentNode( " << node->functions << ")";
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
new file mode 100644
index 000000000000..f4363f5312c4
--- /dev/null
+++ b/src/relay/ir/expr.cc
@@ -0,0 +1,201 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/ir/expr.cc
+ * \brief The expression AST nodes of Relay.
+ */
+#include <tvm/ir_functor.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+Constant ConstantNode::make(runtime::NDArray data) {
+  std::shared_ptr<ConstantNode> n = std::make_shared<ConstantNode>();
+  n->data = std::move(data);
+  return Constant(n);
+}
+
+TVM_REGISTER_API("relay._make.Constant")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      *ret = ConstantNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<ConstantNode>([](const ConstantNode *node,
+                                   tvm::IRPrinter *p) {
+      p->stream << "ConstantNode(TODO)";
+    });
+
+TensorType ConstantNode::tensor_type() const {
+  auto dtype = TVMType2Type(data->dtype);
+
+  Array<tvm::Expr> shape;
+  for (int i = 0; i < data->ndim; i++) {
+    shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), data->shape[i]));
+  }
+
+  return TensorTypeNode::make(shape, dtype);
+}
+
+Tuple TupleNode::make(tvm::Array<relay::Expr> fields) {
+  std::shared_ptr<TupleNode> n = std::make_shared<TupleNode>();
+  n->fields = std::move(fields);
+  return Tuple(n);
+}
+
+TVM_REGISTER_API("relay._make.Tuple")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      *ret = TupleNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<TupleNode>([](const TupleNode *node, tvm::IRPrinter *p) {
+      p->stream << "TupleNode(" << node->fields << ")";
+    });
+
+Var VarNode::make(std::string name_hint) {
+  std::shared_ptr<VarNode> n = std::make_shared<VarNode>();
+  n->name_hint = std::move(name_hint);
+  return Var(n);
+}
+
+TVM_REGISTER_API("relay._make.Var")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      *ret = VarNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<VarNode>([](const VarNode *node,
+                                   tvm::IRPrinter *p) {
+      p->stream << "VarNode(" << node->name_hint << ")";
+    });
+
+GlobalVar GlobalVarNode::make(std::string name_hint) {
+  std::shared_ptr<GlobalVarNode> n = std::make_shared<GlobalVarNode>();
+  n->name_hint = std::move(name_hint);
+  return GlobalVar(n);
+}
+
+TVM_REGISTER_API("relay._make.GlobalVar")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      *ret = GlobalVarNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<GlobalVarNode>([](const GlobalVarNode *node,
+                                    tvm::IRPrinter *p) {
+      p->stream << "GlobalVarNode(" << node->name_hint << ")";
+    });
+
+Param ParamNode::make(Var var, Type type) {
+  std::shared_ptr<ParamNode> n = std::make_shared<ParamNode>();
+  n->var = std::move(var);
+  n->type = std::move(type);
+  return Param(n);
+}
+
+TVM_REGISTER_API("relay._make.Param")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = ParamNode::make(args[0], args[1]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<ParamNode>([](const ParamNode *node, tvm::IRPrinter *p) {
+  p->stream << "ParamNode(" << node->var << ", " << node->type << ")";
+});
+
+Function FunctionNode::make(tvm::Array<Param> params, Type ret_type, Expr body,
+                            tvm::Array<TypeParam> type_params) {
+  std::shared_ptr<FunctionNode> n = std::make_shared<FunctionNode>();
+  n->params = std::move(params);
+  n->ret_type = std::move(ret_type);
+  n->body = std::move(body);
+  n->type_params = std::move(type_params);
+  return Function(n);
+}
+
+Type FunctionNode::fn_type() const {
+  Array<Type> param_types;
+  for (auto param : this->params) {
+    param_types.push_back(param->type);
+  }
+
+  return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
+}
+
+TVM_REGISTER_API("relay._make.Function")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = FunctionNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<FunctionNode>([](const FunctionNode *node,
+                                   tvm::IRPrinter *p) {
+      p->stream << "FunctionNode(" << node->params << ", " << node->ret_type
+                << ", " << node->body << ", " << node->type_params << ")";
+});
+
+Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
+                    Array<Type> type_args) {
+  std::shared_ptr<CallNode> n = std::make_shared<CallNode>();
+  n->op = std::move(op);
+  n->args = std::move(args);
+  n->attrs = std::move(attrs);
+  n->type_args = std::move(type_args);
+  return Call(n);
+}
+
+TVM_REGISTER_API("relay._make.Call")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = CallNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<CallNode>([](const CallNode *node, tvm::IRPrinter *p) {
+  p->stream << "CallNode(" << node->op << ", " << node->args << ", "
+    << node->attrs << ", " << node->type_args << ")";
+});
+
+Let LetNode::make(Var var, Expr value, Expr body, Type value_type) {
+  std::shared_ptr<LetNode> n = std::make_shared<LetNode>();
+  n->var = std::move(var);
+  n->value = std::move(value);
+  n->body = std::move(body);
+  n->value_type = std::move(value_type);
+  return Let(n);
+}
+
+TVM_REGISTER_API("relay._make.Let")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = LetNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<LetNode>([](const LetNode *node, tvm::IRPrinter *p) {
+  p->stream << "LetNode(" << node->var << ", " << node->value
+    << ", " << node->body << ", " << node->value_type << ")";
+});
+
+If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
+  std::shared_ptr<IfNode> n = std::make_shared<IfNode>();
+  n->cond = std::move(cond);
+  n->true_branch = std::move(true_branch);
+  n->false_branch = std::move(false_branch);
+  return If(n);
+}
+
+TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = IfNode::make(args[0], args[1], args[2]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<IfNode>([](const IfNode *node, tvm::IRPrinter *p) {
+  p->stream << "IfNode(" << node->cond << ", " << node->true_branch
+            << node->false_branch << ")";
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
new file mode 100644
index 000000000000..85ae5ffa694e
--- /dev/null
+++ b/src/relay/ir/expr_functor.cc
@@ -0,0 +1,205 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/expr_mutator.cc
+ * \brief A wrapper around ExprFunctor which functionally updates the AST.
+ *
+ * ExprMutator uses memoization and self return in order to amortize
+ * the cost of using functional updates.
+ */
+
+#include <tvm/relay/expr_functor.h>
+
+namespace tvm {
+namespace relay {
+
+Expr ExprMutator::Mutate(const Expr& expr) {
+  auto cached_expr = this->memo_.find(expr);
+  if (cached_expr != this->memo_.end()) {
+    return (*cached_expr).second;
+  } else {
+    auto new_expr = this->ExprMutator::VisitExpr(expr, expr);
+    this->memo_.Set(expr, new_expr);
+    return new_expr;
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const VarNode* op, const Expr& expr) {
+  return expr;
+}
+
+Expr ExprMutator::VisitExpr_(const ConstantNode* op, const Expr& expr) {
+  return expr;
+}
+
+Expr ExprMutator::VisitExpr_(const GlobalVarNode* op, const Expr& expr) {
+  return expr;
+}
+
+Expr ExprMutator::VisitExpr_(const OpNode* op, const Expr& expr) {
+  return expr;
+}
+
+Expr ExprMutator::VisitExpr_(const TupleNode* op, const Expr& e) {
+  tvm::Array<Expr> fields;
+  bool all_fields_unchanged = true;
+  for (auto field : op->fields) {
+    auto new_field = this->Mutate(field);
+    fields.push_back(new_field);
+    all_fields_unchanged &= new_field.same_as(field);
+  }
+
+  if (all_fields_unchanged) {
+    return e;
+  } else {
+    return TupleNode::make(fields);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const ParamNode* op, const Expr& e) {
+  Var var = Downcast<Var>(this->Mutate(op->var));
+  auto type = this->VisitType(op->type);
+  if (var == op->var && type == op->type) {
+    return e;
+  } else {
+    return ParamNode::make(var, type);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const FunctionNode* op, const Expr& e) {
+  tvm::Array<TypeParam> ty_params;
+  bool all_ty_params_changed = true;
+
+  for (auto ty_param : op->type_params) {
+    TypeParam new_ty_param = Downcast<TypeParam>(VisitType(ty_param));
+    ty_params.push_back(new_ty_param);
+    all_ty_params_changed &= new_ty_param.same_as(ty_param);
+  }
+
+  tvm::Array<Param> params;
+  bool all_params_changed = true;
+  for (auto param : op->params) {
+    Param new_param = Downcast<Param>(this->Mutate(param));
+    params.push_back(new_param);
+    all_params_changed &= param.same_as(new_param);
+  }
+
+  auto ret_type = this->VisitType(op->ret_type);
+  auto body = this->Mutate(op->body);
+
+  if (ty_params.same_as(op->type_params) && params.same_as(op->params) &&
+      ret_type.same_as(op->ret_type) && body.same_as(op->body)) {
+    return e;
+  } else {
+    return FunctionNode::make(params, ret_type, body, ty_params);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const CallNode* call_node, const Expr& e) {
+  auto op = this->Mutate(call_node->op);
+
+  tvm::Array<Type> ty_args;
+  bool all_ty_args_unchanged = true;
+  for (auto ty_arg : call_node->type_args) {
+    auto new_ty_arg = this->VisitType(ty_arg);
+    ty_args.push_back(new_ty_arg);
+    all_ty_args_unchanged &= new_ty_arg.same_as(ty_arg);
+  }
+
+  tvm::Array<Expr> call_args;
+  bool all_args_unchanged = true;
+  for (auto arg : call_node->args) {
+    auto new_arg = this->Mutate(arg);
+    call_args.push_back(new_arg);
+    all_args_unchanged &= new_arg.same_as(arg);
+  }
+
+  if (all_ty_args_unchanged && all_args_unchanged &&
+      call_node->op.same_as(op)) {
+    return e;
+  } else {
+    return CallNode::make(op, call_args, call_node->attrs, ty_args);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const LetNode* op, const Expr& e) {
+  Var var = Downcast<Var>(this->Mutate(op->var));
+  auto type = this->VisitType(op->value_type);
+  auto value = this->Mutate(op->value);
+  auto body = this->Mutate(op->body);
+
+  if (var.same_as(op->var) && type.same_as(op->value_type) &&
+      value.same_as(op->value) && body.same_as(op->body)) {
+    return e;
+  } else {
+    return LetNode::make(var, value, body, type);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const IfNode* op, const Expr& e) {
+  auto guard = this->Mutate(op->cond);
+  auto true_b = this->Mutate(op->true_branch);
+  auto false_b = this->Mutate(op->false_branch);
+  if (op->cond == guard && true_b == op->true_branch &&
+      false_b == op->false_branch) {
+    return e;
+  } else {
+    return IfNode::make(guard, true_b, false_b);
+  }
+}
+
+Type ExprMutator::VisitType(const Type& t) { return t; }
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) { return; }
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const GlobalVarNode* op) { return; }
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const ConstantNode* op) { return; }
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const TupleNode* op) {
+  for (auto field : op->fields) {
+    this->VisitExpr(field);
+  }
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const ParamNode* op) {
+  this->VisitExpr(op->var);
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const FunctionNode* op) {
+  for (auto param : op->params) {
+    this->VisitExpr(param);
+  }
+
+  this->VisitExpr(op->body);
+}
+
+void ExprVisitor::VisitExpr_(const CallNode* op) {
+  this->VisitExpr(op->op);
+  for (auto ty_arg : op->type_args) {
+    this->VisitType(ty_arg);
+  }
+
+  for (auto arg : op->args) {
+    this->VisitExpr(arg);
+  }
+}
+
+void ExprVisitor::VisitExpr_(const LetNode* op) {
+  this->VisitExpr(op->var);
+  this->VisitExpr(op->value);
+  this->VisitExpr(op->body);
+}
+
+void ExprVisitor::VisitExpr_(const IfNode* op) {
+  this->VisitExpr(op->cond);
+  this->VisitExpr(op->true_branch);
+  this->VisitExpr(op->false_branch);
+}
+
+void ExprVisitor::VisitExpr_(const OpNode* op) { return; }
+
+void ExprVisitor::VisitType(const Type& t) { return; }
+
+}  // namespace relay
+}  // namespace tvm
+
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
new file mode 100644
index 000000000000..d1a9dd072d31
--- /dev/null
+++ b/src/relay/ir/op.cc
@@ -0,0 +1,155 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/op.cc
+ * \brief Resolve incomplete types to complete types.
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/type.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <memory>
+#include <mutex>
+
+#include "./../pass/type_subst.h"
+
+namespace dmlc {
+// enable registry
+DMLC_REGISTRY_ENABLE(::tvm::relay::OpRegistry);
+}  // namespace dmlc
+
+namespace tvm {
+namespace relay {
+
+::dmlc::Registry<OpRegistry>* OpRegistry::Registry() {
+  return ::dmlc::Registry<OpRegistry>::Get();
+}
+
+// single manager of operator information.
+struct OpManager {
+  // mutex to avoid registration from multiple threads.
+  std::mutex mutex;
+  // global operator counter
+  std::atomic<int> op_counter{0};
+  // storage of additional attribute table.
+  std::unordered_map<std::string, std::unique_ptr<GenericOpMap>> attr;
+  // frontend functions
+  std::vector<PackedFunc*> frontend_funcs;
+  // get singleton of the op manager
+  static OpManager* Global() {
+    static OpManager inst;
+    return &inst;
+  }
+};
+
+// find operator by name
+const Op& Op::Get(const std::string& name) {
+  const OpRegistry* reg = dmlc::Registry<OpRegistry>::Find(name);
+  CHECK(reg != nullptr) << "Operator " << name << " is not registered";
+  return reg->op();
+}
+
+OpRegistry::OpRegistry() {
+  OpManager* mgr = OpManager::Global();
+  std::shared_ptr<OpNode> n = std::make_shared<OpNode>();
+  n->index_ = mgr->op_counter++;
+  op_ = Op(n);
+}
+
+// Get attribute map by key
+const GenericOpMap& Op::GetGenericAttr(const std::string& key) {
+  OpManager* mgr = OpManager::Global();
+  std::lock_guard<std::mutex> lock(mgr->mutex);
+  auto it = mgr->attr.find(key);
+  if (it == mgr->attr.end()) {
+    LOG(FATAL) << "Operator attribute \'" << key << "\' is not registered";
+  }
+  return *it->second.get();
+}
+
+void OpRegistry::UpdateAttr(const std::string& key, TVMRetValue value,
+                            int plevel) {
+  OpManager* mgr = OpManager::Global();
+  std::lock_guard<std::mutex> lock(mgr->mutex);
+  std::unique_ptr<GenericOpMap>& op_map = mgr->attr[key];
+  if (op_map == nullptr) {
+    op_map.reset(new GenericOpMap());
+  }
+  uint32_t index = op_->index_;
+  if (op_map->data_.size() <= index) {
+    op_map->data_.resize(index + 1, std::make_pair(TVMRetValue(), 0));
+  }
+  std::pair<TVMRetValue, int>& p = op_map->data_[index];
+  CHECK(p.second != plevel)
+      << "Attribute " << key << " of operator " << this->name
+      << " is already registered with same plevel=" << plevel;
+  if (p.second < plevel) {
+    op_map->data_[index] = std::make_pair(value, plevel);
+  }
+}
+
+// Frontend APIs
+TVM_REGISTER_API("relay.op._ListOpNames")
+    .set_body_typed<Array<tvm::Expr>()>([]() {
+      Array<tvm::Expr> ret;
+      for (const std::string& name :
+           dmlc::Registry<OpRegistry>::ListAllNames()) {
+        ret.push_back(tvm::Expr(name));
+      }
+      return ret;
+    });
+
+TVM_REGISTER_API("relay.op._GetOp").set_body_typed<Op(std::string)>(Op::Get);
+
+TVM_REGISTER_API("relay.op._OpGetAttr")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      Op op = args[0];
+      std::string attr_name = args[1];
+      auto op_map = Op::GetAttr<TVMRetValue>(attr_name);
+      if (op_map.count(op)) {
+        *rv = op_map[op];
+      }
+    });
+
+TVM_REGISTER_API("relay.op._Register")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      std::string op_name = args[0];
+      std::string attr_key = args[1];
+      runtime::TVMArgValue value = args[2];
+      int plevel = args[3];
+      auto& reg =
+          OpRegistry::Registry()->__REGISTER_OR_GET__(op_name).set_name();
+      // enable resgiteration and override of certain properties
+      if (attr_key == "num_inputs" && plevel > 128) {
+        reg.set_num_inputs(value);
+      } else if (attr_key == "attrs_type_key" && plevel > 128) {
+        reg.set_attrs_type_key(value);
+      } else {
+        // normal attr table override.
+        if (args[2].type_code() == kFuncHandle) {
+          // do an eager copy of the PackedFunc
+          PackedFunc f = args[2];
+          // If we get a function from frontend, avoid deleting it.
+          OpManager::Global()->frontend_funcs.push_back(new PackedFunc(f));
+          reg.set_attr(attr_key, f, plevel);
+        } else {
+          reg.set_attr(attr_key, args[2], plevel);
+        }
+      }
+    });
+
+std::shared_ptr<OpNode> CreateOp(const std::string& name) {
+  auto op = Op::Get(name);
+  CHECK(!op.defined()) << "Cannot find op \'" << name << '\'';
+  std::shared_ptr<Node> node = op.node_;
+  return std::dynamic_pointer_cast<OpNode>(node);
+}
+
+TVM_REGISTER_NODE_TYPE(OpNode)
+.set_creator(CreateOp)
+.set_global_key([](const Node* n) {
+    return static_cast<const OpNode*>(n)->name;
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
new file mode 100644
index 000000000000..c13fea26dacd
--- /dev/null
+++ b/src/relay/ir/type.cc
@@ -0,0 +1,121 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/ir/type.cc
+ * \brief The type system AST nodes of Relay.
+ */
+#include <tvm/ir_functor.h>
+#include <tvm/relay/type.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+TensorType TensorTypeNode::make(Array<ShapeExpr> shape, DataType dtype) {
+  std::shared_ptr<TensorTypeNode> n = std::make_shared<TensorTypeNode>();
+  n->shape = std::move(shape);
+  n->dtype = std::move(dtype);
+  return TensorType(n);
+}
+
+TensorType TensorTypeNode::Scalar(DataType dtype) {
+  return TensorTypeNode::make({}, dtype);
+}
+
+TVM_REGISTER_API("relay._make.TensorType")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  Array<ShapeExpr> shape = args[0];
+  *ret = TensorTypeNode::make(shape, args[1]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TensorTypeNode>([](const TensorTypeNode *node,
+                                     tvm::IRPrinter *p) {
+  p->stream << "TensorTypeNode(" << node->dtype << ", " << node->shape << ")";
+});
+
+TypeParam TypeParamNode::make(std::string name, TypeParamNode::Kind kind) {
+  std::shared_ptr<TypeParamNode> n = std::make_shared<TypeParamNode>();
+  n->var = tvm::Var(name);
+  n->kind = std::move(kind);
+  return TypeParam(n);
+}
+
+TVM_REGISTER_API("relay._make.TypeParam")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  int kind = args[1];
+  *ret =
+    TypeParamNode::make(args[0], static_cast<TypeParamNode::Kind>(kind));
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TypeParamNode>([](const TypeParamNode *node,
+                                    tvm::IRPrinter *p) {
+  p->stream << "TypeParamNode(" << node->var->name_hint << ", "
+    << node->kind << ")";
+});
+
+FuncType FuncTypeNode::make(tvm::Array<Type> arg_types, Type ret_type,
+                            tvm::Array<TypeParam> type_params,
+                            tvm::Array<TypeConstraint> type_constraints) {
+  std::shared_ptr<FuncTypeNode> n = std::make_shared<FuncTypeNode>();
+  n->arg_types = std::move(arg_types);
+  n->ret_type = std::move(ret_type);
+  n->type_params = std::move(type_params);
+  n->type_constraints = std::move(type_constraints);
+  return FuncType(n);
+}
+
+TVM_REGISTER_API("relay._make.FuncType")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = FuncTypeNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<FuncTypeNode>([](const FuncTypeNode *node,
+                                   tvm::IRPrinter *p) {
+  p->stream << "FuncTypeNode(" << node->type_params << ", "
+            << node->arg_types << ", " << node->ret_type << ", "
+            << node->type_constraints << ")";
+});
+
+TypeRelation TypeRelationNode::make(std::string name, TypeRelationFn func, Array<Type> args) {
+  std::shared_ptr<TypeRelationNode> n = std::make_shared<TypeRelationNode>();
+  n->name = std::move(name);
+  n->func_ = std::move(func);
+  n->args = std::move(args);
+  return TypeRelation(n);
+}
+
+TVM_REGISTER_API("relay._make.TypeRelation")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  *ret = TypeRelationNode::make(args[0], args[1], args[2]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TypeRelationNode>([](const TypeRelationNode *node,
+                                       tvm::IRPrinter *p) {
+  p->stream << "TypeRelationNode(" << node->name << ", " << node->args
+    << ")";
+});
+
+TupleType TupleTypeNode::make(Array<Type> fields) {
+  std::shared_ptr<TupleTypeNode> n = std::make_shared<TupleTypeNode>();
+  n->fields = std::move(fields);
+  return TupleType(n);
+}
+
+TVM_REGISTER_API("relay._make.TupleType")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = TupleTypeNode::make(args[0]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleTypeNode>([](const TupleTypeNode *node,
+                                    tvm::IRPrinter *p) {
+  p->stream << "TupleTypeNode(" << node->fields << ")";
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/elemwise.cc b/src/relay/op/tensor/elemwise.cc
new file mode 100644
index 000000000000..8c1823114f44
--- /dev/null
+++ b/src/relay/op/tensor/elemwise.cc
@@ -0,0 +1,137 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file elemwise.cc
+ * \brief Elementwise operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+// Quick helper macro
+// - Expose a positional make function to construct the node.
+// - Register op to the registry.
+//
+// We make the decision to always only expose positional argument.
+// We will do rewrapping in the frontend to support language
+// sugars such as keyword arguments and default value.
+//
+#define RELAY_REGISTER_UNARY_OP(OpName)               \
+  TVM_REGISTER_API("relay.op._make." OpName)          \
+  .set_body_typed<Expr(Expr)>([](Expr data) {         \
+      static const Op& op = Op::Get(OpName);          \
+    return CallNode::make(op, {data}, Attrs(), {});   \
+    });                                               \
+  RELAY_REGISTER_OP(OpName)                           \
+  .set_num_inputs(1)                                  \
+  .add_argument("data", "Tensor", "The input tensor.")
+
+
+RELAY_REGISTER_UNARY_OP("log")
+.describe(R"code(Returns the log input array, computed element-wise.
+
+.. math::
+   log(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+// data : Tensor[shape, dtype]
+// result: Tensor[shape, dtype]
+
+
+RELAY_REGISTER_UNARY_OP("exp")
+.describe(R"code(Returns the exp input array, computed element-wise.
+
+.. math::
+   \exp(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+
+RELAY_REGISTER_UNARY_OP("sqrt")
+.describe(R"code(Returns the sqrt input array, computed element-wise.
+
+.. math::
+   sqrt(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+// Addition
+TVM_REGISTER_API("relay.op._make.add")
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
+      static const Op& op = Op::Get("add");
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("add")
+  .set_num_inputs(2)
+  .add_argument("lhs", "Tensor", "The left hand side tensor.")
+  .add_argument("rhs", "Tensor", "The right hand side tensor.")
+  .set_support_level(1)
+  .add_type_rel("Broadcast", BroadcastRel);
+
+  // def broadcast(s1, s2):
+  // ...
+  //
+  // input1: Tensor[dtype, s1]
+  // input2: Tensor[dtype, s2]
+  // output: Tensor[dtype, broadcast(s1, s2)]
+
+// Addition
+TVM_REGISTER_API("relay.op._make.subtract")
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
+      static const Op& op = Op::Get("subtract");
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("subtract")
+  .set_num_inputs(2)
+  .add_argument("lhs", "Tensor", "The left hand side tensor.")
+  .add_argument("rhs", "Tensor", "The right hand side tensor.")
+  .set_support_level(1)
+  .add_type_rel("Broadcast", BroadcastRel);
+
+  // def broadcast(s1, s2):
+  // ...
+  //
+  // input1: Tensor[dtype, s1]
+  // input2: Tensor[dtype, s2]
+  // output: Tensor[dtype, broadcast(s1, s2)]
+
+// Equality Comparison
+TVM_REGISTER_API("relay.op._make.equal")
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
+      static const Op& op = Op::Get("equal");
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("equal")
+  .set_num_inputs(2)
+  .add_argument("lhs", "Tensor", "The left hand side tensor.")
+  .add_argument("rhs", "Tensor", "The right hand side tensor.")
+  .set_support_level(1)
+  .add_type_rel("BroadcastComp", BroadcastCompRel);
+
+// Concat
+TVM_REGISTER_API("relay.op._make.concat")
+  .set_body_typed<Expr(Expr)>([](Expr tuple) {
+      static const Op& op = Op::Get("concat");
+    return CallNode::make(op, { tuple }, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("concat")
+  .set_num_inputs(1)
+  .add_argument("tuple", "Tuple", "The tupled tensor arguments.")
+  .set_support_level(1)
+  .add_type_rel("Concat", ConcatRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
new file mode 100644
index 000000000000..94550dbd5075
--- /dev/null
+++ b/src/relay/op/type_relations.cc
@@ -0,0 +1,206 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_relations.cc
+ * \brief A set of utilities and common functionality
+ * for type relations.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/op.h>
+#include <numeric>
+#include "../pass/incomplete_type.h"
+#include "./type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+TensorType ToTensorType(const Type& t) {
+  if (auto tt_node = t.as<TensorTypeNode>()) {
+    return GetRef<TensorType>(tt_node);
+  } else {
+    return TensorType(nullptr);
+  }
+}
+
+// TODO(@jroesch) what size value do we extract, 64bit or 32bit?
+int ToInt(const tvm::Expr& e) {
+  CHECK(e.defined());
+  auto imm = e.as<tvm::ir::IntImm>();
+  CHECK(imm) << "TYPE: " << imm << imm->type << std::endl;
+  return imm->value;
+}
+
+Array<Type> IdentityRel(const Array<Type>& types, int num_args) {
+  CHECK_EQ(types.size(), 2);
+  auto t1 = ToTensorType(types[0]);
+  if (t1 && types[1].as<IncompleteTypeNode>()) {
+    return {t1, t1};
+  } else {
+    return types;
+  }
+}
+
+static Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2,
+                              DataType output_dtype) {
+  RELAY_LOG(INFO) << "ConcreteBroadcast: t1=" << t1 << " t2=" << t2
+                  << std::endl;
+  auto sh1 = t1->shape;
+  auto sh2 = t2->shape;
+  RELAY_LOG(INFO) << "ConcreteBroadcast: sh1=" << sh1 << " sh2=" << sh2
+                  << std::endl;
+  if (sh1.size() == 0 && sh2.size() == 0) {
+    return TensorTypeNode::make({}, output_dtype);
+    // We have non-zero shapes so broadcast rules apply.
+  } else {
+    auto suffix_len = static_cast<int>(std::min(sh1.size(), sh2.size()));
+    auto full_len = static_cast<int>(std::max(sh1.size(), sh2.size()));
+
+    auto rev_sh1 = sh1.rbegin();
+    auto rev_sh2 = sh2.rbegin();
+
+    while (rev_sh1 != sh1.rend() && rev_sh2 != sh2.rend()) {
+      auto dim1 = ToInt(*rev_sh1);
+      auto dim2 = ToInt(*rev_sh2);
+      if ((dim1 != dim2) && ((dim1 != 1) && (dim2 != 1))) {
+        CHECK(false) << "Dimension mistmatch "
+                     << "dim1: " << dim1 << " dim2: " << dim2 << std::endl;
+      }
+      rev_sh1++;
+      rev_sh2++;
+    }
+
+    Array<ShapeExpr> larger;
+    Array<ShapeExpr> smaller;
+
+    for (int i = 0; i < (full_len - suffix_len); i++) {
+      smaller.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), 1));
+    }
+
+    if (sh1.size() < sh2.size()) {
+      for (auto sh : sh1) {
+        smaller.push_back(sh);
+      }
+      larger = sh2;
+    } else if (sh1.size() > sh2.size()) {
+      for (auto sh : sh1) {
+        larger.push_back(sh);
+      }
+      smaller = sh2;
+    } else {
+      larger = sh1;
+      smaller = sh2;
+    }
+
+    CHECK_EQ(larger.size(), smaller.size());
+
+    Array<HalideIR::Expr> out_shape;
+    for (size_t i = 0; i < smaller.size(); i++) {
+      auto left = smaller[i].as<tvm::ir::IntImm>();
+      auto right = larger[i].as<tvm::ir::IntImm>();
+      CHECK(left);
+      CHECK(right);
+      int64_t dim = std::max(left->value, right->value);
+      out_shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), dim));
+    }
+
+    return TensorTypeNode::make(out_shape, output_dtype);
+  }
+}
+
+Array<Type> BroadcastRel(const Array<Type>& types, int num_args) {
+  CHECK_EQ(types.size(), 3);
+  RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1]
+                  << "Out: " << types[2] << std::endl;
+  if (auto t1 = ToTensorType(types[0])) {
+    if (auto t2 = ToTensorType(types[1])) {
+      CHECK_EQ(t1->dtype, t2->dtype);
+      return {t1, t2, ConcreteBroadcast(t1, t2, t1->dtype)};
+    }
+  }
+
+  return types;
+}
+
+/* A relation which specifies broadcasting rules for operations which
+   compute boolean results.
+*/
+Array<Type> BroadcastCompRel(const Array<Type>& types, int num_args) {
+  CHECK_EQ(types.size(), 3);
+  if (auto t1 = ToTensorType(types[0])) {
+    if (auto t2 = ToTensorType(types[1])) {
+      return {t1, t2, ConcreteBroadcast(t1, t2, HalideIR::Bool())};
+    }
+  }
+
+  return types;
+}
+
+/*! \brief Handle concrete concat case from known input to output. */
+inline Type ConcreteConcatRel(const Type& input_type) {
+  if (auto tuple_node = input_type.as<TupleTypeNode>()) {
+    // NB: For now the axis argument is hardwired to be 0.
+    std::vector<int> dims;
+    DataType dtype;
+
+    CHECK_LT(1, tuple_node->fields.size());
+    bool skip_first = true;
+
+    // Collect the suffix dimensions since axis is zero.
+    // TODO(@jroesch): This is a demonstration of how
+    // to do varargs. It requires a little more work to
+    // fully type the behavior of concat.
+
+    auto first = Downcast<TensorType>(tuple_node->fields[0]);
+    dtype = first->dtype;
+
+    for (auto dim_expr : first->shape) {
+      if (!skip_first) {
+        dims.push_back(ToInt(dim_expr));
+      } else {
+        skip_first = false;
+      }
+    }
+
+    std::vector<int> axis_dims;
+    for (auto field_ty : tuple_node->fields) {
+      auto ttype = Downcast<TensorType>(field_ty);
+      for (size_t i = 0; i < ttype->shape.size(); i++) {
+        if (i != 0) {
+          CHECK_EQ(ToInt(dims[i - 1]), ToInt(ttype->shape[i]));
+        } else {
+          axis_dims.push_back(ToInt(ttype->shape[i]));
+        }
+      }
+    }
+
+    auto out_axis_dim = std::accumulate(axis_dims.begin(), axis_dims.end(), 0);
+
+    Array<tvm::Expr> out_shape = { tvm::ir::IntImm::make(HalideIR::Int(64), out_axis_dim) };
+
+    for (auto dim : dims) {
+      out_shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), dim));
+    }
+
+    return TensorTypeNode::make(out_shape, dtype);
+
+  } else {
+    throw TypeRelationError("concat can only be used with a tuple as its argument");
+  }
+}
+
+Array<Type> ConcatRel(const Array<Type>& types, int num_args) {
+  CHECK_EQ(types.size(), 2);
+
+  if (types[0].as<IncompleteTypeNode>() && types[1].as<IncompleteTypeNode>()) {
+    return types;
+  } else if (types[1].as<IncompleteTypeNode>()) {
+    return { types[0], ConcreteConcatRel(types[0]) };
+  } else {
+    throw TypeRelationError(
+      "can not deduce relationship between the " \
+      "type of concat's input and output");
+  }
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h
new file mode 100644
index 000000000000..9dfc29022ee3
--- /dev/null
+++ b/src/relay/op/type_relations.h
@@ -0,0 +1,67 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/op/type_relations.h
+ * \brief A set of utilities and common functionality
+ * for type relations.
+ */
+#ifndef TVM_RELAY_OP_TYPE_RELATIONS_H_
+#define TVM_RELAY_OP_TYPE_RELATIONS_H_
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/type.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief The error raised by a type relation.
+ *
+ * This error is how a type relation signals that it has failed.
+ *
+ */
+struct TypeRelationError : Error {
+  explicit TypeRelationError(const std::string& msg)
+      : Error(msg) {}
+};
+
+/*! \brief The identity type relation maps a single input variable
+ * to the output variable.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_args The number of input arguments.
+ * \return The (potentially partial) solution to the relation.
+ */
+Array<Type> IdentityRel(const Array<Type>& types, int num_args);
+/*! \brief The broadcast type relation, implements the broadcasting
+ * rule over the two input types producing the broadcasted type.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_args The number of input arguments.
+ * \return The (potentially partial) solution to the relation.
+ */
+Array<Type> BroadcastRel(const Array<Type>& types, int num_args);
+/*! \brief The broadcast type relation, implements the broadcasting
+ * rule over the two input types producing the broadcasted type.
+ *
+ * This differs from BroadcastRel in the return dtype,
+ * it instead returns bool, for use in comparsion operators
+ * such as equal, not_equal, lt, and so on.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_args The number of input arguments.
+ * \return The (potentially partial) solution to the relation.
+ */
+Array<Type> BroadcastCompRel(const Array<Type>& types, int num_args);
+
+/*! \brief The concat relation.
+ *
+ * This relation takes a single input which must be a single tensor
+ * or an arbitrary sized tuple. It combines these input dimensions
+ * together to produce the output example.
+ */
+Array<Type> ConcatRel(const Array<Type>& types, int num_args);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_TYPE_RELATIONS_H_
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
new file mode 100644
index 000000000000..f76da793c503
--- /dev/null
+++ b/src/relay/pass/alpha_eq.cc
@@ -0,0 +1,258 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/pass/alpha_eq.cc
+ * \brief Compute the set of variables not bound in the expression.
+ */
+#include <tvm/relay/expr_functor.h>
+#include "./type_visitor.h"
+#include "tvm/relay/pass.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace tvm::runtime;
+
+struct TypeAlphaEq : TypeVisitor<const Type&> {
+  tvm::Map<TypeParam, TypeParam> eq_map;
+  bool equal;
+
+  TypeAlphaEq() : eq_map(), equal(true) {}
+
+  void DataTypeEqual(const DataType& dt1, const DataType& dt2) {
+    equal = equal && dt1 == dt2;
+  }
+  void ShapeEqual(Array<ShapeExpr> s1, Array<ShapeExpr> s2) {}
+
+  void VisitType_(const TensorTypeNode *tt1, const Type& t2) final {
+    if (const TensorTypeNode *tt2 = t2.as<TensorTypeNode>()) {
+      DataTypeEqual(tt1->dtype, tt2->dtype);
+      ShapeEqual(tt1->shape, tt2->shape);
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitType_(const IncompleteTypeNode *bt1, const Type& t2) final {
+    if (const IncompleteTypeNode *bt2 = t2.as<IncompleteTypeNode>()) {
+      equal = equal && bt1 == bt2;
+      return;
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitType_(const TypeParamNode *ti1, const Type& t2) final {
+    if (const TypeParamNode *ti2 = t2.as<TypeParamNode>()) {
+      auto tid1 = GetRef<TypeParam>(ti1);
+      auto tid2 = GetRef<TypeParam>(ti2);
+
+      // We handle open terms with this rule assuming variables are identical.
+      //
+      // Not sure if we should do this.
+      if (tid1 == tid2) {
+        return;
+      }
+
+      // Check that they are same kind
+      if (tid1->kind != tid2->kind) {
+        equal = false;
+        return;
+      }
+
+      // Next we see if there is mapping for local1 into the rhs term.
+      // If there is we check to see if those are equal.
+      if (eq_map.find(tid1) != eq_map.end()) {
+        equal = equal && eq_map[tid1] == tid2;
+      } else {
+        equal = false;
+      }
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitType_(const FuncTypeNode *op, const Type& t2) final {
+    if (const FuncTypeNode *ta2 = t2.as<FuncTypeNode>()) {
+      if (op->arg_types.size() != ta2->arg_types.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0; i < op->arg_types.size(); i++) {
+        this->VisitType(op->arg_types[i], ta2->arg_types[i]);
+        if (!equal) {
+          return;
+        }
+      }
+
+      this->VisitType(op->ret_type, ta2->ret_type);
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitType_(const TypeRelationNode *tr1, const Type& t2) final {
+    if (const TypeRelationNode *tr2 = t2.as<TypeRelationNode>()) {
+      equal = tr1 == tr2;
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitType_(const TupleTypeNode *op, const Type& t2) final {
+    if (const TupleTypeNode *pt = t2.as<TupleTypeNode>()) {
+      if (op->fields.size() != pt->fields.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0U; i < op->fields.size(); i++) {
+        if (!equal) {
+          return;
+        }
+        this->VisitType(op->fields[i], pt->fields[i]);
+      }
+    } else {
+      equal = false;
+    }
+  }
+};
+
+bool AlphaEqual(const Type& t1, const Type& t2) {
+  TypeAlphaEq aeq;
+  aeq.VisitType(t1, t2);
+  return aeq.equal;
+}
+
+struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
+ public:
+  tvm::Map<Var, Var> eq_map;
+
+  bool equal;
+  AlphaEq() : eq_map(), equal(true) {}
+
+  void VisitExpr_(const VarNode *e1, const Expr& e2) final {
+    if (const VarNode *id2 = e2.as<VarNode>()) {
+      auto local1 = GetRef<Var>(e1);
+      auto local2 = GetRef<Var>(id2);
+      // We handle open terms with this rule assuming variables are identical.
+      if (local1 == local2) {
+        equal = true;
+        return;
+      }
+
+      // Next we see if there is mapping for local1 into the rhs term.
+      // If there is we check to see if those are equal.
+      if (eq_map.find(local1) != eq_map.end()) {
+        equal = equal && eq_map[local1] == local2;
+      } else {
+        equal = false;
+      }
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const GlobalVarNode *g1, const Expr& e2) final {
+    if (const GlobalVarNode *g2 = e2.as<GlobalVarNode>()) {
+      equal = equal && g1 == g2;
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const TupleNode *pl1, const Expr& e2) final {
+    Tuple prod1 = GetRef<Tuple>(pl1);
+    if (const TupleNode *pl2 = e2.as<TupleNode>()) {
+      Tuple prod2 = GetRef<Tuple>(pl2);
+      if (prod1->fields.size() != prod2->fields.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0U; i < prod1->fields.size(); i++) {
+        this->VisitExpr(prod1->fields[i], prod2->fields[i]);
+      }
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const ParamNode *p1, const Expr& e2) final {
+    if (const ParamNode *p2 = e2.as<ParamNode>()) {
+      eq_map.Set(p1->var, p2->var);
+      equal = equal && AlphaEqual(p1->type, p2->type);
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const FunctionNode *func1, const Expr& e2) final {
+    if (const FunctionNode *func2 = e2.as<FunctionNode>()) {
+      if (func1->params.size() != func2->params.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0U; i < func1->params.size(); i++) {
+        this->VisitExpr(func1->params[i], func2->params[i]);
+      }
+
+      this->VisitExpr(func1->body, func2->body);
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const CallNode *op, const Expr& e2) final {
+    if (const CallNode *call = e2.as<CallNode>()) {
+      this->VisitExpr(op->op, call->op);
+
+      if (op->args.size() != call->args.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0U; i < op->args.size(); i++) {
+        this->VisitExpr(op->args[i], call->args[i]);
+      }
+
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const LetNode *op, const Expr& e2) final {
+    if (const LetNode *let = e2.as<LetNode>()) {
+      eq_map.Set(op->var, let->var);
+      this->VisitExpr(op->value, let->value);
+      this->VisitExpr(op->body, let->body);
+    } else {
+      equal = false;
+    }
+  }
+};
+
+bool AlphaEqual(const Expr& e1, const Expr& e2) {
+  AlphaEq eq;
+  eq.VisitExpr(e1, e2);
+  return eq.equal;
+}
+
+// TODO(@jroesch): move to correct namespace?
+TVM_REGISTER_API("relay._make._alpha_eq")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Expr e1 = args[0];
+      Expr e2 = args[1];
+      *ret = AlphaEqual(e1, e2);
+    });
+
+TVM_REGISTER_API("relay._make._type_alpha_eq")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Type t1 = args[0];
+      Type t2 = args[1];
+      *ret = AlphaEqual(t1, t2);
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/incomplete_type.h b/src/relay/pass/incomplete_type.h
new file mode 100644
index 000000000000..78771dc6e9b7
--- /dev/null
+++ b/src/relay/pass/incomplete_type.h
@@ -0,0 +1,38 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file incomplete_type.h
+ * \brief A way to defined arbitrary function signature with dispatch on types.
+ */
+
+#ifndef TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
+#define TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
+
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Represents a portion of an incomplete type.
+ */
+class IncompleteType;
+
+/*! \brief IncompleteType container node */
+class IncompleteTypeNode : public TypeNode {
+ public:
+  TypeParamNode::Kind kind;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("kind", &kind); }
+
+  TVM_DLL static IncompleteType make(TypeParamNode::Kind kind);
+
+  static constexpr const char* _type_key = "relay.IncompleteType";
+  TVM_DECLARE_NODE_TYPE_INFO(IncompleteTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(IncompleteType, IncompleteTypeNode, Type);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
new file mode 100644
index 000000000000..522eb93483fb
--- /dev/null
+++ b/src/relay/pass/kind_check.cc
@@ -0,0 +1,42 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file kindchecker.cc
+ *
+ * \brief Check that types are well formed by applying "kinding rules".
+ *
+ * This pass ensures we do not do things that violate the design of the
+ * type system when writing down types.
+ *
+ * For example tensors are not allowed to contain functions in Relay.
+ *
+ * We check this by ensuring the `dtype` field of a Tensor always 
+ * contains a data type such as `int`, `float`, `uint`.
+ */
+#include <tvm/ir_functor.h>
+#include <tvm/relay/pass.h>
+#include "./type_visitor.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace tvm::runtime;
+
+struct KindChecker : TypeVisitor<> {
+  bool valid;
+
+  KindChecker() : valid(true) {}
+
+  bool Check(const Type &t) {
+    this->VisitType(t);
+    return valid;
+  }
+};
+
+bool KindCheck(const Environment& env, const Type &t) {
+  KindChecker kc;
+  return kc.Check(t);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/resolve.cc b/src/relay/pass/resolve.cc
new file mode 100644
index 000000000000..b073613bafc2
--- /dev/null
+++ b/src/relay/pass/resolve.cc
@@ -0,0 +1,100 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file resolve.cc
+ * \brief Resolve incomplete types to complete types.
+ */
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include "./resolve.h"
+#include "./type_visitor.h"
+
+namespace tvm {
+namespace relay {
+
+struct ResolveTypeType : TypeMutator {
+  const TypeUnifier &unifier;
+
+  explicit ResolveTypeType(const TypeUnifier &unifier) : unifier(unifier) {}
+
+  Type VisitType(const Type &t) override {
+    if (!t.defined()) {
+      auto inc_ty = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+      unifier->Insert(inc_ty);
+      return inc_ty;
+    } else {
+      return TypeMutator::VisitType(t);
+    }
+  }
+
+  Type VisitType_(const IncompleteTypeNode *op) override {
+    return unifier->Subst(GetRef<IncompleteType>(op));
+  }
+};
+
+struct ResolveTypeExpr : ExprMutator {
+  const TypeUnifier &unifier;
+
+  explicit ResolveTypeExpr(const TypeUnifier &unifier) : unifier(unifier) {}
+
+  Expr Mutate(const Expr &e) {
+    // NB: a bit tricky here.
+    //
+    // We want to store resolved type without having
+    // to re-typecheck the entire term.
+    //
+    // Since we know that e : T[...] under some holes
+    // then it is the case that if we resolve types
+    // present in e, then we can type it under T
+    // with the wholes filled in.
+    //
+    // We will visit e like normal building a new
+    // term, then resolve e's old type and write
+    // it back into the new node.
+    auto new_e = ExprMutator::Mutate(e);
+    CHECK(e->checked_type_.defined());
+    auto resolved_cty = VisitType(e->checked_type_);
+    new_e->checked_type_ = resolved_cty;
+    return new_e;
+  }
+
+  Type VisitType(const Type &t) {
+    return ResolveTypeType(unifier).VisitType(t);
+  }
+};
+
+Type Resolve(const TypeUnifier &unifier, const Type &ty) {
+  CHECK(ty.defined());
+  return ResolveTypeType(unifier).VisitType(ty);
+}
+
+Expr Resolve(const TypeUnifier &unifier, const Expr &expr) {
+  return ResolveTypeExpr(unifier).Mutate(expr);
+}
+
+struct FullyResolved : TypeVisitor<> {
+  bool incomplete;
+
+  FullyResolved() : incomplete(true) {}
+
+  void VisitType(const Type &t) override {
+    if (!t.defined()) {
+      incomplete = true;
+    } else {
+      return TypeVisitor<>::VisitType(t);
+    }
+  }
+
+  void VisitType_(const IncompleteTypeNode *ty_var) override {
+    incomplete = false;
+  }
+};
+
+bool IsFullyResolved(const Type &t) {
+  auto fr = FullyResolved();
+  fr.VisitType(t);
+  return fr.incomplete;
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/resolve.h b/src/relay/pass/resolve.h
new file mode 100644
index 000000000000..0cd7dce2d88d
--- /dev/null
+++ b/src/relay/pass/resolve.h
@@ -0,0 +1,47 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/resolve.h
+ * \brief Resolve incomplete types to complete types.
+ */
+#ifndef TVM_RELAY_PASS_RESOLVE_H_
+#define TVM_RELAY_PASS_RESOLVE_H_
+
+#include <tvm/relay/expr.h>
+#include <string>
+#include "./unifier.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Resolve a type containing incomplete types.
+ *
+ * This pass replaces incomplete types with their representative, and
+ * converts types which are not defined into fresh variables.
+ *
+ * \param unifier The unifier containing the unification data.
+ * \param ty The type to resolve.
+ * \returns The resolved type.
+ */
+Type Resolve(const TypeUnifier& unifier, const Type& ty);
+
+/*! \brief Resolve an expression containing incomplete types.
+ *
+ * This pass replaces incomplete types with their representative, and
+ * converts types which are not defined into fresh variables.
+ *
+ * \param unifier The unifier containing the unification data.
+ * \param ty The expression to resolve.
+ * \returns The resolved expression.
+ */
+Expr Resolve(const TypeUnifier& unifier, const Expr& expr);
+
+/*! \brief Check if all types have been filled in.
+ *   \param t The type.
+ *   \returns True if the type is resolved, false otherwise.
+ */
+bool IsFullyResolved(const Type& t);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_PASS_RESOLVE_H_
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
new file mode 100644
index 000000000000..339552108af4
--- /dev/null
+++ b/src/relay/pass/type_functor.h
@@ -0,0 +1,93 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_functor.h
+ * \brief A way to defined arbitrary function signature with dispatch on types.
+ */
+#ifndef TVM_RELAY_PASS_TYPE_FUNCTOR_H_
+#define TVM_RELAY_PASS_TYPE_FUNCTOR_H_
+
+#include <tvm/ir_functor.h>
+#include <tvm/relay/expr.h>
+#include "./incomplete_type.h"
+
+namespace tvm {
+namespace relay {
+
+template <typename FType>
+class TypeFunctor;
+
+// functions to be overriden.
+#define TYPE_FUNCTOR_DEFAULT \
+  { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
+
+#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                       \
+  vtable.template set_dispatch<OP>(                           \
+      [](const NodeRef& n, TSelf* self, Args... args) {       \
+        return self->VisitType_(static_cast<const OP*>(n.node_.get()),    \
+                                std::forward<Args>(args)...); \
+      });
+
+template <typename R, typename... Args>
+class TypeFunctor<R(const Type& n, Args...)> {
+ private:
+  using TSelf = TypeFunctor<R(const Type& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*! \brief virtual destructor */
+  virtual ~TypeFunctor() {}
+  /*!
+   * \brief Same as call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  R operator()(const Type& n, Args... args) {
+    return VisitType(n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitType(const Type& n, Args... args) {
+    static FType vtable = InitVTable();
+    return vtable(n, this, std::forward<Args>(args)...);
+  }
+  // Functions that can be overriden by subclass
+  virtual R VisitType_(const TensorTypeNode* op,
+                       Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeParamNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeConstraintNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeRelationNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TupleTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const IncompleteTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+
+  virtual R VisitTypeDefault_(const Node* op, Args...) {
+    LOG(FATAL) << "Do not have a default for " << op->type_key();
+    return R();
+  }
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    FType vtable;
+    // Set dispatch
+    RELAY_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeParamNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeConstraintNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeRelationNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TupleTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(IncompleteTypeNode);
+    return vtable;
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_TYPE_FUNCTOR_H_
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
new file mode 100644
index 000000000000..f4f6d82eb5e1
--- /dev/null
+++ b/src/relay/pass/type_infer.cc
@@ -0,0 +1,629 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_infer.cc
+ * \brief Relay type inference and checking.
+ *
+ * This file implements one of the most important passes to the
+ * Relay IR. In order to do many transformations and generate the
+ * most efficient code we need to obtain type information for the
+ * IR.
+ *
+ * Like computation graphs the IR leaves most type information
+ * implicit and relies performing analysis of the program to
+ * generate this information.
+ *
+ * This pass given an expression `e` will infer a type `t` for
+ * the expression simultaneous checking the property `e : t`
+ * (i.e we can show e has type t).
+ *
+ * If we can not infer a type or there are conflicting typing
+ * constraints we will trigger an error.
+ */
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/pass.h>
+#include "./incomplete_type.h"
+#include "./resolve.h"
+#include "./type_subst.h"
+#include "./type_visitor.h"
+#include "./unifier.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace tvm::runtime;
+
+// // We declare this for forward compatibility.
+struct ConstraintData {};
+
+/*! \brief A more efficient representation of the type relation
+ * data needed for type checking.
+ */
+struct TypeRelationData : ConstraintData {
+  std::string name;
+  std::vector<Type> args;
+  TypeRelationFn func;
+  Span span;
+
+  explicit TypeRelationData(const TypeRelation& ty_rel)
+      : TypeRelationData(ty_rel->args, ty_rel->func_, ty_rel->span) {}
+
+  TypeRelationData(const Array<Type>& args, const TypeRelationFn& func, const Span& sp)
+      : func(func), span(sp) {
+    for (auto arg : args) {
+      this->args.push_back(arg);
+    }
+  }
+
+  TypeRelation ToTypeRel() const {
+    Array<Type> args = Array<Type>(this->args.begin(), this->args.end());
+    return TypeRelationNode::make(
+        this->name, this->func, args);
+  }
+};
+
+struct TypeContext {
+  std::unordered_map<Var, Type, NodeHash> var_map;
+  std::vector<std::vector<TypeRelationData> > constraints;
+
+  TypeContext() { constraints.push_back({}); }
+
+  void Insert(const Var& id, const Type& t) { var_map[id] = t; }
+
+  void AddConstraint(const TypeConstraint& constraint) {
+      constraints.back().push_back(TypeRelationData(Downcast<TypeRelation>(constraint)));
+  }
+
+  Type Lookup(const Var& var) {
+    auto type = var_map.find(var);
+    if (type != var_map.end()) {
+      return (*type).second;
+    } else {
+      throw FatalTypeError(std::string("undeclared local variable: ") + var->name_hint);
+    }
+  }
+
+  struct Scope {
+    TypeContext& tc;
+    explicit Scope(TypeContext& tc) : tc(tc) { tc.constraints.push_back({}); }
+    ~Scope() { tc.constraints.pop_back(); }
+  };
+};
+
+struct CheckedExpr {
+  Expr expr;
+  Type type;
+  CheckedExpr(Expr e, Type t) : expr(e), type(t) {}
+  CheckedExpr() {}
+};
+
+enum SolverResult : int;
+
+class TypeInferencer : private ExprFunctor<CheckedExpr(const Expr&)> {
+ private:
+  TypeContext context;
+
+ public:
+  Environment env;
+  TypeUnifier unifier;
+
+  template <typename T>
+  T WithScope(const std::function<T()>& f) {
+    TypeContext::Scope fr(context);
+    return f();
+  }
+
+  TypeInferencer();
+  TypeInferencer(Environment env, TypeUnifier unifier)
+      : env(env), unifier(unifier) {}
+  explicit TypeInferencer(Environment env);
+
+  CheckedExpr Infer(const Expr &expr);
+
+  FuncType Instantiate(FuncType fn_ty, tvm::Array<Type> &ty_args);
+
+  Type Normalize(const Type& t);
+
+  void ReportError(const std::string& msg, Span sp);
+  [[noreturn]] void FatalError(const std::string& msg, Span sp);
+
+  Type Unify(const Type &t1, const Type& t2, Span sp);
+  Type Resolve(const Type &t);
+  Expr Resolve(const Expr &e);
+
+  /*! \brief Attempt to solve a single relation. */
+  void Solve(TypeRelationData& ty_rel);
+
+  /*! \brief Attempt to solve all pending relations.
+   * 
+   * If the solver
+   */
+  SolverResult Solve(std::vector<TypeRelationData>& rels);
+
+  /*! \brief Check that all relations hold. */
+  bool RelationsHold(bool scope_only = false);
+
+  /*! \brief Visit a function node, extra flag controls behavior. */
+  CheckedExpr VisitFunction(const Function& f, bool generalize);
+
+ private:
+  CheckedExpr VisitExpr_(const VarNode* op) override;
+  CheckedExpr VisitExpr_(const GlobalVarNode* op) override;
+  CheckedExpr VisitExpr_(const ConstantNode* op) override;
+  CheckedExpr VisitExpr_(const TupleNode* op) override;
+  CheckedExpr VisitExpr_(const ParamNode* op) override;
+  CheckedExpr VisitExpr_(const FunctionNode* op) override;
+  CheckedExpr VisitExpr_(const CallNode* op) override;
+  CheckedExpr VisitExpr_(const LetNode* op) override;
+  CheckedExpr VisitExpr_(const IfNode* op) override;
+  CheckedExpr VisitExpr_(const OpNode* op) override;
+};
+
+TypeInferencer::TypeInferencer() {
+  this->env = EnvironmentNode::make({});
+  this->unifier = TypeUnifierNode::make(UnionFindNode::make({}));
+}
+
+TypeInferencer::TypeInferencer(Environment env) : env(env) {
+  this->unifier = TypeUnifierNode::make(UnionFindNode::make({}));
+}
+
+CheckedExpr TypeInferencer::Infer(const Expr& expr) {
+  RELAY_LOG(INFO) << "TypeInferencer::Check expr=" << expr << std::endl;
+  CheckedExpr checked_expr = this->VisitExpr(expr);
+  RELAY_LOG(INFO) << "TypeInferencer::Check type=" << checked_expr.type
+                  << std::endl;
+  Type final_type = checked_expr.type;
+  RELAY_LOG(INFO) << "TypeInferencer::Check type_after_subst=" << final_type
+                  << std::endl;
+  checked_expr.expr->checked_type_ = final_type;
+  return checked_expr;
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const VarNode* op) {
+  auto var = GetRef<Var>(op);
+  return {var, this->context.Lookup(var)};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const GlobalVarNode* op) {
+  GlobalVar var = GetRef<GlobalVar>(op);
+  Expr e = this->env->Lookup(var);
+  return {var, e->checked_type()};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const ConstantNode* const_node) {
+  return {GetRef<Constant>(const_node), const_node->tensor_type()};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const TupleNode* op) {
+  Tuple pl = GetRef<Tuple>(op);
+
+  std::vector<Expr> field_exprs;
+  std::vector<Type> field_types;
+  for (auto field = pl->fields.begin(); field != pl->fields.end(); field++) {
+    auto checked_field = Infer(*field);
+    field_exprs.push_back(checked_field.expr);
+    field_types.push_back(checked_field.type);
+  }
+
+  return {TupleNode::make(field_exprs), TupleTypeNode::make(field_types)};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const ParamNode* param) {
+  // We should trigger error here and move param code direclty into function
+  // checking.
+  auto rtype = this->Resolve(param->type);
+  // This is a special case ... not sure if there is a better way
+  // to handle this.
+  param->var->checked_type_ = rtype;
+  return {ParamNode::make(param->var, rtype), rtype};
+}
+
+CheckedExpr TypeInferencer::VisitFunction(const Function& f, bool generalize) {
+  // First we add the parameters to the context allowing us to check their
+  // types.
+
+  // TODO(@jroesch): support polymorphism
+
+  std::vector<Type> param_types;
+  std::vector<Param> params;
+
+  return this->WithScope<CheckedExpr>([&]() -> CheckedExpr {
+    for (auto param : f->params) {
+      CheckedExpr checked_param = this->Infer(param);
+      Type arg_type;
+      param_types.push_back(checked_param.type);
+      params.push_back(GetRef<Param>(checked_param.expr.as<ParamNode>()));
+      this->context.Insert(param->var, checked_param.type);
+    }
+
+    auto checked_body = this->Infer(f->body);
+    auto inferred_rtype = checked_body.type;
+    auto annotated_rtype = Resolve(f->ret_type);
+
+    auto unified_rtype = this->Unify(inferred_rtype, annotated_rtype, f->span);
+
+    CHECK(RelationsHold(true));
+
+    Array<TypeConstraint> cs;
+
+    for (auto cons : this->context.constraints.back()) {
+      cs.push_back(cons.ToTypeRel());
+    }
+
+    return {FunctionNode::make(params, unified_rtype, checked_body.expr, {}),
+            FuncTypeNode::make(param_types, unified_rtype, {}, cs)};
+  });
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const FunctionNode* op) {
+  return this->VisitFunction(GetRef<Function>(op), false);
+}
+
+FuncType TypeInferencer::Instantiate(FuncType fn_ty,
+                                     tvm::Array<Type>& ty_args) {
+  tvm::Map<TypeParam, Type> subst_map;
+
+  // Build a subsitituion map up from the function type and type arguments.
+  // Eventually allow the type vars to be passed in.
+  for (auto ty_param : fn_ty->type_params) {
+    IncompleteType fresh = IncompleteTypeNode::make(ty_param->kind);
+    this->unifier->Insert(fresh);
+    ty_args.push_back(fresh);
+    subst_map.Set(ty_param, fresh);
+  }
+
+  Type inst_ty = FuncTypeNode::make(fn_ty->arg_types, fn_ty->ret_type, {},
+                                    fn_ty->type_constraints);
+  inst_ty = TypeSubst(inst_ty, subst_map);
+
+  CHECK(KindCheck(this->env, inst_ty));
+
+  return GetRef<FuncType>(inst_ty.as<FuncTypeNode>());
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const CallNode* op) {
+  Call c = GetRef<Call>(op);
+
+  auto checked_op = this->Infer(c->op);
+
+  RELAY_LOG(INFO) << "TypeInferencer::VisitExpr_ op=" << c << std::endl
+                  << "fn_ty=" << checked_op.type << std::endl;
+
+  auto fn_ty_node = checked_op.type.as<FuncTypeNode>();
+
+  if (!fn_ty_node) {
+    this->FatalError("only expressions with function types can be called",
+                     c->op->span);
+  }
+
+  // We now have a function type.
+  FuncType fn_ty = GetRef<FuncType>(fn_ty_node);
+
+  tvm::Array<Type> ty_args;
+  if (ty_args.size() != 0) {
+    throw Error("found manually suplied type args, not supported");
+  }
+
+  fn_ty = Instantiate(fn_ty, ty_args);
+
+  std::vector<Type> arg_types;
+  std::vector<Expr> checked_args;
+
+  for (auto arg : c->args) {
+    auto checked_arg = this->Infer(arg);
+    arg_types.push_back(checked_arg.type);
+    checked_args.push_back(checked_arg.expr);
+  }
+
+  auto type_arity = fn_ty->arg_types.size();
+  auto number_of_args = arg_types.size();
+
+  if (type_arity != number_of_args) {
+    if (type_arity < number_of_args) {
+      this->FatalError("the function is provided too many arguments", c->span);
+    } else {
+      this->FatalError("the function is provided too few arguments", c->span);
+    }
+  }
+
+  for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
+    this->Unify(fn_ty->arg_types[i], arg_types[i], c->args[i]->span);
+  }
+
+  // After we unify the arguments we should know more about the type
+  // arguments, let's run a quick pass over them to find new
+  // representatives.
+
+  for (size_t i = 0; i < ty_args.size(); i++) {
+    ty_args.Set(i, this->unifier->Subst(ty_args[i]));
+  }
+
+  // Add type constraints from the function types.
+  for (auto cs : fn_ty->type_constraints) {
+    context.AddConstraint(cs);
+  }
+
+  auto new_call =
+      CallNode::make(checked_op.expr, checked_args, c->attrs, ty_args);
+
+  return {new_call, fn_ty->ret_type};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const LetNode* op) {
+  Let let = GetRef<Let>(op);
+
+  CheckedExpr checked_value;
+  Type annotated_ty = Resolve(let->value_type);
+
+  // If we are let-defining a function, we want to be able to
+  // recursively name the function in order to support recursive
+  // local definitions.
+  if (let->value.as<FunctionNode>()) {
+    context.Insert(let->var, annotated_ty);
+    checked_value = Infer(let->value);
+  } else {
+    checked_value = Infer(let->value);
+  }
+
+  Type unified_ty = this->Unify(checked_value.type, annotated_ty, let->span);
+
+  // Update type context with unified type now that we have
+  // solved this equation.
+  context.Insert(let->var, unified_ty);
+
+  auto checked_body = Infer(let->body);
+
+  auto checked_let = LetNode::make(let->var, checked_value.expr,
+                                   checked_body.expr, let->value_type);
+
+  return {checked_let, checked_body.type};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const IfNode* op) {
+  If ifn = GetRef<If>(op);
+
+  // Ensure the type of the guard is of Tensor[Bool, ()],
+  // that is a rank-0 boolean tensor.
+  auto checked_cond = this->Infer(ifn->cond);
+  auto cond_type = checked_cond.type;
+
+  this->Unify(cond_type, TensorTypeNode::make({}, HalideIR::Bool()),
+              ifn->cond->span);
+  auto checked_true = this->Infer(ifn->true_branch);
+  auto checked_false = this->Infer(ifn->false_branch);
+  auto unified_type =
+      this->Unify(checked_true.type, checked_false.type, ifn->span);
+  auto checked_if =
+      IfNode::make(checked_cond.expr, checked_true.expr, checked_false.expr);
+  return {checked_if, unified_type};
+}
+
+CheckedExpr TypeInferencer::VisitExpr_(const OpNode* op_node) {
+  auto op = GetRef<Op>(op_node);
+  return {op, op->op_type};
+}
+
+Type TypeInferencer::Resolve(const Type &t) {
+  if (t.defined()) {
+    return ::tvm::relay::Resolve(this->unifier, t);
+  } else {
+    return IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+  }
+}
+
+Expr TypeInferencer::Resolve(const Expr &e) {
+  CHECK(e.defined());
+  return ::tvm::relay::Resolve(this->unifier, e);
+}
+
+void TypeInferencer::Solve(TypeRelationData & ty_rel) {
+  Array<Type> normalized_args;
+
+  for (auto arg : ty_rel.args) {
+    normalized_args.push_back(Resolve(arg));
+  }
+
+  auto new_args = ty_rel.func(normalized_args, ty_rel.args.size());
+
+  CHECK(new_args.size() == normalized_args.size());
+  tvm::Array<Type> final_args;
+
+  for (size_t i = 0; i < new_args.size(); i++) {
+    ty_rel.args[i] = Unify(normalized_args[i], new_args[i], ty_rel.span);
+  }
+}
+
+int NumSolvedVars(const Array<Type>& vars) {
+  int num = 0;
+  for (auto var : vars) {
+    if (!var.as<IncompleteTypeNode>()) {
+      num += 1;
+    }
+  }
+  return num;
+}
+
+enum SolverResult : int {
+  Failed = -1,
+  Progress = 0,
+  Done = 1,
+};
+
+SolverResult TypeInferencer::Solve(std::vector<TypeRelationData>& rels) {
+  // We start in the done state with zero progress.
+  SolverResult status = SolverResult::Done;
+  int progress = 0;
+
+  do {
+    // Upon rentering the loop we reset the state.
+    status = SolverResult::Done;
+    progress = 0;
+
+    std::vector<int> complete;
+
+    int i = 0;
+    // We will now process each relation in order.
+    for (TypeRelationData& ty_rel : rels) {
+      int arity = ty_rel.args.size();
+      int pre_solved = NumSolvedVars(ty_rel.args);
+      RELAY_LOG(INFO) << "TypeInferencer::Solve: "
+                      << "TypeRelation= "
+                      << ", Arity=" << arity << ", Solved=" << pre_solved
+                      << std::endl;
+      // If the relation is already solved then we will make no progress but try
+      // to set the status to done.
+      if (pre_solved == arity) {
+        status = static_cast<SolverResult>((status && SolverResult::Done));
+        complete.push_back(i);
+      // If there are unsolved variables we will try to solve some.
+      } else if (pre_solved < arity) {
+        Solve(ty_rel);
+        int post_solved = NumSolvedVars(ty_rel.args);
+
+        // If we solved any variables we will try to downgrade status to
+        // progress update the type relation, and then bump the progress counter
+        // by one.
+        if (post_solved > pre_solved) {
+          status =
+              static_cast<SolverResult>((status && SolverResult::Progress));
+          progress += 1;
+        }
+      }
+      i++;
+    }
+
+    // If we made no progress and we aren't finished, then the state should be
+    // downgraded to fail, then we should exit the loop.
+    if (progress == 0 && status != SolverResult::Done) {
+      status = SolverResult::Failed;
+      break;
+    }
+
+    // Remove the satisfied relations.
+    for (auto i : complete) {
+      if (rels.size() > 1) {
+        rels[i] = rels.back();
+        rels.pop_back();
+      } else {
+        rels.pop_back();
+      }
+    }
+
+    std::reverse(rels.begin(), rels.end());
+  } while (status == SolverResult::Progress);
+  return status;
+}
+
+bool TypeInferencer::RelationsHold(bool scope_only) {
+  // If we are only checking the top scope,
+  // slice out the constraints.
+  //
+  // Otherwise we use all of them.
+  std::vector<std::vector<TypeRelationData> > constraints;
+
+  if (scope_only) {
+    constraints = {context.constraints[0]};
+  } else {
+    constraints = context.constraints;
+  }
+
+  RELAY_LOG(INFO) << "TypeInferencer::RelationsHold: scope_only= " << scope_only
+                  << std::endl;
+  bool all_hold = true;
+  for (auto ty_rels : context.constraints) {
+    auto status = Solve(ty_rels);
+    RELAY_LOG(INFO) << "status= " << status << std::endl;
+    if (status == SolverResult::Failed || status == SolverResult::Progress) {
+      all_hold = false;
+    } else if (status == SolverResult::Done) {
+      continue;
+    } else {
+      throw InternalError("found invalid value for SolverResult");
+    }
+  }
+
+  return all_hold;
+}
+
+Expr InferType(const Environment& env, const Expr& e) {
+  TypeInferencer ti(env);
+  auto checked_expr = ti.Infer(e);
+  CHECK(ti.RelationsHold());
+  return ti.Resolve(checked_expr.expr);
+}
+
+Expr InferType(const Environment& env, const GlobalVar& var,
+               const Function& func) {
+  TypeInferencer ti(env);
+  auto func_copy = FunctionNode::make(func->params, func->ret_type, func->body,
+                                      func->type_params);
+  func_copy->checked_type_ = ti.Resolve(func_copy->fn_type());
+  env->functions.Set(var, func_copy);
+  auto checked_expr = ti.Infer(func);
+  CHECK(ti.RelationsHold());
+  auto map_node = env->functions.CopyOnWrite();
+  map_node->data.erase(var.node_);
+  return ti.Resolve(checked_expr.expr);
+}
+
+void TypeInferencer::FatalError(const std::string& msg, Span sp) {
+  throw FatalTypeError(
+      "internal error: this exception should"
+      "be handled and errors reported with Environment::display_errors\n" +
+      msg);
+}
+
+Type TypeInferencer::Unify(const Type& t1, const Type& t2, Span sp) {
+  try {
+    return this->unifier->Unify(t1, t2);
+  } catch (const dmlc::Error &e) {
+    std::stringstream ss;
+    ss << "Error unifying `";
+    ss << t1;
+    ss << "` and `";
+    ss << t2;
+    ss << "`: " << e.what();
+    this->FatalError(ss.str(), sp);
+  }
+}
+
+TVM_REGISTER_API("relay._ir_pass.check_expr")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      Environment env = args[0];
+      Expr e = args[1];
+      *ret = InferType(env, e);
+    });
+
+// TODO(@jroesch): put in a better namespace.
+TVM_REGISTER_API("relay._ir_pass._get_checked_type")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      Expr e = args[0];
+      *ret = e->checked_type();
+    });
+
+/* Incomplete Type */
+
+IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
+  std::shared_ptr<IncompleteTypeNode> n =
+      std::make_shared<IncompleteTypeNode>();
+  n->kind = std::move(kind);
+  return IncompleteType(n);
+}
+
+TVM_REGISTER_API("relay._make.IncompleteType")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      int kind = args[0];
+      *ret = IncompleteTypeNode::make(static_cast<TypeParamNode::Kind>(kind));
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<IncompleteTypeNode>([](const IncompleteTypeNode* node,
+                                         tvm::IRPrinter* p) {
+      p->stream << "IncompleteTypeNode(" << node->kind << ", " << node << ")";
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_subst.cc b/src/relay/pass/type_subst.cc
new file mode 100644
index 000000000000..0b17fa0bc4f8
--- /dev/null
+++ b/src/relay/pass/type_subst.cc
@@ -0,0 +1,39 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_subst.cc
+ * \brief Function for substituting a concrete type in place of a type ID
+ */
+#include "./type_subst.h"
+#include "./type_visitor.h"
+
+namespace tvm {
+namespace relay {
+
+struct TypeSubstV : TypeMutator {
+  tvm::Map<TypeParam, Type> subst_map;
+
+  explicit TypeSubstV(tvm::Map<TypeParam, Type> subst_map)
+    : subst_map(subst_map) {}
+
+  Type VisitType_(const TypeParamNode* op) override {
+    auto id = GetRef<TypeParam>(op);
+    if (subst_map.find(id) != subst_map.end()) {
+      return this->subst_map[id];
+    } else {
+      return id;
+    }
+  }
+};
+
+Type TypeSubst(const Type& type, const TypeParam& target, const Type& subst) {
+  TypeSubstV ty_sub({ {target, subst} });
+  return ty_sub.VisitType(type);
+}
+
+Type TypeSubst(const Type& type, tvm::Map<TypeParam, Type> subst_map) {
+  TypeSubstV ty_sub(subst_map);
+  return ty_sub.VisitType(type);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_subst.h b/src/relay/pass/type_subst.h
new file mode 100644
index 000000000000..aee3209afb7a
--- /dev/null
+++ b/src/relay/pass/type_subst.h
@@ -0,0 +1,19 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/pass/type_subst.h
+ * \brief Utility functions for substituting types.
+ */
+#ifndef TVM_RELAY_PASS_TYPE_SUBST_H_
+#define TVM_RELAY_PASS_TYPE_SUBST_H_
+
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+Type TypeSubst(const Type& type, const TypeParam& target, const Type& subst);
+Type TypeSubst(const Type& type, tvm::Map<TypeParam, Type> subst_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_TYPE_SUBST_H_
diff --git a/src/relay/pass/type_visitor.h b/src/relay/pass/type_visitor.h
new file mode 100644
index 000000000000..725e3d9b3846
--- /dev/null
+++ b/src/relay/pass/type_visitor.h
@@ -0,0 +1,120 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_visitor.h
+ * \brief A wrapper around TypeFunctor for common use cases.
+ */
+#ifndef TVM_RELAY_PASS_TYPE_VISITOR_H_
+#define TVM_RELAY_PASS_TYPE_VISITOR_H_
+
+#include <vector>
+#include "./type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief A type visitor for vistiors which make use of internal
+ * mutable state.
+ *
+ * We recursively visit each type contained inside the visitor.
+ */
+template <typename... Args>
+struct TypeVisitor : ::tvm::relay::TypeFunctor<void(const Type& n, Args...)> {
+  void VisitType_(const TypeParamNode* op, Args... args) override {}
+
+  void VisitType_(const FuncTypeNode* op, Args... args) override {
+    for (auto type_param : op->type_params) {
+      this->VisitType(type_param, std::forward<Args>(args)...);
+    }
+
+    for (auto type_cs : op->type_constraints) {
+      this->VisitType(type_cs, std::forward<Args>(args)...);
+    }
+
+    for (auto arg_type : op->arg_types) {
+      this->VisitType(arg_type, std::forward<Args>(args)...);
+    }
+    this->VisitType(op->ret_type, std::forward<Args>(args)...);
+  }
+
+  void VisitType_(const TensorTypeNode* op, Args... args) override {}
+
+  void VisitType_(const TupleTypeNode* op, Args... args) override {
+    for (const Type& t : op->fields) {
+      this->VisitType(t, std::forward<Args>(args)...);
+    }
+  }
+
+  void VisitType_(const TypeRelationNode* op, Args... args) override {
+    for (const Type& t : op->args) {
+      this->VisitType(t, std::forward<Args>(args)...);
+    }
+  }
+
+  void VisitType_(const IncompleteTypeNode* op, Args... args) override {}
+};
+
+// A functional visitor for rebuilding an AST in place.
+struct TypeMutator : TypeFunctor<Type(const Type& n)> {
+  Type VisitType_(const TensorTypeNode* op) override {
+    // TODO(@jroesch): maybe we should recursively visit
+    return TensorTypeNode::make(op->shape, op->dtype);
+  }
+
+  Type VisitType_(const TypeParamNode* op) override {
+    return GetRef<TypeParam>(op);
+  }
+
+  Type VisitType_(const FuncTypeNode* op) override {
+    Array<TypeParam> type_params;
+    for (auto type_param : op->type_params) {
+      auto new_type_param = VisitType(type_param);
+      if (const TypeParamNode* tin = new_type_param.as<TypeParamNode>()) {
+        type_params.push_back(GetRef<TypeParam>(tin));
+      } else {
+        CHECK(false) << new_type_param << std::endl;
+      }
+    }
+
+    Array<TypeConstraint> type_constraints;
+    for (auto type_cs : op->type_constraints) {
+      auto new_type_cs = VisitType(type_cs);
+      if (const TypeConstraintNode* tin = As<TypeConstraintNode>(new_type_cs)) {
+        type_constraints.push_back(GetRef<TypeConstraint>(tin));
+      } else {
+        CHECK(false) << new_type_cs << std::endl;
+      }
+    }
+
+    std::vector<Type> args;
+    for (auto arg_type : op->arg_types) {
+      args.push_back(VisitType(arg_type));
+    }
+
+    return FuncTypeNode::make(tvm::Array<Type>(args), VisitType(op->ret_type),
+                              type_params, type_constraints);
+  }
+
+    Type VisitType_(const TupleTypeNode* op) override {
+      std::vector<Type> new_fields;
+      for (const Type& t : op->fields) {
+        new_fields.push_back(this->VisitType(t));
+      }
+      return TupleTypeNode::make(new_fields);
+    }
+
+  Type VisitType_(const TypeRelationNode* type_rel) override {
+    std::vector<Type> new_args;
+    for (const Type& t : type_rel->args) {
+      new_args.push_back(this->VisitType(t));
+    }
+    return TypeRelationNode::make(type_rel->name, type_rel->func_, new_args);
+  }
+
+  Type VisitType_(const IncompleteTypeNode* op) override {
+    return GetRef<IncompleteType>(op);
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_TYPE_VISITOR_H_
diff --git a/src/relay/pass/unifier.cc b/src/relay/pass/unifier.cc
new file mode 100644
index 000000000000..b0ed71d17911
--- /dev/null
+++ b/src/relay/pass/unifier.cc
@@ -0,0 +1,324 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/src/relay/pass/unifier.cc
+ * \brief The type unifier which solves a system of equations between
+ * incomplete types.
+ */
+
+#include "./unifier.h"
+#include <tvm/relay/error.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/type.h>
+#include "./type_subst.h"
+#include "./type_visitor.h"
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+UnionFind UnionFindNode::make(tvm::Map<IncompleteType, Type> uf_map) {
+  std::shared_ptr<UnionFindNode> n = std::make_shared<UnionFindNode>();
+  n->uf_map = uf_map;
+  return UnionFind(n);
+}
+
+void UnionFindNode::Insert(const IncompleteType& v) { this->uf_map.Set(v, v); }
+
+void UnionFindNode::debug() {
+  for (const auto& entry : this->uf_map) {
+    RELAY_LOG(INFO) << entry.first << " = " << entry.second << std::endl;
+  }
+}
+
+void UnionFindNode::AssertAlphaEqual(const Type& l, const Type& r) {
+  if (!AlphaEqual(l, r)) {
+    std::stringstream ss;
+    ss << "Incompatible parent types in UF:" << l << " and " << r;
+    throw UnionFindError(ss.str());
+  }
+}
+
+void UnionFindNode::Unify(const IncompleteType& v1, const Type& t) {
+  RELAY_LOG(INFO) << "UnionFindNode::Unify v1=" << v1 << ", t=" << t
+                  << std::endl;
+  auto parent1 = this->Find(v1);
+
+  // if t is a type var, then unify parents
+  const IncompleteTypeNode *tvn2 = t.as<IncompleteTypeNode>();
+  if (tvn2) {
+    auto v2 = GetRef<IncompleteType>(tvn2);
+    auto parent2 = this->Find(v2);
+
+    // if parents are exactly equal, then we're done
+    if (parent1 == parent2) {
+      return;
+    }
+
+    // if first parent is a type var, then can just set its union find map to
+    // second parent
+    if (const IncompleteTypeNode *pvn1 = parent1.as<IncompleteTypeNode>()) {
+      auto pv1 = GetRef<IncompleteType>(pvn1);
+      this->uf_map.Set(pv1, parent2);
+      return;
+    }
+
+    // if second parent is a type var but first isn't, can set second type var
+    if (const IncompleteTypeNode *pvn2 = parent2.as<IncompleteTypeNode>()) {
+      auto pv2 = GetRef<IncompleteType>(pvn2);
+      this->uf_map.Set(pv2, parent1);
+      return;
+    }
+
+    // if both parents are not type vars themselves, check alpha-equality
+    AssertAlphaEqual(parent1, parent2);
+    return;
+  }
+
+  // if t is not a type var, then unify with v1's parent if parent is a type
+  // var; else, check alpha-equality for compatibility
+  if (const IncompleteTypeNode *pvn1 = parent1.as<IncompleteTypeNode>()) {
+    auto pv1 = GetRef<IncompleteType>(pvn1);
+    this->uf_map.Set(pv1, t);
+    return;
+  }
+
+  AssertAlphaEqual(parent1, t);
+}
+
+Type UnionFindNode::Find(const IncompleteType& v) {
+  // The node has no mapping, so its representative is just itself.
+  if (this->uf_map.find(v) == this->uf_map.end()) {
+    return v;
+  }
+
+  Type parent = this->uf_map.at(v);
+
+  if (v == parent) {
+    return v;
+  }
+
+  // if parent is not a type var, then it must be the representative type
+  const IncompleteTypeNode *rep = parent.as<IncompleteTypeNode>();
+  if (!rep) {
+    return parent;
+  }
+
+  // otherwise, recurse and perform path compression
+  IncompleteType pv = GetRef<IncompleteType>(rep);
+  Type higher_up = this->Find(pv);
+  this->uf_map.Set(v, higher_up);
+  return higher_up;
+}
+
+TVM_REGISTER_API("relay._make.UnionFind")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      if (args.size() == 0) {
+        *ret = UnionFindNode::make({});
+      } else {
+        *ret = UnionFindNode::make(args[0]);
+      }
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<UnionFindNode>([](const UnionFindNode *node,
+                                    tvm::IRPrinter *p) {
+      p->stream << "UnionFindNode(" << node->uf_map << ")";
+    });
+
+TypeUnifier TypeUnifierNode::make(UnionFind union_find) {
+  std::shared_ptr<TypeUnifierNode> n = std::make_shared<TypeUnifierNode>();
+  n->union_find = union_find;
+  return TypeUnifier(n);
+}
+
+void TypeUnifierNode::Insert(const IncompleteType& v) {
+  this->union_find->Insert(v);
+}
+
+Type TypeUnifierNode::Unify(const Type& t1, const Type& t2) {
+  RELAY_LOG(INFO) << "TypeUnifierNode::unify: t1=" << t1 << " t2=" << t2
+                  << std::endl;
+
+  Type unified = this->VisitType(t1, t2);
+  // TODO(@jroesch): Restore this code when we finish kind checker.
+  // if (!check_kind(unified)) {
+  // throw UnificationError("Invalid kinds in unified type");
+  // }
+  return unified;
+}
+
+struct IncompleteTypeSubst : TypeMutator {
+  const TypeUnifierNode *unifier;
+
+  IncompleteTypeSubst(const TypeUnifierNode *unifier) : unifier(unifier) {}
+
+  // type var: look it up in the type map and recurse
+  Type VisitType_(const IncompleteTypeNode* op) override {
+    auto tv = GetRef<IncompleteType>(op);
+    auto parent = unifier->union_find->Find(tv);
+    if (parent == tv) {
+      return tv;
+    }
+    return this->VisitType(parent);
+  }
+};
+
+Type TypeUnifierNode::Subst(const Type& t) {
+  IncompleteTypeSubst tvsubst(this);
+  // normalize first so substitutions in quantifiers will be correct
+  Type ret = tvsubst.VisitType(t);
+  // TODO(@jroesch): Restore this code when we finish kind checker.
+  // if (!check_kind(ret)) {
+  // std::stringstream ss;
+  // ss << "Invalid Kinds in substituted type!";
+  // ss << t << std::endl;
+  // ss << ret << std::endl;
+  // throw SubstitutionError(ss.str());
+  // }
+  return ret;
+}
+
+Type TypeUnifierNode::VisitType(const Type& t1, const Type t2) {
+  // When the right hand size is a type variable immediately unify.
+  if (const IncompleteTypeNode *tvn2 = t2.as<IncompleteTypeNode>()) {
+    return this->UnifyWithIncompleteType(t1, GetRef<IncompleteType>(tvn2));
+  } else {
+    return TypeFunctor<Type(const Type &t1, const Type t2)>::VisitType(t1, t2);
+  }
+}
+
+Type TypeUnifierNode::UnifyWithIncompleteType(const Type& t1,
+                                              const IncompleteType tv2) {
+  RELAY_LOG(INFO) << "unifyWithIncompleteType: t1=" << t1 << " t2=" << tv2
+                  << std::endl;
+  // Fix unify to return new representative
+  this->union_find->Unify(tv2, t1);
+  auto rep = this->union_find->Find(tv2);
+  RELAY_LOG(INFO) << "unifyWithIncompleteType: rep =" << rep << std::endl;
+  return rep;
+}
+
+Type TypeUnifierNode::VisitType_(const IncompleteTypeNode* t1, const Type rt2) {
+  IncompleteType tv1 = GetRef<IncompleteType>(t1);
+  RELAY_LOG(INFO) << "VisitType_: IncompleteTypeNode t1=" << t1 << " = " << rt2
+                  << std::endl;
+  this->union_find->Unify(tv1, rt2);
+  auto rep = this->union_find->Find(tv1);
+  RELAY_LOG(INFO) << "VisitType_: IncompleteTypeNode rep=" << rep << std::endl;
+  return rep;
+}
+
+Type TypeUnifierNode::VisitType_(const TypeParamNode* t1, const Type rt2) {
+  TypeParam ti1 = GetRef<TypeParam>(t1);
+
+  if (const TypeParamNode *tin2 = rt2.as<TypeParamNode>()) {
+    TypeParam ti2 = GetRef<TypeParam>(tin2);
+
+    if (ti1 != ti2) {
+      throw UnificationError("Attempting to unify non-matching TypeParams");
+    }
+
+    return ti1;
+  }
+
+  throw UnificationError("Unable to unify TypeParamNode");
+}
+
+Type TypeUnifierNode::VisitType_(const FuncTypeNode* t1, const Type rt2) {
+  FuncType ft1 = GetRef<FuncType>(t1);
+
+  if (const FuncTypeNode *tan2 = rt2.as<FuncTypeNode>()) {
+    FuncType ft2 = GetRef<FuncType>(tan2);
+
+    if (ft1->type_params.size() != ft2->type_params.size()) {
+      throw UnificationError(
+          "unable to unify functions with differing number of type parameters");
+    }
+
+    tvm::Map<TypeParam, Type> subst_map;
+
+    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
+      subst_map.Set(ft1->type_params[i], ft2->type_params[i]);
+    }
+
+    ft1 = Downcast<FuncType>(TypeSubst(ft1, subst_map));
+
+    if (ft1->arg_types.size() != ft2->arg_types.size()) {
+      throw UnificationError("unable to unify functions of different arities");
+    }
+
+    tvm::Array<Type> unified_args;
+    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
+      unified_args.push_back(
+          this->VisitType(ft1->arg_types[i], ft2->arg_types[i]));
+    }
+
+    Type unified_ret_type = this->VisitType(ft1->ret_type, ft2->ret_type);
+
+    return FuncTypeNode::make(unified_args, unified_ret_type, {}, {});
+  }
+
+  throw UnificationError("unable to unify function types");
+}
+
+Type TypeUnifierNode::VisitType_(const TensorTypeNode* t1, const Type rt2) {
+  TensorType tt1 = GetRef<TensorType>(t1);
+
+  if (const TensorTypeNode *ttn2 = rt2.as<TensorTypeNode>()) {
+    TensorType tt2 = GetRef<TensorType>(ttn2);
+
+    if (!AlphaEqual(tt1, tt2)) {
+      throw UnificationError("dtypes do not match");
+    }
+
+    RELAY_LOG(INFO) << "Unify Tensor Shape s1=" << tt1->shape
+                    << " s2= " << tt2->shape << std::endl;
+
+    if (tt1->shape.size() != tt2->shape.size()) {
+      throw UnificationError("shapes are not of the same length");
+    }
+
+    for (size_t i = 0U; i < tt1->shape.size(); i++) {
+      if (!tt1->shape[i].same_as(tt2->shape[i])) {
+        throw UnificationError("shapes do not match at index");
+      }
+    }
+
+    return rt2;
+  }
+
+  throw UnificationError("Cannot unify TensorTypeNode");
+}
+
+Type TypeUnifierNode::VisitType_(const TupleTypeNode* t1, const Type rt2) {
+  TupleType pt1 = GetRef<TupleType>(t1);
+
+  if (const TupleTypeNode *ptn2 = rt2.as<TupleTypeNode>()) {
+    TupleType pt2 = GetRef<TupleType>(ptn2);
+
+    std::vector<Type> unified_fields;
+    if (pt1->fields.size() != pt2->fields.size()) {
+      throw UnificationError("Product types are of different dimensions");
+    }
+
+    for (size_t i = 0U; i < pt1->fields.size(); i++) {
+      Type unified = this->VisitType(pt1->fields[i], pt2->fields[i]);
+      unified_fields.push_back(unified);
+    }
+
+    return TupleTypeNode::make(unified_fields);
+  }
+
+  throw UnificationError("Cannot unify TupleTypeNode");
+}
+
+Type TypeUnifierNode::VisitType_(const TypeRelationNode* tr1, const Type t2) {
+  throw InternalError("Cannot unify different type relations");
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/unifier.h b/src/relay/pass/unifier.h
new file mode 100644
index 000000000000..4e939cc26bca
--- /dev/null
+++ b/src/relay/pass/unifier.h
@@ -0,0 +1,141 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file include/tvm/relay/pass/unifier.h
+ * \brief The type unifier which solves a system of equations between
+ * incomplete types.
+ */
+#ifndef TVM_RELAY_PASS_UNIFIER_H_
+#define TVM_RELAY_PASS_UNIFIER_H_
+
+#include <tvm/relay/expr.h>
+#include <string>
+#include "./type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+struct UnionFindError : dmlc::Error {
+  explicit UnionFindError(const std::string& msg) : Error(msg) {}
+};
+
+struct UnificationError : dmlc::Error {
+  explicit UnificationError(const std::string& msg) : Error(msg) {}
+};
+
+struct SubstitutionError : dmlc::Error {
+  explicit SubstitutionError(const std::string& msg) : Error(msg) {}
+};
+
+/*! \brief A union-find data structure for the type-checker */
+class UnionFind;
+
+class UnionFindNode : public Node {
+ public:
+  /*! \brief The inernal map from incomplete types to their representatives. */
+  tvm::Map<IncompleteType, Type> uf_map;
+
+  UnionFindNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("uf_map", &uf_map); }
+
+  TVM_DLL static UnionFind make(tvm::Map<IncompleteType, Type> uf_map);
+
+  /*! \brief Insert it into the union find.
+  * \param it The type to add to the union find.
+  */
+  void Insert(const IncompleteType& it);
+
+  /*! \brief Union operation, combine two equivalence classes.
+  * \param it The incomplete type to unify.
+  * \param ty The other type.
+  */
+  void Unify(const IncompleteType& it, const Type& t);
+
+  /*! \brief Find operation, returns the representative of the argument.
+  * \param it The element to lookup.
+  */
+  Type Find(const IncompleteType& it);
+
+  void debug();
+
+  void AssertAlphaEqual(const Type& l, const Type& r);
+
+  static constexpr const char* _type_key = "relay.UnionFind";
+  TVM_DECLARE_NODE_TYPE_INFO(UnionFindNode, Node);
+};
+
+class UnionFind : public NodeRef {
+ public:
+  UnionFind() {}
+  explicit UnionFind(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+
+  // The union find structure is mutable so we do not use the standard macros
+  // and expose the pointer via `->`.
+  UnionFindNode* operator->() const {
+    return static_cast<UnionFindNode*>(node_.get());
+  }
+
+  using ContainerType = UnionFindNode;
+};
+
+class TypeUnifier;
+class TypeUnifierNode : public Node,
+                        private TypeFunctor<Type(const Type&, const Type)> {
+ public:
+  UnionFind union_find;
+
+  TypeUnifierNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("union_find", &union_find); }
+
+  TVM_DLL static TypeUnifier make(UnionFind uf);
+
+  /*! \brief Introduces a new type var into the unifier */
+  void Insert(const IncompleteType& v);
+
+  /*! \brief Unifies two types if possible, throws a unification error if it
+   * cannot  */
+  Type Unify(const Type& t1, const Type& t2);
+
+  /*! \brief Attempts to substitute all type vars in t with concrete types,
+   * throws substitution error if it cannot concretize*/
+  Type Subst(const Type& t);
+
+  // /*! \brief Checks the kinds in the given type */
+  // Type CheckKinds(const Type& t);
+
+  static constexpr const char* _type_key = "relay.TypeUnifier";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeUnifierNode, Node);
+
+ private:
+  /*! \brief Unify incomplete type with another type. */
+  Type UnifyWithIncompleteType(const Type& t1, const IncompleteType tvn2);
+  /*! \brief Implements unification between two types with incomplete portions.
+   */
+  Type VisitType(const Type& t1, const Type t2) override;
+
+  // Visitor Cases
+  Type VisitType_(const IncompleteTypeNode* t1, const Type t2) override;
+  Type VisitType_(const TensorTypeNode* t1, const Type t2) override;
+  Type VisitType_(const TypeParamNode* t1, const Type t2) override;
+  Type VisitType_(const FuncTypeNode* t1, const Type t2) override;
+  Type VisitType_(const TupleTypeNode* t1, const Type t2) override;
+  Type VisitType_(const TypeRelationNode* s1, const Type t2) override;
+};
+
+class TypeUnifier : public NodeRef {
+ public:
+  TypeUnifier() {}
+  explicit TypeUnifier(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+
+  // no const so that unifier can be mutable as a member of typechecker
+  inline TypeUnifierNode* operator->() const {
+    return static_cast<TypeUnifierNode*>(node_.get());
+  }
+
+  using ContainerType = TypeUnifierNode;
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_UNIFIER_H_
diff --git a/tests/python/relay/test_ir_builder.py b/tests/python/relay/test_ir_builder.py
new file mode 100644
index 000000000000..c98f920ca491
--- /dev/null
+++ b/tests/python/relay/test_ir_builder.py
@@ -0,0 +1,20 @@
+import numpy as np
+from tvm.relay.expr import Let, Constant
+from tvm.relay.ir_builder import IRBuilder
+
+def test_let():
+    b = IRBuilder()
+    x = b.let('x', 1)
+    b.ret(x)
+    prog, _ = b.get()
+    assert isinstance(prog, Let)
+    var = prog.var
+    value = prog.value
+    assert var.name_hint == 'x'
+    assert var == prog.body
+    assert isinstance(value, Constant)
+    assert value.data.asnumpy() == np.array(1)
+    assert prog.value_type == None
+
+if __name__ == "__main__":
+    test_let()
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
new file mode 100644
index 000000000000..803b3d0faa0c
--- /dev/null
+++ b/tests/python/relay/test_ir_nodes.py
@@ -0,0 +1,159 @@
+""" test ir"""
+import tvm
+from tvm import relay
+from tvm.expr import *
+
+# Span
+def test_span():
+    span = relay.Span(None, 1, 1)
+    assert span.source == None
+    assert span.lineno == 1
+    assert span.col_offset == 1
+    assert span.same_as(span)
+    assert span == span
+    assert isinstance(span, relay.base.Span)
+    str(span)
+
+# Types
+
+def test_tensor_type():
+    shape = tvm.convert([1, 2, 3])
+    dtype = 'float32'
+    tt = relay.TensorType(shape, dtype)
+    assert tt.dtype == dtype
+    assert tt.shape == shape
+    assert tt.span == None
+    str(tt)
+
+
+def test_type_param():
+    tp = relay.TypeParam('name', relay.Kind.Shape)
+    tp.kind == relay.Kind.Shape
+    tp.span  # TODO allow us to set span
+    str(tp)
+
+
+def test_func_type():
+    type_params = tvm.convert([])
+    type_constraints = tvm.convert([])  # TODO: fill me in
+    arg_types = tvm.convert([])
+    ret_type = None
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert tf.type_params == type_params
+    assert tf.type_constraints == type_constraints
+    assert tf.arg_types == arg_types
+    assert tf.ret_type == ret_type
+    assert tf.span == None
+    # TODO make sure we can set
+    str(tf)
+
+
+def test_constant():
+    arr = tvm.nd.array(10)
+    const = relay.Constant(arr)
+    assert const.data == arr
+    assert const.span == None
+    str(const)
+
+
+def test_tuple():
+    fields = tvm.convert([])
+    tup = relay.Tuple(fields)
+    assert tup.fields == fields
+    assert tup.span == None
+    str(tup)
+
+
+def test_local_var():
+    name_hint = 's'
+    lv = relay.Var(name_hint)
+    lv.name_hint == name_hint
+    # assert lv.span == None todo(@jroesch): what do we do about spans
+    str(lv)
+
+
+def test_global_var():
+    name_hint = 'g'
+    gv = relay.GlobalVar(name_hint)
+    gv.name_hint == name_hint
+    # assert lv.span == None todo(@jroesch): what do we do about spans
+    str(gv)
+
+
+def test_param():
+    lv = relay.Var('x')
+    ty = None
+    param = relay.Param(lv, ty)
+    assert param.var == lv
+    assert param.type == ty
+    assert param.span == None
+    str(param)
+
+
+def test_function():
+    param_names = ['a', 'b', 'c', 'd']
+    params = tvm.convert([relay.Param(relay.Var(n), None) for n in param_names])
+    ret_type = None
+    body = None
+    type_params = tvm.convert([])
+    fn = relay.Function(params, ret_type, body, type_params)
+    assert fn.params == params
+    assert fn.body == body
+    assert fn.type_params == type_params
+    assert fn.span == None
+    str(fn)
+
+
+def test_call():
+    op = relay.Var('f')
+    arg_names = ['a', 'b', 'c', 'd']
+    args = tvm.convert([relay.Var(n) for n in arg_names])
+    call = relay.Call(op, args, None, None)
+    assert call.op == op
+    assert call.args == args
+    assert call.span == None
+    str(call)
+
+
+def test_let():
+    lv = relay.Var('x')
+    ty = None
+    arr = tvm.nd.array(10)
+    value = relay.Constant(arr)
+    # I would prefer that the order of arguments
+    # matches syntax let x: t = v in b
+    let = relay.Let(lv, value, lv, ty)
+    assert let.var == lv
+    assert let.value == value
+    assert let.value_type == ty
+    assert let.body == lv
+    assert let.span == None
+    str(let)
+
+
+def test_if():
+    cond = relay.Var('cond')
+    left = relay.Var('left')
+    right = relay.Var('right')
+    ife = relay.If(cond, left, right)
+    assert ife.cond == cond
+    assert ife.true_branch == left
+    assert ife.false_branch == right
+    assert ife.span == None
+    str(ife)
+
+
+if __name__ == "__main__":
+    test_span()
+    test_tensor_type()
+    test_type_param()
+    test_func_type()
+    test_constant()
+    test_tuple()
+    test_local_var()
+    test_global_var()
+    test_param()
+    test_function()
+    test_call()
+    test_let()
+    test_if()
diff --git a/tests/python/relay/test_relay_op.py b/tests/python/relay/test_relay_op.py
new file mode 100644
index 000000000000..1f95a3f72c15
--- /dev/null
+++ b/tests/python/relay/test_relay_op.py
@@ -0,0 +1,27 @@
+from tvm import relay
+
+def test_op_attr():
+    log_op = relay.op.get("log")
+
+    @relay.op.register("exp", "ftest")
+    def test(x):
+        return x + 1
+
+    assert log_op.num_inputs  == 1
+    assert log_op.get_attr("ftest") is None
+    assert relay.op.get("exp").get_attr("ftest")(1) == 2
+
+def test_op_level1():
+    x = relay.Var("x")
+
+    for op_name in ["log", "exp", "sqrt"]:
+        y = getattr(relay, op_name)(x)
+        assert y.op.name == op_name
+        assert y.op.support_level == 1
+        assert y.args[0] == x
+
+
+if __name__ == "__main__":
+    test_op_attr()
+    test_op_level1()
+
diff --git a/tests/python/relay/test_tyck_eval_integration.py b/tests/python/relay/test_tyck_eval_integration.py
new file mode 100644
index 000000000000..d95cda0ba819
--- /dev/null
+++ b/tests/python/relay/test_tyck_eval_integration.py
@@ -0,0 +1,162 @@
+"""Test that type checker correcly computes types
+   for expressions.
+"""
+import tvm
+import numpy as np
+from tvm.relay.ir_pass import check_expr
+from tvm.relay.ir_builder import IRBuilder, func_type
+from tvm.relay.ir_builder import scalar_type, convert, tensor_type
+from tvm.relay.env import Environment
+from tvm.relay.op import log, add, equal, subtract, concat
+from tvm.relay.expr import Function
+
+def assert_has_type(expr, typ, env=Environment({})):
+    checked_expr = check_expr(env, expr)
+    assert checked_expr.checked_type() == typ
+
+
+def assert_decl_has_type(env, name, typ):
+    func = env[name]
+    assert func.checked_type() == typ
+
+
+def test_monomorphic_let():
+    "Program: let x = 1; return x"
+    b = IRBuilder()
+    x = b.let('x', 1.0, value_type=scalar_type('float64'))
+    b.ret(x)
+
+    prog, env = b.get()
+    assert_has_type(prog, scalar_type('float64'))
+
+
+def test_single_op():
+    "Program: fn (x : float32) { let t1 = f(x); t1 }"
+    b = IRBuilder()
+    with b.function(('x', 'float32')) as func:
+        x, = func.param_ids()
+        t1 = b.let('t1', log(x))
+        b.ret(t1)
+    assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
+
+def test_add_op():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    b = IRBuilder()
+    x = b.param('x', tensor_type(5, 5, 5))
+    y = b.param('y', tensor_type(5, 5, 5))
+    with b.function(x, y) as func:
+        b.ret(add(x.var, y.var))
+    b.ret(func)
+    prog, env = b.get()
+    ttype = tensor_type(5, 5, 5)
+    expected_ty = func_type([ttype, ttype], ttype)
+    assert_has_type(func.to_func(), expected_ty)
+
+def test_add_broadcast_op():
+    """
+    Program:
+        fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
+            return x + y;
+        }
+    """
+    b = IRBuilder()
+    x = b.param('x', tensor_type(10, 4))
+    y = b.param('y', tensor_type(5, 10, 1))
+    with b.function(x, y) as func:
+        b.ret(add(x.var, y.var))
+    b.ret(func)
+    prog, env = b.get()
+    ttype = tensor_type(5, 5, 5)
+    expected_ty = func_type([ttype, ttype], ttype)
+    assert_has_type(func.to_func(), expected_ty)
+
+def test_dual_op():
+    """Program: 
+       fn (x : Tensor[f32, (10, 10)]) { 
+         let t1 = log(x); 
+         let t2 = add(t1, x); 
+         return t1;
+       }
+    """
+    b = IRBuilder()
+    with b.function(('x', tensor_type(10, 10))) as func:
+        x, = func.param_ids()
+        t1 = b.let('t1', log(x))
+        t2 = b.let('t2', add(t1, x))
+        b.ret(t2)
+    assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
+
+
+def test_decl():
+    """Program: 
+       def f(x : Tensor[f32, (10, 10)]) { 
+           let lx = log(x);
+           return lx;
+       }
+    """
+    b = IRBuilder()
+    x = b.param('x')
+    with b.decl('f', x):
+        lx = b.let('lx', log(x))
+        b.ret(lx)
+    _, env = b.get()
+    assert_decl_has_type(env, 'f', func_type(['float32'], 'float32'))
+
+
+def test_recursion():
+    """
+    Program:
+       def f(n: i32, data: f32) -> f32 {
+          if (n == 0) {
+              return f(n - 1, log(data));
+          } else {
+              return data;
+          }
+       }
+       f(2, 10000);
+    """
+    b = IRBuilder()
+    f = b.global_var('f')
+    n = b.param('n', ty='int32')
+    data = b.param('data', ty='float32')
+    with b.decl(f, n, data):
+        with b.if_scope(equal(n, convert(0.0))):
+            b.ret(f(subtract(n, convert(1)), log(data)))
+        with b.else_scope():
+            b.ret(data)
+    b.ret(f(convert(2.0), convert(10000.0)))
+    assert_decl_has_type(b.env, 'f', func_type(
+        ['int32', 'float32'], 'float32'))
+    # TODO(@jroesch): need evaluator or new runtime
+    # to execute this.
+
+def test_concat():
+    """
+    Program:
+        def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
+            return concat(x, y);
+        }
+    """
+    ib = IRBuilder()
+    try_concat2 = ib.global_var('try_concat2')
+    x = ib.param('x', ty=tensor_type(3, 2))
+    y = ib.param('y', ty=tensor_type(2, 2))
+    with ib.decl(try_concat2, x, y):
+        ib.ret(concat(x, y))
+    fn_ty = func_type([tensor_type(3, 2), tensor_type(2, 2)], tensor_type(5, 2))
+    assert_decl_has_type(ib.env, try_concat2, fn_ty)
+
+if __name__ == "__main__":
+    # test_monomorphic_let()
+    # test_single_op()
+    # test_add_op()
+    # test_add_broadcast_op()
+    # test_dual_op()
+    # test_decl()
+    # test_recursion()
+    test_concat()
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 8104bf079502..7dcd5c921905 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -18,6 +18,8 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1
 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1
+TVM_FFI=cython python -m nose -v tests/python/relay || exit -1
+TVM_FFI=ctypes python3 -m nose -v tests/python/relay || exit -1
 
 # Do not enabke OpenGL
 # TVM_FFI=cython python -m nose -v tests/webgl || exit -1

From 7ffcb3d16c09034e44c8adde4c7b61b8fa5b5690 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 19 Sep 2018 17:35:19 -0700
Subject: [PATCH 108/529] [RUNTIME] Improve memory usage for RPC (#1741)

---
 apps/benchmark/README.md                    | 10 ++++--
 apps/benchmark/arm_cpu_imagenet_bench.py    |  5 +--
 apps/benchmark/mobile_gpu_imagenet_bench.py |  6 ++--
 nnvm/python/nnvm/testing/tf.py              |  3 +-
 python/tvm/contrib/graph_runtime.py         | 12 ++++++--
 src/common/ring_buffer.h                    | 34 ++++++++++++++-------
 tutorials/autotvm/tune_nnvm_arm.py          |  5 ++-
 tutorials/autotvm/tune_nnvm_cuda.py         |  5 ++-
 tutorials/autotvm/tune_nnvm_mobile_gpu.py   | 28 ++++++++++++++---
 tutorials/nnvm/deploy_model_on_mali_gpu.py  | 12 +++-----
 tutorials/nnvm/deploy_model_on_rasp.py      | 12 +++-----
 11 files changed, 80 insertions(+), 52 deletions(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index db4be4b8e557..f713684524ef 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -31,8 +31,10 @@ python3 gpu_imagenet_bench.py --model titanx
 ```
 
 ### ARM CPU & Mali GPU
-For embedded deivces, we use RPC infrastructure in TVM to make the management easy.
-So you need to use it for reproducing benchmark results.
+For embedded devices, we use RPC infrastructure in TVM to make the management easy.
+You need to use it for reproducing benchmark results.
+
+**Note**: We use llvm-4.0 in our tuning environment. Mismatch of the LLVM version during tuning and deployment can influence the performance, so you have to use a same version for reproduction.
 
 0. Build TVM with LLVM enabled. [Help](https://docs.tvm.ai/install/from_source.html)
 
@@ -87,6 +89,10 @@ python3 -m tvm.exec.rpc_tracker
   python3 arm_cpu_imagenet_bench.py --model mate10pro --rpc-key mate10pro  
 
   # Mali GPU
+  # NOTE: To make the test environment more stable, we close GUI and lock the frequency
+  sudo /etc/init.d/lightdm stop
+  sudo -i
+  echo performance > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor
   python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
   ```
 
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 931899069700..2d7116475bc5 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -41,15 +41,12 @@ def evaluate_network(network, target, target_host, number):
     print_progress("%-20s uploading..." % network)
     ctx = remote.context(str(target), 0)
     remote.upload(tmp.relpath(filename))
-    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
 
     rlib = remote.load_module(filename)
     module = runtime.create(graph, rlib, ctx)
     data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
     module.set_input('data', data_tvm)
-    module.set_input(**rparams)
-
-    del rparams
+    module.set_input(**params)
 
     # evaluate
     print_progress("%-20s evaluating..." % network)
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
index da1207381c86..8e29fa5dab9a 100644
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -46,9 +46,7 @@ def evaluate_network(network, target, target_host, number):
     module = runtime.create(graph, rlib, ctx)
     data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
     module.set_input('data', data_tvm)
-    module.set_input(**rparams)
-
-    del rparams
+    module.set_input(**params)
 
     # evaluate
     print_progress("%-20s evaluating..." % network)
@@ -87,4 +85,4 @@ def evaluate_network(network, target, target_host, number):
     print("--------------------------------------------------")
 
     for network in networks:
-        evaluate_network(network, target, target_host, args.number)
\ No newline at end of file
+        evaluate_network(network, target, target_host, args.number)
diff --git a/nnvm/python/nnvm/testing/tf.py b/nnvm/python/nnvm/testing/tf.py
index f5b49b2280b4..d89ac497a46f 100644
--- a/nnvm/python/nnvm/testing/tf.py
+++ b/nnvm/python/nnvm/testing/tf.py
@@ -8,12 +8,13 @@
 import os.path
 import collections
 import numpy as np
-from tvm.contrib import util
 
 # Tensorflow imports
 import tensorflow as tf
 from tensorflow.core.framework import graph_pb2
 
+from tvm.contrib import util
+
 ######################################################################
 # Some helper functions
 # ---------------------
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 4819cd3c7364..e49b966e6a1e 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -1,4 +1,6 @@
 """Minimum graph runtime that executes graph containing TVM PackedFunc."""
+import numpy as np
+
 from .._ffi.base import string_types
 from .._ffi.function import get_global_func
 from ..rpc import base as rpc_base
@@ -97,9 +99,13 @@ def set_input(self, key=None, value=None, **params):
         """
         if key:
             self._set_input(key, nd.array(value, ctx=self.ctx))
-        for k, v in params.items():
-            self._set_input(k, nd.array(v, ctx=self.ctx))
-        return self
+
+        if params:
+            # upload big arrays first to avoid memory issue in rpc mode
+            keys = list(params.keys())
+            keys.sort(key=lambda x: -np.prod(params[x].shape))
+            for k in keys:
+                self._set_input(k, nd.array(params[k], ctx=self.ctx))
 
     def run(self, **input_dict):
         """Run forward execution of the graph
diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h
index 421f19466957..dcec54d1823d 100644
--- a/src/common/ring_buffer.h
+++ b/src/common/ring_buffer.h
@@ -36,19 +36,31 @@ class RingBuffer {
    * \param n The size of capacity.
    */
   void Reserve(size_t n) {
-    if (ring_.size() >= n) return;
-    size_t old_size = ring_.size();
-    size_t new_size = ring_.size();
-    while (new_size < n) {
-      new_size *= 2;
-    }
-    ring_.resize(new_size);
-    if (head_ptr_ + bytes_available_ > old_size) {
-      // copy the ring overflow part into the tail.
-      size_t ncopy = head_ptr_ + bytes_available_ - old_size;
-      memcpy(&ring_[0] + old_size, &ring_[0], ncopy);
+    if (ring_.size() < n) {
+        size_t old_size = ring_.size();
+        size_t new_size = static_cast<size_t>(n * 1.2);
+        ring_.resize(new_size);
+        if (head_ptr_ + bytes_available_ > old_size) {
+          // copy the ring overflow part into the tail.
+          size_t ncopy = head_ptr_ + bytes_available_ - old_size;
+          memcpy(&ring_[0] + old_size, &ring_[0], ncopy);
+        }
+    } else if (ring_.size() > n * 8 && ring_.size() > kInitCapacity) {
+        // shrink too large temporary buffer to avoid out of memory on some embedded devices
+        size_t old_bytes = bytes_available_;
+
+        std::vector<char> tmp(old_bytes);
+
+        Read(&tmp[0], old_bytes);
+        ring_.resize(kInitCapacity);
+        ring_.shrink_to_fit();
+
+        memcpy(&ring_[0], &tmp[0], old_bytes);
+        head_ptr_ = 0;
+        bytes_available_ = old_bytes;
     }
   }
+
   /*!
    * \brief Peform a non-blocking read from buffer
    *  size must be smaller than this->bytes_available()
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index 31c634bf2a9b..8f0d74180449 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -327,11 +327,10 @@ def tune_and_evaluate(tuning_opt):
 
         # upload parameters to device
         ctx = remote.context(str(target), 0)
-        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module.set_input('data', data_tvm)
-        module.set_input(**rparams)
+        module.set_input(**params)
 
         # evaluate
         print("Evaluate inference time cost...")
diff --git a/tutorials/autotvm/tune_nnvm_cuda.py b/tutorials/autotvm/tune_nnvm_cuda.py
index 8224276f47f8..1900c87aa40d 100644
--- a/tutorials/autotvm/tune_nnvm_cuda.py
+++ b/tutorials/autotvm/tune_nnvm_cuda.py
@@ -229,11 +229,10 @@ def tune_and_evaluate(tuning_opt):
 
         # load parameters
         ctx = tvm.context(str(target), 0)
-        params_tvm = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module = runtime.create(graph, lib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module.set_input('data', data_tvm)
-        module.set_input(**params_tvm)
+        module.set_input(**params)
 
         # evaluate
         print("Evaluate inference time cost...")
diff --git a/tutorials/autotvm/tune_nnvm_mobile_gpu.py b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
index c7e496c94231..27cdd2e632fc 100644
--- a/tutorials/autotvm/tune_nnvm_mobile_gpu.py
+++ b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
@@ -328,11 +328,10 @@ def tune_and_evaluate(tuning_opt):
 
         # upload parameters to device
         ctx = remote.context(str(target), 0)
-        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module.set_input('data', data_tvm)
-        module.set_input(**rparams)
+        module.set_input(**params)
 
         # evaluate
         print("Evaluate inference time cost...")
@@ -357,9 +356,28 @@ def tune_and_evaluate(tuning_opt):
 #
 #    Extract tasks...
 #    Tuning...
-#    [Task  1/17]  Current/Best:   12.22/  36.05 GFLOPS | Progress: (32/1000) | 42.12 s
+#    [Task  1/17]  Current/Best:   25.30/  39.12 GFLOPS | Progress: (992/1000) | 751.22 s Done.
+#    [Task  2/17]  Current/Best:   40.70/  45.50 GFLOPS | Progress: (736/1000) | 545.46 s Done.
+#    [Task  3/17]  Current/Best:   38.83/  42.35 GFLOPS | Progress: (992/1000) | 1549.85 s Done.
+#    [Task  4/17]  Current/Best:   23.31/  31.02 GFLOPS | Progress: (640/1000) | 1059.31 s Done.
+#    [Task  5/17]  Current/Best:    0.06/   2.34 GFLOPS | Progress: (544/1000) | 305.45 s Done.
+#    [Task  6/17]  Current/Best:   10.97/  17.20 GFLOPS | Progress: (992/1000) | 1050.00 s Done.
+#    [Task  7/17]  Current/Best:    8.98/  10.94 GFLOPS | Progress: (928/1000) | 421.36 s Done.
+#    [Task  8/17]  Current/Best:    4.48/  14.86 GFLOPS | Progress: (704/1000) | 582.60 s Done.
+#    [Task  9/17]  Current/Best:   10.30/  25.99 GFLOPS | Progress: (864/1000) | 899.85 s Done.
+#    [Task 10/17]  Current/Best:   11.73/  12.52 GFLOPS | Progress: (608/1000) | 304.85 s Done.
+#    [Task 11/17]  Current/Best:   15.26/  18.68 GFLOPS | Progress: (800/1000) | 747.52 s Done.
+#    [Task 12/17]  Current/Best:   17.48/  26.71 GFLOPS | Progress: (1000/1000) | 1166.40 s Done.
+#    [Task 13/17]  Current/Best:    0.96/  11.43 GFLOPS | Progress: (960/1000) | 611.65 s Done.
+#    [Task 14/17]  Current/Best:   17.88/  20.22 GFLOPS | Progress: (672/1000) | 670.29 s Done.
+#    [Task 15/17]  Current/Best:   11.62/  13.98 GFLOPS | Progress: (736/1000) | 449.25 s Done.
+#    [Task 16/17]  Current/Best:   19.90/  23.83 GFLOPS | Progress: (608/1000) | 708.64 s Done.
+#    [Task 17/17]  Current/Best:   17.98/  22.75 GFLOPS | Progress: (736/1000) | 1122.60 s Done.
+#    Compile...
+#    Upload...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 128.05 ms (7.74 ms)
 #
-#    (The following part is running, will update it later).
 
 ######################################################################
 #
diff --git a/tutorials/nnvm/deploy_model_on_mali_gpu.py b/tutorials/nnvm/deploy_model_on_mali_gpu.py
index 0b1b54899ee7..10aac3a67b94 100644
--- a/tutorials/nnvm/deploy_model_on_mali_gpu.py
+++ b/tutorials/nnvm/deploy_model_on_mali_gpu.py
@@ -132,7 +132,6 @@ def transform_image(image):
 num_classes = 1000
 image_shape = (3, 224, 224)
 data_shape = (batch_size,) + image_shape
-out_shape = (batch_size, num_classes)
 
 ######################################################################
 # Compile The Graph
@@ -197,20 +196,17 @@ def transform_image(image):
 remote.upload(lib_fname)
 rlib = remote.load_module('net.tar')
 
-ctx = remote.cpu(0) if local_demo else remote.cl(0)
-# upload the parameter
-rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
 # create the remote runtime module
+ctx = remote.cl(0) if not local_demo else remote.cpu(0)
 module = runtime.create(graph, rlib, ctx)
-# set parameter
-module.set_input(**rparams)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
 # set input data
 module.set_input('data', tvm.nd.array(x.astype('float32')))
 # run
 module.run()
 # get output
-out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
+out = module.get_output(0)
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index e5737a96489f..807365829809 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -128,7 +128,6 @@ def transform_image(image):
 num_classes = 1000
 image_shape = (3, 224, 224)
 data_shape = (batch_size,) + image_shape
-out_shape = (batch_size, num_classes)
 
 ######################################################################
 # Compile The Graph
@@ -188,20 +187,17 @@ def transform_image(image):
 remote.upload(lib_fname)
 rlib = remote.load_module('net.tar')
 
-# upload the parameter (this may take a while)
-ctx = remote.cpu(0)
-rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
 # create the remote runtime module
+ctx = remote.cpu(0)
 module = runtime.create(graph, rlib, ctx)
-# set parameter
-module.set_input(**rparams)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
 # set input data
 module.set_input('data', tvm.nd.array(x.astype('float32')))
 # run
 module.run()
 # get output
-out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
+out = module.get_output(0)
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))

From 175f9c821644da37f08cdc9236162a827ca8f2b5 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Sep 2018 19:56:45 -0700
Subject: [PATCH 109/529] [NODE] Node base system refactor (#1739)

---
 HalideIR                                  |   2 +-
 include/tvm/api_registry.h                |   4 +-
 include/tvm/arithmetic.h                  |   2 +-
 include/tvm/attrs.h                       |  10 +-
 include/tvm/base.h                        |  10 +-
 include/tvm/buffer.h                      |   4 +-
 include/tvm/build_module.h                |   6 +-
 include/tvm/channel.h                     |   2 +-
 include/tvm/expr.h                        |   6 +-
 include/tvm/ir.h                          |   2 +-
 include/tvm/ir_functor_ext.h              |   2 +-
 include/tvm/ir_mutator.h                  |   2 +-
 include/tvm/ir_pass.h                     |   1 -
 include/tvm/ir_visitor.h                  |   2 +-
 include/tvm/lowered_func.h                |   4 +-
 include/tvm/node/container.h              | 586 ++++++++++++++++++++++
 include/tvm/node/ir_functor.h             | 254 ++++++++++
 include/tvm/node/memory.h                 |  59 +++
 include/tvm/node/node.h                   | 295 +++++++++++
 include/tvm/packed_func_ext.h             |  22 +-
 include/tvm/relay/base.h                  |  32 +-
 include/tvm/relay/environment.h           |   6 +-
 include/tvm/relay/expr_functor.h          |  12 +-
 include/tvm/relay/op.h                    |   8 +-
 include/tvm/relay/type.h                  |   4 +-
 include/tvm/runtime/ndarray.h             |   8 +-
 include/tvm/runtime/node_base.h           | 241 +++++++++
 include/tvm/runtime/packed_func.h         |  17 +-
 include/tvm/schedule.h                    |   8 +-
 include/tvm/tensor.h                      |   6 +-
 include/tvm/tensor_intrin.h               |   2 +-
 nnvm/src/compiler/compile_engine.cc       |   9 +-
 nnvm/src/compiler/compile_engine.h        |   2 +-
 nnvm/src/compiler/graph_hash.cc           |   3 +-
 nnvm/src/compiler/graph_runtime.cc        |   3 +-
 nnvm/src/compiler/graph_runtime.h         |   1 +
 nnvm/src/compiler/packed_func_ext.cc      |   2 +-
 src/api/api_lang.cc                       |  12 +-
 src/api/dsl_api.cc                        |   2 +-
 src/arithmetic/canonical.cc               |  19 +-
 src/arithmetic/int_set.cc                 |   4 +-
 src/arithmetic/int_set_internal.h         |   8 +-
 src/arithmetic/modular.cc                 |   2 +-
 src/codegen/build_module.cc               |  10 +-
 src/codegen/verilog/verilog_ir.cc         |  22 +-
 src/codegen/verilog/vpi_session.cc        |   4 +-
 src/codegen/verilog/vpi_session.h         |   4 +-
 src/lang/api_registry.cc                  |   4 +-
 src/lang/attrs.cc                         |   2 +-
 src/lang/buffer.cc                        |   4 +-
 src/lang/channel.cc                       |   2 +-
 src/lang/expr.cc                          |   6 +-
 src/lang/ir.cc                            |   4 +-
 src/lang/node.cc                          |  58 +++
 src/lang/reflection.cc                    |  12 +-
 src/lang/tensor.cc                        |   6 +-
 src/op/compute_op.cc                      |   8 +-
 src/op/extern_op.cc                       |   4 +-
 src/op/placeholder_op.cc                  |   2 +-
 src/op/scan_op.cc                         |   4 +-
 src/pass/combine_context_call.cc          |   2 +-
 src/pass/ir_util.cc                       |  14 +-
 src/pass/lower_intrin.cc                  |   2 +-
 src/pass/lower_thread_allreduce.cc        |   2 +-
 src/pass/lower_tvm_builtin.cc             |   2 +-
 src/pass/lower_warp_memory.cc             |   4 +-
 src/pass/make_api.cc                      |   4 +-
 src/pass/remap_thread_axis.cc             |   2 +-
 src/pass/split_host_device.cc             |   6 +-
 src/pass/storage_rewrite.cc               |   3 +-
 src/pass/storage_sync.cc                  |   2 +-
 src/relay/ir/base.cc                      |  43 +-
 src/relay/ir/environment.cc               |  68 +--
 src/relay/ir/expr.cc                      |  72 ++-
 src/relay/ir/op.cc                        |  23 +-
 src/relay/ir/type.cc                      |  11 +-
 src/relay/pass/kind_check.cc              |   3 +-
 src/relay/pass/type_functor.h             |   2 +-
 src/relay/pass/type_infer.cc              |   5 +-
 src/relay/pass/unifier.cc                 |   4 +-
 src/relay/pass/unifier.h                  |   4 +-
 src/schedule/schedule_dataflow_rewrite.cc |  10 +-
 src/schedule/schedule_lang.cc             |  42 +-
 tests/cpp/ir_functor_test.cc              |   2 +-
 tests/scripts/task_python_integration.sh  |   1 +
 85 files changed, 1822 insertions(+), 353 deletions(-)
 create mode 100644 include/tvm/node/container.h
 create mode 100644 include/tvm/node/ir_functor.h
 create mode 100644 include/tvm/node/memory.h
 create mode 100644 include/tvm/node/node.h
 create mode 100644 include/tvm/runtime/node_base.h
 create mode 100644 src/lang/node.cc

diff --git a/HalideIR b/HalideIR
index f519848d972c..cf6090aeaeb7 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit f519848d972c67971b4cbf8c34070d5a5e3ede0d
+Subproject commit cf6090aeaeb782d1daff54b0ca5c2c281d7008db
diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index d6e9910ab1ee..1532872397c3 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -57,7 +57,7 @@ class EnvFuncNode : public Node {
 class EnvFunc : public NodeRef {
  public:
   EnvFunc() {}
-  explicit EnvFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit EnvFunc(NodePtr<Node> n) : NodeRef(n) {}
   /*! \return The internal global function pointer */
   const EnvFuncNode* operator->() const {
     return static_cast<EnvFuncNode*>(node_.get());
@@ -105,7 +105,7 @@ class TypedEnvFunc<R(Args...)> : public NodeRef {
   /*! \brief short hand for this function type */
   using TSelf = TypedEnvFunc<R(Args...)>;
   TypedEnvFunc() {}
-  explicit TypedEnvFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit TypedEnvFunc(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Assign global function to a TypedEnvFunc
    * \param other Another global function.
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index 54875bbbf474..fe0405264c51 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -38,7 +38,7 @@ class IntSet : public NodeRef {
   /*! \brief constructor */
   IntSet() {}
   // constructor from not container.
-  explicit IntSet(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IntSet(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 7cd77a92d0dd..7071dad07214 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -136,7 +136,7 @@ class Attrs : public NodeRef {
   // normal constructor
   Attrs() {}
   // construct from shared ptr.
-  explicit Attrs(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Attrs(NodePtr<Node> n) : NodeRef(n) {}
 
   /*! \return The attribute node */
   const BaseAttrsNode* operator->() const {
@@ -442,7 +442,7 @@ class AttrDocEntry {
  public:
   using TSelf = AttrDocEntry;
 
-  explicit AttrDocEntry(std::shared_ptr<AttrFieldInfoNode> info)
+  explicit AttrDocEntry(NodePtr<AttrFieldInfoNode> info)
       : info_(info) {
   }
   TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
@@ -466,15 +466,15 @@ class AttrDocEntry {
   }
 
  private:
-  std::shared_ptr<AttrFieldInfoNode> info_;
+  NodePtr<AttrFieldInfoNode> info_;
 };
 
 class AttrDocVisitor {
  public:
   template<typename T>
   AttrDocEntry operator()(const char* key, T* v) {
-    std::shared_ptr<AttrFieldInfoNode> info
-        = std::make_shared<AttrFieldInfoNode>();
+    NodePtr<AttrFieldInfoNode> info
+        = make_node<AttrFieldInfoNode>();
     info->name = key;
     info->type_info = TypeName<T>::value;
     fields_.push_back(AttrFieldInfo(info));
diff --git a/include/tvm/base.h b/include/tvm/base.h
index c2d796b6002c..7104688aa169 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -8,7 +8,7 @@
 
 #include <dmlc/logging.h>
 #include <dmlc/registry.h>
-#include <tvm/node.h>
+#include <tvm/node/node.h>
 #include <string>
 #include <memory>
 #include <functional>
@@ -25,7 +25,7 @@ using ::tvm::AttrVisitor;
   class TypeName : public ::tvm::NodeRef {                       \
    public:                                                       \
     TypeName() {}                                                 \
-    explicit TypeName(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}   \
+    explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {}     \
     const NodeName* operator->() const {                          \
       return static_cast<const NodeName*>(node_.get());           \
     }                                                             \
@@ -48,7 +48,7 @@ std::string SaveJSON(const NodeRef& node);
  *
  * \return The shared_ptr of the Node.
  */
-std::shared_ptr<Node> LoadJSON_(std::string json_str);
+NodePtr<Node> LoadJSON_(std::string json_str);
 
 /*!
  * \brief Load the node from json string.
@@ -85,7 +85,7 @@ struct NodeFactoryReg {
    *        If this is not empty then FGlobalKey
    * \return The created function.
    */
-  using FCreate = std::function<std::shared_ptr<Node>(const std::string& global_key)>;
+  using FCreate = std::function<NodePtr<Node>(const std::string& global_key)>;
   /*!
    * \brief Global key function, only needed by global objects.
    * \param node The node pointer.
@@ -123,7 +123,7 @@ struct NodeFactoryReg {
 #define TVM_REGISTER_NODE_TYPE(TypeName)                                \
   static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
       ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \
-      .set_creator([](const std::string&) { return std::make_shared<TypeName>(); })
+      .set_creator([](const std::string&) { return ::tvm::make_node<TypeName>(); })
 
 
 #define TVM_STRINGIZE_DETAIL(x) #x
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 0f591299718e..5901a27fe1ce 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -6,11 +6,11 @@
 #ifndef TVM_BUFFER_H_
 #define TVM_BUFFER_H_
 
-#include <tvm/container.h>
 #include <string>
 
 #include "base.h"
 #include "expr.h"
+#include "node/container.h"
 
 namespace tvm {
 
@@ -31,7 +31,7 @@ enum class AccessMask : int {
 class Buffer : public NodeRef {
  public:
   Buffer() {}
-  explicit Buffer(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Buffer(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Return a new buffer that is equivalent with current one
    *  but always add stride field.
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 5dc832041410..7aafad4216e1 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -69,7 +69,7 @@ class TargetNode : public Node {
 class Target : public NodeRef {
  public:
   Target() {}
-  explicit Target(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Target(NodePtr<Node> n) : NodeRef(n) {}
 
   /*!
   * \brief Create a Target given a string
@@ -241,7 +241,7 @@ class BuildConfigNode : public Node {
 class BuildConfig : public ::tvm::NodeRef {
  public:
   BuildConfig() {}
-  explicit BuildConfig(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+  explicit BuildConfig(NodePtr<::tvm::Node> n) : NodeRef(n) {}
 
   const BuildConfigNode* operator->() const {
     return static_cast<const BuildConfigNode*>(node_.get());
@@ -335,7 +335,7 @@ class GenericFuncNode;
 class GenericFunc : public NodeRef {
  public:
   GenericFunc() {}
-  explicit GenericFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit GenericFunc(NodePtr<Node> n) : NodeRef(n) {}
 
   /*!
    * \brief Set the default function implementaiton.
diff --git a/include/tvm/channel.h b/include/tvm/channel.h
index 28d9b5f7ce4a..051b57a194c4 100644
--- a/include/tvm/channel.h
+++ b/include/tvm/channel.h
@@ -17,7 +17,7 @@ class Channel : public NodeRef {
  public:
   /*! \brief default constructor  */
   Channel() {}
-  explicit Channel(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Channel(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index fb2233dacb69..a199d656caf8 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -76,7 +76,7 @@ class Var : public HalideIR::VarExpr {
  public:
   EXPORT explicit Var(const std::string& name_hint = "v",
                Type t = Int(32)) : VarExpr(name_hint, t) {}
-  explicit Var(std::shared_ptr<Node> n) : VarExpr(n) {}
+  explicit Var(NodePtr<Node> n) : VarExpr(n) {}
   explicit Var(VarExpr v) : VarExpr(v) {}
   /*!
    * \brief Make a new copy of var with same type, append suffix
@@ -107,7 +107,7 @@ class Range : public HalideIR::IR::Range {
  public:
   /*! \brief constructor */
   Range() {}
-  explicit Range(std::shared_ptr<Node> n) : HalideIR::IR::Range(n) {}
+  explicit Range(NodePtr<Node> n) : HalideIR::IR::Range(n) {}
   /*!
    * \brief constructor by begin and end
    * \param begin The begin of the range.
@@ -197,7 +197,7 @@ class IterVar : public NodeRef {
   // construct a new iter var without a domain
   IterVar() {}
   // construct from shared ptr.
-  explicit IterVar(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVar(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index f73533439dba..b75d75c18182 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -28,7 +28,7 @@ struct CommReducerNode;
 
 struct CommReducer : public NodeRef {
   CommReducer() {}
-  explicit CommReducer(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit CommReducer(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index a9845fdfc898..85d2de75dd99 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -6,7 +6,7 @@
 #ifndef TVM_IR_FUNCTOR_EXT_H_
 #define TVM_IR_FUNCTOR_EXT_H_
 
-#include <tvm/ir_functor.h>
+#include "node/ir_functor.h"
 #include "ir.h"
 
 namespace tvm {
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index 35c82e9f16c1..6b391caf4b5f 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -6,10 +6,10 @@
 #ifndef TVM_IR_MUTATOR_H_
 #define TVM_IR_MUTATOR_H_
 
-#include <tvm/ir_functor.h>
 #include <unordered_map>
 #include "expr.h"
 #include "ir.h"
+#include "node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index cf20dfa1e9f3..ab42cfc9625f 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -9,7 +9,6 @@
 #ifndef TVM_IR_PASS_H_
 #define TVM_IR_PASS_H_
 
-#include <tvm/ir_functor.h>
 #include <arithmetic/Simplify.h>
 #include <unordered_map>
 #include <vector>
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 4b2887b28885..265ec0e56efb 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -6,8 +6,8 @@
 #ifndef TVM_IR_VISITOR_H_
 #define TVM_IR_VISITOR_H_
 
-#include <tvm/ir_functor.h>
 #include "ir.h"
+#include "node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index acb9813339f8..8bd2b1ba84cf 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -7,13 +7,13 @@
 #ifndef TVM_LOWERED_FUNC_H_
 #define TVM_LOWERED_FUNC_H_
 
-#include <tvm/container.h>
 #include <ir/FunctionBase.h>
 #include <string>
 
 #include "base.h"
 #include "expr.h"
 #include "tensor.h"
+#include "node/container.h"
 
 namespace tvm {
 
@@ -27,7 +27,7 @@ class LoweredFuncNode;
 class LoweredFunc : public FunctionRef {
  public:
   LoweredFunc() {}
-  explicit LoweredFunc(std::shared_ptr<Node> n) : FunctionRef(n) {}
+  explicit LoweredFunc(NodePtr<Node> n) : FunctionRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
new file mode 100644
index 000000000000..43adae27671c
--- /dev/null
+++ b/include/tvm/node/container.h
@@ -0,0 +1,586 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/node/container.h
+ * \brief Array/Map container in the DSL graph.
+ */
+#ifndef TVM_NODE_CONTAINER_H_
+#define TVM_NODE_CONTAINER_H_
+
+#include <type_traits>
+#include <vector>
+#include <initializer_list>
+#include <unordered_map>
+#include <utility>
+#include <string>
+#include "node.h"
+#include "memory.h"
+
+namespace tvm {
+
+/*! \brief array node content in array */
+class ArrayNode : public Node {
+ public:
+  /*! \brief the data content */
+  std::vector<NodePtr<Node> > data;
+
+  void VisitAttrs(AttrVisitor* visitor) final {
+     // Visitor to array have no effect.
+  }
+
+  static constexpr const char* _type_key = "Array";
+  TVM_DECLARE_NODE_TYPE_INFO(ArrayNode, Node);
+};
+
+/*! \brief map node content */
+class MapNode : public Node {
+ public:
+  void VisitAttrs(AttrVisitor* visitor) final {
+     // Visitor to map have no effect.
+  }
+  // hash function
+  struct Hash {
+    size_t operator()(const NodePtr<Node>& n) const {
+      return std::hash<Node*>()(n.get());
+    }
+  };
+  // comparator
+  struct Equal {
+    bool operator()(
+        const NodePtr<Node>& a,
+        const NodePtr<Node>& b) const {
+      return a.get() == b.get();
+    }
+  };
+
+  /*! \brief The corresponding conatiner type */
+  using ContainerType = std::unordered_map<
+   NodePtr<Node>,
+   NodePtr<Node>,
+   Hash, Equal>;
+
+  /*! \brief the data content */
+  ContainerType data;
+
+  static constexpr const char* _type_key = "Map";
+  TVM_DECLARE_NODE_TYPE_INFO(MapNode, Node);
+};
+
+
+/*! \brief specialized map node with string as key */
+class StrMapNode : public Node {
+ public:
+  void VisitAttrs(AttrVisitor* visitor) final {
+     // Visitor to map have no effect.
+  }
+  /*! \brief The corresponding conatiner type */
+  using ContainerType = std::unordered_map<
+    std::string,
+    NodePtr<Node> >;
+
+  /*! \brief the data content */
+  ContainerType data;
+
+  static constexpr const char* _type_key = "StrMap";
+  TVM_DECLARE_NODE_TYPE_INFO(StrMapNode, Node);
+};
+
+/*!
+ * \brief iterator adapter that adapts TIter to return another type.
+ * \tparam Converter a struct that contains converting function
+ * \tparam TIter the content iterator type.
+ */
+template<typename Converter,
+         typename TIter>
+class IterAdapter {
+ public:
+  explicit IterAdapter(TIter iter) : iter_(iter) {}
+  inline IterAdapter& operator++() {  // NOLINT(*)
+    ++iter_;
+    return *this;
+  }
+  inline IterAdapter& operator++(int) {  // NOLINT(*)
+    ++iter_;
+    return *this;
+  }
+  inline IterAdapter operator+(int offset) const {  // NOLINT(*)
+    return IterAdapter(iter_ + offset);
+  }
+  inline bool operator==(IterAdapter other) const {
+    return iter_ == other.iter_;
+  }
+  inline bool operator!=(IterAdapter other) const {
+    return !(*this == other);
+  }
+  inline const typename Converter::ResultType operator*() const {
+    return Converter::convert(*iter_);
+  }
+
+ private:
+  TIter iter_;
+};
+
+/*!
+ * \brief Array container of NodeRef in DSL graph.
+ *  Array implements copy on write semantics, which means array is mutable
+ *  but copy will happen when array is referenced in more than two places.
+ *
+ * operator[] only provide const acces, use Set to mutate the content.
+ * \tparam T The content NodeRef type.
+ */
+template<typename T,
+         typename = typename std::enable_if<std::is_base_of<NodeRef, T>::value>::type >
+class Array : public NodeRef {
+ public:
+  /*!
+   * \brief default constructor
+   */
+  Array() {
+    node_ = make_node<ArrayNode>();
+  }
+  /*!
+   * \brief move constructor
+   * \param other source
+   */
+  Array(Array<T> && other) {  // NOLINT(*)
+    node_ = std::move(other.node_);
+  }
+  /*!
+   * \brief copy constructor
+   * \param other source
+   */
+  Array(const Array<T> &other) { // NOLINT(*)
+    node_ = other.node_;
+  }
+  /*!
+   * \brief constructor from pointer
+   * \param n the container pointer
+   */
+  explicit Array(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief constructor from iterator
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template<typename IterType>
+  Array(IterType begin, IterType end) {
+    assign(begin, end);
+  }
+  /*!
+   * \brief constructor from initializer list
+   * \param init The initalizer list
+   */
+  Array(std::initializer_list<T> init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+  /*!
+   * \brief constructor from vector
+   * \param init The vector
+   */
+  Array(const std::vector<T>& init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+  /*!
+   * \brief move assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Array<T>& operator=(Array<T> && other) {
+    node_ = std::move(other.node_);
+    return *this;
+  }
+  /*!
+   * \brief copy assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Array<T>& operator=(const Array<T> & other) {
+    node_ = other.node_;
+    return *this;
+  }
+  /*!
+   * \brief reset the array to content from iterator.
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template<typename IterType>
+  void assign(IterType begin, IterType end) {
+    auto n = make_node<ArrayNode>();
+    for (IterType it = begin; it != end; ++it) {
+      n->data.push_back((*it).node_);
+    }
+    node_ = std::move(n);
+  }
+  /*!
+   * \brief Read i-th element from array.
+   * \param i The index
+   * \return the i-th element.
+   */
+  inline const T operator[](size_t i) const {
+    return T(static_cast<const ArrayNode*>(node_.get())->data[i]);
+  }
+  /*! \return The size of the array */
+  inline size_t size() const {
+    if (node_.get() == nullptr) return 0;
+    return static_cast<const ArrayNode*>(node_.get())->data.size();
+  }
+  /*!
+   * \brief copy on write semantics
+   *  Do nothing if current handle is the unique copy of the array.
+   *  Otherwise make a new copy of the array to ensure the current handle
+   *  hold a unique copy.
+   *
+   * \return Handle to the internal node container(which ganrantees to be unique)
+   */
+  inline ArrayNode* CopyOnWrite() {
+    if (node_.get() == nullptr || !node_.unique())  {
+      NodePtr<ArrayNode> n = make_node<ArrayNode>();
+      n->data = static_cast<ArrayNode*>(node_.get())->data;
+      NodePtr<Node>(std::move(n)).swap(node_);
+    }
+    return static_cast<ArrayNode*>(node_.get());
+  }
+  /*!
+   * \brief push a new item to the back of the list
+   * \param item The item to be pushed.
+   */
+  inline void push_back(const T& item) {
+    ArrayNode* n = this->CopyOnWrite();
+    n->data.push_back(item.node_);
+  }
+  /*!
+   * \brief set i-th element of the array.
+   * \param i The index
+   * \param value The value to be setted.
+   */
+  inline void Set(size_t i, const T& value) {
+    ArrayNode* n = this->CopyOnWrite();
+    n->data[i] = value.node_;
+  }
+  /*! \return whether array is empty */
+  inline bool empty() const {
+    return size() == 0;
+  }
+  /*! \brief specify container node */
+  using ContainerType = ArrayNode;
+
+  struct Ptr2NodeRef {
+    using ResultType = T;
+    static inline T convert(const NodePtr<Node>& n) {
+      return T(n);
+    }
+  };
+  using iterator = IterAdapter<Ptr2NodeRef,
+                               std::vector<NodePtr<Node> >::const_iterator>;
+
+  using reverse_iterator = IterAdapter<
+    Ptr2NodeRef,
+    std::vector<NodePtr<Node> >::const_reverse_iterator>;
+
+  /*! \return begin iterator */
+  inline iterator begin() const {
+    return iterator(static_cast<const ArrayNode*>(node_.get())->data.begin());
+  }
+  /*! \return end iterator */
+  inline iterator end() const {
+    return iterator(static_cast<const ArrayNode*>(node_.get())->data.end());
+  }
+  /*! \return rbegin iterator */
+  inline reverse_iterator rbegin() const {
+    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rbegin());
+  }
+  /*! \return rend iterator */
+  inline reverse_iterator rend() const {
+    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rend());
+  }
+};
+
+/*!
+ * \brief Map container of NodeRef->NodeRef in DSL graph.
+ *  Map implements copy on write semantics, which means map is mutable
+ *  but copy will happen when array is referenced in more than two places.
+ *
+ * operator[] only provide const acces, use Set to mutate the content.
+ * \tparam K The key NodeRef type.
+ * \tparam V The value NodeRef type.
+ */
+template<typename K,
+         typename V,
+         typename = typename std::enable_if<
+           std::is_base_of<NodeRef, K>::value ||
+           std::is_base_of<std::string, K>::value >::type,
+         typename = typename std::enable_if<std::is_base_of<NodeRef, V>::value>::type>
+class Map : public NodeRef {
+ public:
+  /*!
+   * \brief default constructor
+   */
+  Map() {
+    node_ = make_node<MapNode>();
+  }
+  /*!
+   * \brief move constructor
+   * \param other source
+   */
+  Map(Map<K, V> && other) {  // NOLINT(*)
+    node_ = std::move(other.node_);
+  }
+  /*!
+   * \brief copy constructor
+   * \param other source
+   */
+  Map(const Map<K, V> &other) { // NOLINT(*)
+    node_ = other.node_;
+  }
+  /*!
+   * \brief constructor from pointer
+   * \param n the container pointer
+   */
+  explicit Map(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief constructor from iterator
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template<typename IterType>
+  Map(IterType begin, IterType end) {
+    assign(begin, end);
+  }
+  /*!
+   * \brief constructor from initializer list
+   * \param init The initalizer list
+   */
+  Map(std::initializer_list<std::pair<K, V> > init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+  /*!
+   * \brief constructor from vector
+   * \param init The vector
+   */
+  template<typename Hash, typename Equal>
+  Map(const std::unordered_map<K, V, Hash, Equal>& init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+  /*!
+   * \brief move assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map<K, V>& operator=(Map<K, V> && other) {
+    node_ = std::move(other.node_);
+    return *this;
+  }
+  /*!
+   * \brief copy assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map<K, V>& operator=(const Map<K, V> & other) {
+    node_ = other.node_;
+    return *this;
+  }
+  /*!
+   * \brief reset the array to content from iterator.
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template<typename IterType>
+  void assign(IterType begin, IterType end) {
+    NodePtr<MapNode> n = make_node<MapNode>();
+    for (IterType i = begin; i != end; ++i) {
+      n->data.emplace(std::make_pair(i->first.node_,
+                                     i->second.node_));
+    }
+    node_ = std::move(n);
+  }
+  /*!
+   * \brief Read element from map.
+   * \param key The key
+   * \return the corresonding element.
+   */
+  inline const V operator[](const K& key) const {
+    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
+  }
+  /*!
+   * \brief Read element from map.
+   * \param key The key
+   * \return the corresonding element.
+   */
+  inline const V at(const K& key) const {
+    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
+  }
+  /*! \return The size of the array */
+  inline size_t size() const {
+    if (node_.get() == nullptr) return 0;
+    return static_cast<const MapNode*>(node_.get())->data.size();
+  }
+  /*! \return The size of the array */
+  inline size_t count(const K& key) const {
+    if (node_.get() == nullptr) return 0;
+    return static_cast<const MapNode*>(node_.get())->data.count(key.node_);
+  }
+  /*!
+   * \brief copy on write semantics
+   *  Do nothing if current handle is the unique copy of the array.
+   *  Otherwise make a new copy of the array to ensure the current handle
+   *  hold a unique copy.
+   *
+   * \return Handle to the internal node container(which ganrantees to be unique)
+   */
+  inline MapNode* CopyOnWrite() {
+    if (node_.get() == nullptr || !node_.unique())  {
+      NodePtr<MapNode> n = make_node<MapNode>();
+      n->data = static_cast<const MapNode*>(node_.get())->data;
+      NodePtr<Node>(std::move(n)).swap(node_);
+    }
+    return static_cast<MapNode*>(node_.get());
+  }
+  /*!
+   * \brief set the Map.
+   * \param key The index key.
+   * \param value The value to be setted.
+   */
+  inline void Set(const K& key, const V& value) {
+    MapNode* n = this->CopyOnWrite();
+    n->data[key.node_] = value.node_;
+  }
+
+  /*! \return whether array is empty */
+  inline bool empty() const {
+    return size() == 0;
+  }
+  /*! \brief specify container node */
+  using ContainerType = MapNode;
+
+  struct Ptr2NodeRef {
+    using ResultType = std::pair<K, V>;
+    static inline ResultType convert(const std::pair<
+                            NodePtr<Node>,
+                            NodePtr<Node> >& n) {
+      return std::make_pair(K(n.first), V(n.second));
+    }
+  };
+
+  using iterator = IterAdapter<
+    Ptr2NodeRef, MapNode::ContainerType::const_iterator>;
+
+  /*! \return begin iterator */
+  inline iterator begin() const {
+    return iterator(static_cast<const MapNode*>(node_.get())->data.begin());
+  }
+  /*! \return end iterator */
+  inline iterator end() const {
+    return iterator(static_cast<const MapNode*>(node_.get())->data.end());
+  }
+  /*! \return begin iterator */
+  inline iterator find(const K& key) const {
+    return iterator(static_cast<const MapNode*>(node_.get())->data.find(key.node_));
+  }
+};
+
+// specialize of string map
+template<typename V, typename T1, typename T2>
+class Map<std::string, V, T1, T2> : public NodeRef {
+ public:
+  // for code reuse
+  Map() {
+    node_ = make_node<StrMapNode>();
+  }
+  Map(Map<std::string, V> && other) {  // NOLINT(*)
+    node_ = std::move(other.node_);
+  }
+  Map(const Map<std::string, V> &other) { // NOLINT(*)
+    node_ = other.node_;
+  }
+  explicit Map(NodePtr<Node> n) : NodeRef(n) {}
+  template<typename IterType>
+  Map(IterType begin, IterType end) {
+    assign(begin, end);
+  }
+  Map(std::initializer_list<std::pair<std::string, V> > init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+
+  template<typename Hash, typename Equal>
+  Map(const std::unordered_map<std::string, V, Hash, Equal>& init) { // NOLINT(*)
+    assign(init.begin(), init.end());
+  }
+  Map<std::string, V>& operator=(Map<std::string, V> && other) {
+    node_ = std::move(other.node_);
+    return *this;
+  }
+  Map<std::string, V>& operator=(const Map<std::string, V> & other) {
+    node_ = other.node_;
+    return *this;
+  }
+  template<typename IterType>
+  void assign(IterType begin, IterType end) {
+    auto n = make_node<StrMapNode>();
+    for (IterType i = begin; i != end; ++i) {
+      n->data.emplace(std::make_pair(i->first,
+                                     i->second.node_));
+    }
+    node_ = std::move(n);
+  }
+  inline const V operator[](const std::string& key) const {
+    return V(static_cast<const StrMapNode*>(node_.get())->data.at(key));
+  }
+  inline const V at(const std::string& key) const {
+    return V(static_cast<const StrMapNode*>(node_.get())->data.at(key));
+  }
+  inline size_t size() const {
+    if (node_.get() == nullptr) return 0;
+    return static_cast<const StrMapNode*>(node_.get())->data.size();
+  }
+  inline size_t count(const std::string& key) const {
+    if (node_.get() == nullptr) return 0;
+    return static_cast<const StrMapNode*>(node_.get())->data.count(key);
+  }
+  inline StrMapNode* CopyOnWrite() {
+    if (node_.get() == nullptr || !node_.unique())  {
+      NodePtr<StrMapNode> n = make_node<StrMapNode>();
+      n->data = static_cast<const StrMapNode*>(node_.get())->data;
+      NodePtr<Node>(std::move(n)).swap(node_);
+    }
+    return static_cast<StrMapNode*>(node_.get());
+  }
+  inline void Set(const std::string& key, const V& value) {
+    StrMapNode* n = this->CopyOnWrite();
+    n->data[key] = value.node_;
+  }
+  inline bool empty() const {
+    return size() == 0;
+  }
+  using ContainerType = StrMapNode;
+
+  struct Ptr2NodeRef {
+    using ResultType = std::pair<std::string, V>;
+    static inline ResultType convert(const std::pair<
+                            std::string,
+                            NodePtr<Node> >& n) {
+      return std::make_pair(n.first, V(n.second));
+    }
+  };
+
+  using iterator = IterAdapter<
+    Ptr2NodeRef, StrMapNode::ContainerType::const_iterator>;
+
+  /*! \return begin iterator */
+  inline iterator begin() const {
+    return iterator(static_cast<const StrMapNode*>(node_.get())->data.begin());
+  }
+  /*! \return end iterator */
+  inline iterator end() const {
+    return iterator(static_cast<const StrMapNode*>(node_.get())->data.end());
+  }
+  /*! \return begin iterator */
+  inline iterator find(const std::string& key) const {
+    return iterator(static_cast<const StrMapNode*>(node_.get())->data.find(key));
+  }
+};
+
+}  // namespace tvm
+#endif  // TVM_NODE_CONTAINER_H_
diff --git a/include/tvm/node/ir_functor.h b/include/tvm/node/ir_functor.h
new file mode 100644
index 000000000000..293bec75bbf5
--- /dev/null
+++ b/include/tvm/node/ir_functor.h
@@ -0,0 +1,254 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/node/ir_functor.h
+ * \brief Defines the IRFunctor data structures.
+ */
+#ifndef TVM_NODE_IR_FUNCTOR_H_
+#define TVM_NODE_IR_FUNCTOR_H_
+
+#include <dmlc/logging.h>
+#include <string>
+#include <vector>
+#include <type_traits>
+#include <functional>
+#include "node.h"
+#include "../runtime/registry.h"
+
+namespace tvm {
+/*!
+ * \brief A dynamical dispatched functor on NodeRef in the first argument.
+ *
+ * \code
+ *   IRFunctor<std::string (const NodeRef& n, std::string prefix)> tostr;
+ *   tostr.set_dispatch<Add>([](const Add* op, std::string prefix) {
+ *     return prefix + "Add";
+ *   });
+ *   tostr.set_dispatch<IntImm>([](const IntImm* op) {
+ *     return prefix + "IntImm"
+ *   });
+ *
+ *   Expr x = make_const(1);
+ *   Expr y = x + x;
+ *   // dispatch to IntImm, outputs "MyIntImm"
+ *   LOG(INFO) << tostr(x, "My");
+ *   // dispatch to IntImm, outputs "MyAdd"
+ *   LOG(INFO) << tostr(y, "My");
+ * \endcode
+ *
+ * \tparam FType function signiture
+ *  This type if only defined for FType with function signiture
+ */
+template<typename FType>
+class IRFunctor;
+
+template<typename R, typename ...Args>
+class IRFunctor<R(const NodeRef& n, Args...)> {
+ private:
+  using Function = std::function<R (const NodeRef&n, Args...)>;
+  using TSelf = IRFunctor<R (const NodeRef& n, Args...)>;
+  /*! \brief internal function table */
+  std::vector<Function> func_;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*!
+   * \brief Whether the functor can dispatch the corresponding Node
+   * \param n The node to be dispatched
+   * \return Whether dispatching function is registered for n's type.
+   */
+  inline bool can_dispatch(const NodeRef& n) const {
+    uint32_t type_index = n.type_index();
+    return type_index < func_.size() && func_[type_index] != nullptr;
+  }
+  /*!
+   * \brief invoke the functor , dispatch on type of n
+   * \param n The Node argument
+   * \param args The additional arguments
+   * \return The result.
+   */
+  inline R operator()(const NodeRef& n, Args... args) const {
+    uint32_t type_index = n.type_index();
+    CHECK(type_index < func_.size() &&
+          func_[type_index] != nullptr)
+        << "IRFunctor calls un-registered function on type "
+        << Node::TypeIndex2Key(type_index);
+    return func_[type_index](n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief set the dispacher for type TNode
+   * \param f The function to be set.
+   * \tparam TNode the type of Node to be dispatched.
+   * \return reference to self.
+   */
+  template<typename TNode>
+  inline TSelf& set_dispatch(Function f) {  // NOLINT(*)
+    uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
+    if (func_.size() <= tindex) {
+      func_.resize(tindex + 1, nullptr);
+    }
+    CHECK(func_[tindex] == nullptr)
+        << "Dispatch for " << Node::TypeIndex2Key(tindex)
+        << " is already set";
+    func_[tindex] = f;
+    return *this;
+  }
+  /*!
+   * \brief set the dispacher for type TNode
+   *  This allows f to used detailed const Node pointer to replace NodeRef
+   *
+   * \param f The function to be set.
+   * \tparam TNode the type of Node to be dispatched.
+   * \return reference to self.
+   */
+  template<typename TNode>
+  inline TSelf& set_dispatch(std::function<R(const TNode* n, Args...)> f) { // NOLINT(*)
+    Function fun = [f](const NodeRef& n, Args... args) {
+      return f(static_cast<const TNode*>(n.node_.get()),
+               std::forward<Args>(args)...);
+    };
+    return this->set_dispatch<TNode>(fun);
+  }
+  /*!
+  * \brief unset the dispacher for type TNode
+  *
+  * \tparam TNode the type of Node to be dispatched.
+  * \return reference to self.
+  */
+  template<typename TNode>
+  inline TSelf& clear_dispatch() {  // NOLINT(*)
+    uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
+    CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range";
+    func_[tindex] = nullptr;
+    return *this;
+  }
+};
+
+#define TVM_REGISTER_VAR_DEF(ClsName)                                 \
+  static TVM_ATTRIBUTE_UNUSED auto & __make_functor ## _ ## ClsName
+
+/*!
+ * \brief Useful macro to set IRFunctor dispatch in a global static field.
+ *
+ * \code
+ *  // Use IRFunctor to implement IRPrinter similar to Visitor Pattern.
+ *  // vtable allows easy patch in of new Node types, without changing
+ *  // interface of IRPrinter.
+ *
+ *  class IRPrinter {
+ *   public:
+ *    std::ostream& stream;
+ *    // the dispatch function.
+ *    void print(Expr e) {
+ *      const static FType& f = *vtable();
+ *      f(e, this);
+ *    }
+ *
+ *    using FType = IRFunctor<void (const NodeRef&, IRPrinter *)>;
+ *    // function to return global function table
+ *    static FType& vtable();
+ *  };
+ *
+ *  // in cpp/cc file
+ *  IRPrinter::FType& IRPrinter::vtable() { // NOLINT(*0
+ *    static FType inst; return inst;
+ *  }
+ *
+ *  TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+ *  .set_dispatch<Add>([](const Add* n, IRPrinter* p) {
+ *    p->print(n->a);
+ *    p->stream << '+'
+ *    p->print(n->b);
+ *  });
+ *
+ *
+ * \endcode
+ *
+ * \param ClsName The name of the class
+ * \param FField The static function that returns a singleton of IRFunctor.
+ */
+#define TVM_STATIC_IR_FUNCTOR(ClsName, FField)                       \
+  TVM_STR_CONCAT(TVM_REGISTER_VAR_DEF(ClsName), __COUNTER__)  =      \
+                              ClsName::FField()
+
+ /*!
+ * \brief A container for a list of callbacks. All callbacks are invoked when
+ * the object is destructed.
+ */
+class IRFunctorCleanList {
+ public:
+  ~IRFunctorCleanList() {
+    for (auto &f : clean_items) {
+      f();
+    }
+  }
+
+  void append(std::function<void()> func) {
+    clean_items.push_back(func);
+  }
+
+ private:
+  std::vector< std::function<void()> > clean_items;
+};
+
+/*!
+* \brief A wrapper around IRFunctor that will record calls to set_dispatch
+* and make a corresponding call to clear_dispatch when the last copy of
+* the IRFunctorStaticRegistry is destructed. When assigned to a static variable,
+* this can be used by NNVM and other libraries to unregister callbacks when
+* the library is unloaded. This prevents crashes when the underlying IRFunctor
+* is destructed as it will no longer contain std::function instances allocated
+* by a library that has been unloaded.
+*/
+template<typename FType>
+class IRFunctorStaticRegistry;
+
+template<typename R, typename ...Args>
+class IRFunctorStaticRegistry<R(const NodeRef& n, Args...)> {
+ private:
+  IRFunctor<R(const NodeRef& n, Args...)> *irf_;
+  std::shared_ptr<IRFunctorCleanList> free_list;
+
+  using TSelf = IRFunctorStaticRegistry<R(const NodeRef& n, Args...)>;
+
+ public:
+  IRFunctorStaticRegistry(IRFunctor<R(const NodeRef& n, Args...)> *irf) {
+    irf_ = irf;
+    free_list = std::make_shared<IRFunctorCleanList>();
+  }
+
+  template<typename TNode>
+  inline TSelf& set_dispatch(std::function<R(const TNode* n, Args...)> f) {  // NOLINT(*)
+    irf_->template set_dispatch<TNode>(f);
+    auto irf_copy = irf_;
+    free_list.get()->append([irf_copy] {
+      irf_copy->template clear_dispatch<TNode>();
+      });
+    return *this;
+  }
+};
+
+/*!
+* \brief Helper function for constructing an IRFunctorStaticRegistry. This allows
+* the compiler to deduce the template types.
+*/
+template<typename R, typename ...Args>
+IRFunctorStaticRegistry<R(const NodeRef& n, Args...)> MakeIRFunctorStaticRegistry(
+  IRFunctor<R(const NodeRef& n, Args...)> *irf) {
+  return IRFunctorStaticRegistry<R(const NodeRef& n, Args...)>(irf);
+}
+
+#define TVM_AUTO_REGISTER_VAR_DEF(ClsName)                           \
+  static TVM_ATTRIBUTE_UNUSED auto __make_functor ## _ ## ClsName
+
+/*!
+* \brief Macro to set IRFunctor dispatch in a global static field using an IRFunctorStaticRegistry.
+* Usage is exactly the same as TVM_STATIC_IR_FUNCTOR. Libraries should use this instead of
+* TVM_STATIC_IR_FUNCTOR.
+*/
+#define TVM_STATIC_IR_FUNCTOR_REGISTER(ClsName, FField)                  \
+  TVM_STR_CONCAT(TVM_AUTO_REGISTER_VAR_DEF(ClsName), __COUNTER__)  = \
+                        MakeIRFunctorStaticRegistry(&ClsName::FField())
+
+}  // namespace tvm
+#endif  // TVM_NODE_IR_FUNCTOR_H_
diff --git a/include/tvm/node/memory.h b/include/tvm/node/memory.h
new file mode 100644
index 000000000000..c0f791eb597b
--- /dev/null
+++ b/include/tvm/node/memory.h
@@ -0,0 +1,59 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/node/memory.h
+ * \brief Node memory management.
+ */
+#ifndef TVM_NODE_MEMORY_H_
+#define TVM_NODE_MEMORY_H_
+
+#include "node.h"
+
+namespace tvm {
+/*!
+ * \brief Allocate a node object.
+ * \param args arguments to the constructor.
+ * \tparam T the node type.
+ * \return The NodePtr to the allocated object.
+ */
+template<typename T, typename... Args>
+inline NodePtr<T> make_node(Args&&... args);
+
+// Detail implementations after this
+//
+// The current design allows swapping the
+// allocator pattern when necessary.
+//
+// Possible future allocator optimizations:
+// - Arena allocator that gives ownership of memory to arena (deleter_= nullptr)
+// - Thread-local object pools: one pool per size and alignment requirement.
+// - Can specialize by type of object to give the specific allocator to each object.
+//
+template<typename T>
+class SimpleNodeAllocator {
+ public:
+  template<typename... Args>
+  static T* New(Args&&... args) {
+    return new T(std::forward<Args>(args)...);
+  }
+  static NodeBase::FDeleter Deleter() {
+    return Deleter_;
+  }
+
+ private:
+  static void Deleter_(NodeBase* ptr) {
+    delete static_cast<T*>(ptr);
+  }
+};
+
+template<typename T, typename... Args>
+inline NodePtr<T> make_node(Args&&... args) {
+  using Allocator = SimpleNodeAllocator<T>;
+  static_assert(std::is_base_of<NodeBase, T>::value,
+                "make_node can only be used to create NodeBase");
+  T* node = Allocator::New(std::forward<Args>(args)...);
+  node->deleter_ = Allocator::Deleter();
+  return NodePtr<T>(node);
+}
+
+}  // namespace tvm
+#endif  // TVM_NODE_MEMORY_H_
diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h
new file mode 100644
index 000000000000..d726b1dab660
--- /dev/null
+++ b/include/tvm/node/node.h
@@ -0,0 +1,295 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/node/node.h
+ * \brief Node system data structure.
+ */
+#ifndef TVM_NODE_NODE_H_
+#define TVM_NODE_NODE_H_
+
+#include <string>
+#include <vector>
+#include <type_traits>
+#include "base/Type.h"
+#include "../runtime/node_base.h"
+#include "../runtime/c_runtime_api.h"
+
+namespace tvm {
+using HalideIR::Type;
+// forward declaration
+class Node;
+class NodeRef;
+
+namespace runtime {
+// forward declaration
+class NDArray;
+}  // namespace runtime
+
+/*!
+ * \brief Visitor class to each node content.
+ *  The content is going to be called for each field.
+ */
+class TVM_DLL AttrVisitor {
+ public:
+//! \cond Doxygen_Suppress
+  virtual void Visit(const char* key, double* value) = 0;
+  virtual void Visit(const char* key, int64_t* value) = 0;
+  virtual void Visit(const char* key, uint64_t* value) = 0;
+  virtual void Visit(const char* key, int* value) = 0;
+  virtual void Visit(const char* key, bool* value) = 0;
+  virtual void Visit(const char* key, std::string* value) = 0;
+  virtual void Visit(const char* key, void** value) = 0;
+  virtual void Visit(const char* key, Type* value) = 0;
+  virtual void Visit(const char* key, NodeRef* value) = 0;
+  virtual void Visit(const char* key, runtime::NDArray* value) = 0;
+  template<typename ENum,
+           typename = typename std::enable_if<std::is_enum<ENum>::value>::type>
+  void Visit(const char* key, ENum* ptr) {
+    static_assert(std::is_same<int, typename std::underlying_type<ENum>::type>::value,
+                  "declare enum to be enum int to use visitor");
+    this->Visit(key, reinterpret_cast<int*>(ptr));
+  }
+//! \endcond
+};
+
+/*!
+ * \brief base class of node container in DSL AST.
+ *  All object's internal is stored as std::shared_ptr<Node>
+ */
+class TVM_DLL Node : public NodeBase {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~Node() {}
+  /*! \return The unique type key of the node */
+  virtual const char* type_key() const = 0;
+  /*!
+   * \brief Apply visitor to each field of the Node
+   *  Visitor could mutate the content of the node.
+   *  override if Node contains attribute fields.
+   * \param visitor The visitor
+   */
+  virtual void VisitAttrs(AttrVisitor* visitor) {}
+  /*! \return the type index of the node */
+  virtual const uint32_t type_index() const = 0;
+  /*!
+   * \brief Whether this node derives from node with type_index=tid.
+   *  Implemented by TVM_DECLARE_NODE_TYPE_INFO
+   *
+   * \param tid The type index.
+   * \return the check result.
+   */
+  virtual const bool _DerivedFrom(uint32_t tid) const;
+  /*!
+   * \brief get a runtime unique type index given a type key
+   * \param type_key Type key of a type.
+   * \return the corresponding type index.
+   */
+  static uint32_t TypeKey2Index(const char* type_key);
+  /*!
+   * \brief get type key from type index.
+   * \param index The type index
+   * \return the corresponding type key.
+   */
+  static const char* TypeIndex2Key(uint32_t index);
+  /*!
+   * \return whether the type is derived from
+   */
+  template<typename T>
+  inline bool derived_from() const;
+  /*!
+   * \return whether the node is of type T
+   * \tparam The type to be checked.
+   */
+  template<typename T>
+  inline bool is_type() const;
+  /*!
+   * \brief Get a NodeRef that holds reference to this Node.
+   * \return the NodeRef
+   */
+  inline NodeRef GetNodeRef() const;
+  // node ref can see this
+  friend class NodeRef;
+  static constexpr const char* _type_key = "Node";
+};
+
+/*! \brief Base class of all node reference object */
+class NodeRef {
+ public:
+  /*! \brief type indicate the container type */
+  using ContainerType = Node;
+  /*!
+   * \brief Comparator
+   * \param other Another node ref.
+   * \return the compare result.
+   */
+  inline bool operator==(const NodeRef& other) const;
+  /*!
+   * \brief Comparator
+   * \param other Another node ref.
+   * \return the compare result.
+   */
+  inline bool same_as(const NodeRef& other) const;
+  /*!
+   * \brief Comparator
+   * \param other Another node ref.
+   * \return the compare result.
+   */
+  inline bool operator<(const NodeRef& other) const;
+  /*!
+   * \brief Comparator
+   * \param other Another node ref.
+   * \return the compare result.
+   */
+  inline bool operator!=(const NodeRef& other) const;
+  /*! \return the hash function for NodeRef */
+  inline size_t hash() const;
+  /*! \return whether the expression is null */
+  inline bool defined() const;
+  /*! \return the internal type index of IRNode */
+  inline uint32_t type_index() const;
+  /*! \return the internal node pointer */
+  inline const Node* get() const;
+  /*! \return the internal node pointer */
+  inline const Node* operator->() const;
+  /*!
+   * \brief Downcast this ir node to its actual type (e.g. Add, or
+   * Select). This returns nullptr if the node is not of the requested
+   * type. Example usage:
+   *
+   * if (const Add *add = node->as<Add>()) {
+   *   // This is an add node
+   * }
+   * \tparam T the target type, must be subtype of IRNode
+   */
+  template<typename T>
+  inline const T *as() const;
+  /*!
+   * \brief A more powerful version of as that also works with
+   *  intermediate base types.
+   * \tparam T the target type, must be subtype of IRNode
+   */
+  template<typename T>
+  inline const T *as_derived() const;
+  /*! \brief default constructor */
+  NodeRef() = default;
+  explicit NodeRef(NodePtr<Node> node) : node_(node) {}
+  /*! \brief the internal node object, do not touch  */
+  NodePtr<Node> node_;
+};
+
+/*!
+ * \brief helper macro to declare type information in a base node.
+ */
+#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent)                    \
+  const bool _DerivedFrom(uint32_t tid) const override {                \
+    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
+    if (tidx == tid) return true;                                       \
+    return Parent::_DerivedFrom(tid);                                   \
+  }
+
+/*!
+ * \brief helper macro to declare type information in a terminal node
+ */
+#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent)                    \
+  const char* type_key() const final {                                  \
+    return TypeName::_type_key;                                         \
+  }                                                                     \
+  const uint32_t type_index() const final {                             \
+    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
+    return tidx;                                                        \
+  }                                                                     \
+  const bool _DerivedFrom(uint32_t tid) const final {                   \
+    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
+    if (tidx == tid) return true;                                       \
+    return Parent::_DerivedFrom(tid);                                   \
+  }
+
+// implementations of inline functions after this
+template<typename T>
+inline bool Node::is_type() const {
+  // use static field so query only happens once.
+  static uint32_t type_id = Node::TypeKey2Index(T::_type_key);
+  return type_id == this->type_index();
+}
+
+template<typename T>
+inline bool Node::derived_from() const {
+  // use static field so query only happens once.
+  static uint32_t type_id = Node::TypeKey2Index(T::_type_key);
+  return this->_DerivedFrom(type_id);
+}
+
+inline NodeRef Node::GetNodeRef() const {
+  return NodeRef(NodePtr<Node>(const_cast<Node*>(this)));
+}
+
+inline const Node* NodeRef::get() const {
+  return node_.get();
+}
+
+inline const Node* NodeRef::operator->() const {
+  return node_.get();
+}
+
+inline bool NodeRef::defined() const {
+  return node_.get() != nullptr;
+}
+
+inline bool NodeRef::operator==(const NodeRef& other) const {
+  return node_.get() == other.node_.get();
+}
+
+inline bool NodeRef::same_as(const NodeRef& other) const {
+  return node_.get() == other.node_.get();
+}
+
+inline bool NodeRef::operator<(const NodeRef& other) const {
+  return node_.get() < other.node_.get();
+}
+
+inline bool NodeRef::operator!=(const NodeRef& other) const {
+  return node_.get() != other.node_.get();
+}
+
+inline size_t NodeRef::hash() const {
+  return std::hash<Node*>()(node_.get());
+}
+
+inline uint32_t NodeRef::type_index() const {
+  CHECK(node_.get() != nullptr)
+      << "null type";
+  return get()->type_index();
+}
+
+template<typename T>
+inline const T* NodeRef::as() const {
+  const Node* ptr = static_cast<const Node*>(get());
+  if (ptr && ptr->is_type<T>()) {
+    return static_cast<const T*>(ptr);
+  }
+  return nullptr;
+}
+
+template<typename T>
+inline const T* NodeRef::as_derived() const {
+  const Node* ptr = static_cast<const Node*>(get());
+  if (ptr && (ptr->is_type<T>() || ptr->derived_from<T>())) {
+    return static_cast<const T*>(ptr);
+  }
+  return nullptr;
+}
+
+/*! \brief The hash function for nodes */
+struct NodeHash {
+  size_t operator()(const NodeRef& a) const {
+    return a.hash();
+  }
+};
+
+/*! \brief The equal comparator for nodes */
+struct NodeEqual {
+  bool operator()(const NodeRef& a, const NodeRef& b) const {
+    return a.get() == b.get();
+  }
+};
+}  // namespace tvm
+#endif  // TVM_NODE_NODE_H_
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 78351e094e69..8528eeaa5fa3 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -116,7 +116,7 @@ inline TNodeRef TVMArgValue::AsNodeRef() const {
       "Conversion only works for NodeRef");
   if (type_code_ == kNull) return TNodeRef();
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
       << "Expected type " << NodeTypeName<TNodeRef>()
       << " but get " << sptr->type_key();
@@ -132,7 +132,7 @@ inline TVMArgValue::operator HalideIR::Expr() const {
     return Expr(static_cast<float>(value_.v_float64));
   }
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   if (sptr->is_type<IterVarNode>()) {
     return IterVar(sptr)->var;
   }
@@ -145,27 +145,27 @@ inline TVMArgValue::operator HalideIR::Expr() const {
   return Expr(sptr);
 }
 
-inline std::shared_ptr<Node>& TVMArgValue::node_sptr() {
+inline NodePtr<Node>& TVMArgValue::node_sptr() {
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  return *ptr<std::shared_ptr<Node> >();
+  return *ptr<NodePtr<Node> >();
 }
 
 
 template<typename TNodeRef, typename>
 inline bool TVMArgValue::IsNodeType() const {
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr =
-      *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr =
+      *ptr<NodePtr<Node> >();
   return NodeTypeChecker<TNodeRef>::Check(sptr.get());
 }
 
 // extensions for TVMRetValue
 inline TVMRetValue& TVMRetValue::operator=(
-    const std::shared_ptr<Node>& other) {
+    const NodePtr<Node>& other) {
   if (other.get() == nullptr) {
     SwitchToPOD(kNull);
   } else {
-    SwitchToClass<std::shared_ptr<Node> >(kNodeHandle, other);
+    SwitchToClass<NodePtr<Node> >(kNodeHandle, other);
   }
   return *this;
 }
@@ -174,7 +174,7 @@ inline TVMRetValue& TVMRetValue::operator=(const NodeRef& other) {
   if (!other.defined()) {
     SwitchToPOD(kNull);
   } else {
-    SwitchToClass<std::shared_ptr<Node> >(kNodeHandle, other.node_);
+    SwitchToClass<NodePtr<Node> >(kNodeHandle, other.node_);
   }
   return *this;
 }
@@ -186,7 +186,7 @@ inline TNodeRef TVMRetValue::AsNodeRef() const {
       "Conversion only works for NodeRef");
   if (type_code_ == kNull) return TNodeRef();
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
       << "Expected type " << NodeTypeName<TNodeRef>()
       << " but get " << sptr->type_key();
@@ -195,7 +195,7 @@ inline TNodeRef TVMRetValue::AsNodeRef() const {
 
 inline void TVMArgsSetter::operator()(size_t i, const NodeRef& other) const {  // NOLINT(*)
   if (other.defined()) {
-    values_[i].v_handle = const_cast<std::shared_ptr<Node>*>(&(other.node_));
+    values_[i].v_handle = const_cast<NodePtr<Node>*>(&(other.node_));
     type_codes_[i] = kNodeHandle;
   } else {
     type_codes_[i] = kNull;
diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index 7c66d2c2de43..ecf45353af67 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -8,7 +8,7 @@
 
 #include <tvm/api_registry.h>
 #include <tvm/ir.h>
-#include <tvm/node.h>
+#include <tvm/node/node.h>
 #include <string>
 #include <vector>
 
@@ -55,16 +55,16 @@ using NodeEqual = ::tvm::NodeEqual;
  * \param NodeName The internal container name.
  * \param NodeRefBase The base type.
  */
-#define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase)            \
-  class TypeName : public NodeRefBase {                                   \
-   public:                                                                \
-    TypeName() {}                                                         \
-    explicit TypeName(std::shared_ptr<::tvm::Node> n) : NodeRefBase(n) {} \
-    const NodeName* operator->() const {                                  \
-      return static_cast<const NodeName*>(node_.get());                   \
-    }                                                                     \
-    operator bool() { return this->defined(); }                           \
-    using ContainerType = NodeName;                                       \
+#define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase)          \
+  class TypeName : public NodeRefBase {                                 \
+   public:                                                              \
+    TypeName() {}                                                        \
+    explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRefBase(n) {} \
+    const NodeName* operator->() const {                                \
+      return static_cast<const NodeName*>(node_.get());                 \
+    }                                                                   \
+    operator bool() { return this->defined(); }                         \
+    using ContainerType = NodeName;                                     \
   };
 
 /*!
@@ -82,8 +82,6 @@ class SourceNameNode : public Node {
   // override attr visitor
   void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); }
 
-  TVM_DLL static SourceName make(std::string name);
-
   static constexpr const char* _type_key = "relay.SourceName";
   TVM_DECLARE_NODE_TYPE_INFO(SourceNameNode, Node);
 };
@@ -98,7 +96,7 @@ class SourceName : public NodeRef {
   SourceName() {}
 
   /*! \brief constructor from node pointer */
-  explicit SourceName(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit SourceName(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -109,9 +107,9 @@ class SourceName : public NodeRef {
    * \brief Get an SourceName for a given operator name.
    *  Will raise an error if the source name has not been registered.
    * \param name Name of the operator.
-   * \return Reference to a SourceName valid throughout program lifetime.
+   * \return SourceName valid throughout program lifetime.
    */
-  TVM_DLL static const SourceName& Get(const std::string& name);
+  TVM_DLL static SourceName Get(const std::string& name);
 
   /*! \brief specify container node */
   using ContainerType = SourceNameNode;
@@ -176,7 +174,7 @@ template <typename RefType, typename NodeType>
 RefType GetRef(const NodeType* ptr) {
   static_assert(std::is_same<typename RefType::ContainerType, NodeType>::value,
                 "Can only cast to the ref of same container type");
-  return RefType(const_cast<NodeType*>(ptr)->shared_from_this());
+  return RefType(std::move(ptr->GetNodeRef().node_));
 }
 
 // TODO(@tqchen, @jroesch): can we move these semantics to HalideIR
diff --git a/include/tvm/relay/environment.h b/include/tvm/relay/environment.h
index 7e07dc01eab4..46cedf12b816 100644
--- a/include/tvm/relay/environment.h
+++ b/include/tvm/relay/environment.h
@@ -98,15 +98,15 @@ class EnvironmentNode : public RelayNode {
   TVM_DECLARE_NODE_TYPE_INFO(EnvironmentNode, Node);
 
  private:
-  /*! \brief A map from string names to global variables that 
-   * ensures global uniqueness. 
+  /*! \brief A map from string names to global variables that
+   * ensures global uniqueness.
    */
   tvm::Map<std::string, GlobalVar> global_map_;
 };
 
 struct Environment : public NodeRef {
   Environment() {}
-  explicit Environment(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+  explicit Environment(NodePtr<tvm::Node> p) : NodeRef(p) {}
 
   inline EnvironmentNode* operator->() const {
     return static_cast<EnvironmentNode*>(node_.get());
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 8ad0537ad68b..27bb464b98a3 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -7,7 +7,7 @@
 #ifndef TVM_RELAY_EXPR_FUNCTOR_H_
 #define TVM_RELAY_EXPR_FUNCTOR_H_
 
-#include <tvm/ir_functor.h>
+#include <tvm/node/ir_functor.h>
 #include <string>
 #include "./expr.h"
 #include "./op.h"
@@ -19,7 +19,7 @@ namespace relay {
  * \brief A dynamical functor that dispatches on in the first Expr argument.
  *  You can use this as a more powerful Visitor, since it allows you to
  *  define function signatures of Visit Function.
- * 
+ *
  * \sa tvm/ir_functor.h
  *
  * \tparam FType function signiture
@@ -30,7 +30,7 @@ template <typename FType>
 class ExprFunctor;
 
 // functions to be overriden.
-#define EXPR_FUNCTOR_DEFAULT \
+#define EXPR_FUNCTOR_DEFAULT                                      \
   { return VisitExprDefault_(op, std::forward<Args>(args)...); }
 
 #define RELAY_EXPR_FUNCTOR_DISPATCH(OP)                                \
@@ -152,12 +152,12 @@ class ExprMutator
   Expr VisitExpr_(const CallNode* call_node, const Expr& e) override;
   Expr VisitExpr_(const LetNode* op, const Expr& e) override;
   Expr VisitExpr_(const IfNode* op, const Expr& e) override;
-  /*! \brief Used to visit the types inside of expressions. 
-   *  
+  /*! \brief Used to visit the types inside of expressions.
+   *
    * Can be overloaded to transform the types in arbitrary
    * ways, one way would be to define a sub-class of type
    * visitor for types which transform them appropriately.
-   */ 
+   */
   virtual Type VisitType(const Type& t);
 
  private:
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index 49661fec5731..9f4e7be08a8c 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -90,7 +90,7 @@ class Op : public relay::Expr {
   /*! \brief default constructor  */
   Op() {}
   /*! \brief constructor from node pointer */
-  explicit Op(std::shared_ptr<Node> n) : Expr(n) {}
+  explicit Op(NodePtr<Node> n) : Expr(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -149,9 +149,9 @@ class OpRegistry {
                                   const std::string& description);
   /*!
    * \brief Attach the type function corresponding to the return type.
-   * \param rel_name The type relation name to register. 
+   * \param rel_name The type relation name to register.
    * \param type_rel_func The backing relation function which can solve an arbitrary
-   * relation on variables. 
+   * relation on variables.
    * \return reference to self.
    */
   inline OpRegistry& add_type_rel(
@@ -338,7 +338,7 @@ inline OpRegistry& OpRegistry::describe(
 inline OpRegistry& OpRegistry::add_argument(const std::string& name,
                                             const std::string& type,
                                             const std::string& description) {
-  std::shared_ptr<AttrFieldInfoNode> n = std::make_shared<AttrFieldInfoNode>();
+  auto n = make_node<AttrFieldInfoNode>();
   n->name = name;
   n->type_info = type;
   n->description = description;
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 44030ad8d97f..f972eb85b041 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -8,7 +8,7 @@
 
 #include <tvm/api_registry.h>
 #include <tvm/ir.h>
-#include <tvm/node.h>
+#include <tvm/node/node.h>
 #include <string>
 
 #include "./base.h"
@@ -37,7 +37,7 @@ class TypeNode : public RelayNode {
 class Type : public NodeRef {
  public:
   Type() {}
-  explicit Type(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+  explicit Type(NodePtr<tvm::Node> p) : NodeRef(p) {}
 
   using ContainerType = TypeNode;
 };
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index a3359289e261..313e0a5c3da8 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -263,12 +263,16 @@ struct NDArray::Container {
 // the usages of functions are documented in place.
 inline NDArray::NDArray(Container* data)
   : data_(data) {
-  data_->IncRef();
+  if (data != nullptr) {
+    data_->IncRef();
+  }
 }
 
 inline NDArray::NDArray(const NDArray& other)
   : data_(other.data_) {
-  data_->IncRef();
+  if (data_ != nullptr) {
+    data_->IncRef();
+  }
 }
 
 inline void NDArray::reset() {
diff --git a/include/tvm/runtime/node_base.h b/include/tvm/runtime/node_base.h
new file mode 100644
index 000000000000..bc62ac460cff
--- /dev/null
+++ b/include/tvm/runtime/node_base.h
@@ -0,0 +1,241 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/runtime/node_base.h
+ * \brief Base data structure for Node.
+ *
+ * \note Node is not a runtime feature.
+ *  This file only exposes the signature of NodePtr for PackedFunc.
+ */
+#ifndef TVM_RUNTIME_NODE_BASE_H_
+#define TVM_RUNTIME_NODE_BASE_H_
+
+#include <utility>
+#include <atomic>
+
+namespace tvm {
+
+// forward declarations
+template<typename T>
+class NodePtr;
+class Node;
+class NodeRef;
+
+/*!
+ * \brief Base class of Node for runtime destructor purposes.
+ *
+ * Node is a reference counted object which is used to construct AST.
+ * Each node is backed by a custom deleter, which deletes the object.
+ * Do not call create raw Node pointer, always use tvm::make_node.
+ *
+ * \note In most cases, please inheritate tvm::Node.
+ * \sa Node, NodePtr, make_node
+ */
+class NodeBase {
+ public:
+  /*!
+   * \brief type of NodeBase deleter
+   * \param self pointer to the NodeBase.
+   */
+  typedef void (*FDeleter)(NodeBase* self);
+
+ protected:
+  // default constructor and copy constructor
+  NodeBase() {}
+  // override the copy and assign constructors to do nothing.
+  // This is to make sure only contents, but not deleter and ref_counter
+  // are copied when a child class copies itself.
+  NodeBase(const NodeBase& other) {  // NOLINT(*)
+  }
+  NodeBase(NodeBase&& other) {  // NOLINT(*)
+  }
+  NodeBase& operator=(const NodeBase& other) {  //NOLINT(*)
+    return *this;
+  }
+  NodeBase& operator=(NodeBase&& other) {  //NOLINT(*)
+    return *this;
+  }
+
+ private:
+  /*! \brief Internal reference counter */
+  std::atomic<int> ref_counter_{0};
+  /*!
+   * \brief deleter of this object to enable customized allocation.
+   * If the deleter is nullptr, no deletion will be performed.
+   * The creator of the Node must always set the deleter field properly.
+   */
+  FDeleter deleter_ = nullptr;
+  // reference counting functions
+  void IncRef() {
+    ref_counter_.fetch_add(1, std::memory_order_relaxed);
+  }
+  void DecRef() {
+    if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) {
+      std::atomic_thread_fence(std::memory_order_acquire);
+      if (this->deleter_ != nullptr) {
+        (*this->deleter_)(this);
+      }
+    }
+  }
+  int use_count() const {
+    return ref_counter_.load(std::memory_order_relaxed);
+  }
+  // friend declaration
+  template<typename>
+  friend class NodePtr;
+  template<typename Y, typename... Args>
+  friend NodePtr<Y> make_node(Args&&...);
+};
+
+/*!
+ * \brief Smart pointer for Node containers,
+ *  must be subclass of NodeBase
+ * \tparam T the content data type.
+ */
+template<typename T>
+class NodePtr {
+ public:
+  /*! \brief default constructor */
+  NodePtr() {}
+  /*! \brief default constructor */
+  NodePtr(std::nullptr_t) {}  // NOLINT(*)
+  /*!
+   * \brief copy constructor
+   * \param other The value to be moved
+   */
+  NodePtr(const NodePtr<T>& other)  // NOLINT(*)
+      : NodePtr(other.data_) {
+  }
+  /*!
+   * \brief copy constructor
+   * \param other The value to be moved
+   */
+  template<typename Y>
+  NodePtr(const NodePtr<Y>& other)  // NOLINT(*)
+      : NodePtr(other.data_) {
+    static_assert(std::is_base_of<T, Y>::value,
+                  "can only assign of child class NodePtr to parent");
+  }
+  /*!
+   * \brief move constructor
+   * \param other The value to be moved
+   */
+  NodePtr(NodePtr<T>&& other) // NOLINT(*)
+      : data_(other.data_) {
+    other.data_ = nullptr;
+  }
+  /*!
+   * \brief move constructor
+   * \param other The value to be moved
+   */
+  template<typename Y>
+  NodePtr(NodePtr<Y>&& other)  // NOLINT(*)
+      : data_(other.data_) {
+    static_assert(std::is_base_of<T, Y>::value,
+                  "can only assign of child class NodePtr to parent");
+    other.data_ = nullptr;
+  }
+  /*! \brief destructor */
+  ~NodePtr() {
+    this->reset();
+  }
+  /*!
+   * \brief Swap this array with another NDArray
+   * \param other The other NDArray
+   */
+  void swap(NodePtr<T>& other) {  // NOLINT(*)
+    std::swap(data_, other.data_);
+  }
+  /*!
+   * \return Get the content of the pointer
+   */
+  T* get() const {
+    return static_cast<T*>(data_);
+  }
+  /*!
+   * \return The pointer
+   */
+  T* operator->() const {
+    return get();
+  }
+  /*!
+   * \return The reference
+   */
+  T& operator*() const { // NOLINT(*)
+    return *get();
+  }
+  /*!
+   * \brief copy assignmemt
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NodePtr<T>& operator=(const NodePtr<T>& other) {  // NOLINT(*)
+    // takes in plane operator to enable copy elison.
+    // copy-and-swap idiom
+    NodePtr(other).swap(*this);  // NOLINT(*)
+    return *this;
+  }
+  /*!
+   * \brief move assignmemt
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NodePtr<T>& operator=(NodePtr<T>&& other) {  // NOLINT(*)
+    // copy-and-swap idiom
+    NodePtr(std::move(other)).swap(*this); // NOLINT(*)
+    return *this;
+  }
+  /*! \brief reset the content of ptr to be nullptr */
+  void reset() {
+    if (data_ != nullptr) {
+      data_->DecRef();
+      data_ = nullptr;
+    }
+  }
+  /*! \return The use count of the ptr, for debug purposes */
+  int use_count() const {
+    return data_ != nullptr ? data_->use_count() : 0;
+  }
+  /*! \return whether the reference is unique */
+  bool unique() const {
+    return data_ != nullptr && data_->use_count() == 1;
+  }
+  /*! \return Whether two NodePtr do not equals each other */
+  bool operator==(const NodePtr<T>& other) const {
+    return data_ == other.data_;
+  }
+  /*! \return Whether two NodePtr equals each other */
+  bool operator!=(const NodePtr<T>& other) const {
+    return data_ != other.data_;
+  }
+  /*! \return Whether the pointer is nullptr */
+  bool operator==(std::nullptr_t null) const {
+    return data_ == nullptr;
+  }
+  /*! \return Whether the pointer is not nullptr */
+  bool operator!=(std::nullptr_t null) const {
+    return data_ != nullptr;
+  }
+
+ private:
+  /*! \brief internal pointer field */
+  NodeBase* data_{nullptr};
+  /*!
+   * \brief constructor from NodeBase
+   * \param data The node base pointer
+   */
+  explicit NodePtr(NodeBase* data)
+      : data_(data) {
+    if (data != nullptr) {
+      data_->IncRef();
+    }
+  }
+  // friend declaration
+  friend class Node;
+  template<typename>
+  friend class NodePtr;
+  template<typename Y, typename... Args>
+  friend NodePtr<Y> make_node(Args&&...);
+};
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_NODE_BASE_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index d1206a8a34f4..401b0bbb97ed 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -17,6 +17,7 @@
 #include "c_runtime_api.h"
 #include "module.h"
 #include "ndarray.h"
+#include "node_base.h"
 
 namespace HalideIR {
 // Forward declare type for extensions
@@ -31,12 +32,6 @@ struct Expr;
 #endif
 
 namespace tvm {
-// Forward declare NodeRef and Node for extensions.
-// This header works fine without depend on NodeRef
-// as long as it is not used.
-class Node;
-class NodeRef;
-
 namespace runtime {
 // forward declarations
 class TVMArgs;
@@ -549,7 +544,7 @@ class TVMArgValue : public TVMPODValue_ {
   inline operator HalideIR::Type() const;
   inline operator HalideIR::Expr() const;
   // get internal node ptr, if it is node
-  inline std::shared_ptr<Node>& node_sptr();
+  inline NodePtr<Node>& node_sptr();
 };
 
 /*!
@@ -745,7 +740,7 @@ class TVMRetValue : public TVMPODValue_ {
   template<typename TNodeRef>
   inline TNodeRef AsNodeRef() const;
   inline TVMRetValue& operator=(const NodeRef& other);
-  inline TVMRetValue& operator=(const std::shared_ptr<Node>& other);
+  inline TVMRetValue& operator=(const NodePtr<Node>& other);
   // type related
   inline operator HalideIR::Type() const;
   inline TVMRetValue& operator=(const HalideIR::Type& other);
@@ -775,8 +770,8 @@ class TVMRetValue : public TVMPODValue_ {
         break;
       }
       case kNodeHandle: {
-        SwitchToClass<std::shared_ptr<Node> >(
-            kNodeHandle, *other.template ptr<std::shared_ptr<Node> >());
+        SwitchToClass<NodePtr<Node> >(
+            kNodeHandle, *other.template ptr<NodePtr<Node> >());
         break;
       }
       default: {
@@ -821,7 +816,7 @@ class TVMRetValue : public TVMPODValue_ {
       case kStr: delete ptr<std::string>(); break;
       case kFuncHandle: delete ptr<PackedFunc>(); break;
       case kModuleHandle: delete ptr<Module>(); break;
-      case kNodeHandle: delete ptr<std::shared_ptr<Node> >(); break;
+      case kNodeHandle: delete ptr<NodePtr<Node> >(); break;
       case kNDArrayContainer: {
         static_cast<NDArray::Container*>(value_.v_handle)->DecRef();
         break;
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h
index b72eb7105faa..af72f3153291 100644
--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -36,7 +36,7 @@ enum AttachType : int {
 class Stage : public NodeRef {
  public:
   Stage() {}
-  explicit Stage(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Stage(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief create a new schedule for op.
    * \param op The operator in the schedule
@@ -260,7 +260,7 @@ class Stage : public NodeRef {
 class Schedule : public NodeRef {
  public:
   Schedule() {}
-  explicit Schedule(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Schedule(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get a copy of current schedule.
    * \return The copied schedule.
@@ -383,7 +383,7 @@ class Schedule : public NodeRef {
 class IterVarRelation : public NodeRef {
  public:
   IterVarRelation() {}
-  explicit IterVarRelation(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVarRelation(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -397,7 +397,7 @@ class IterVarRelation : public NodeRef {
 class IterVarAttr : public NodeRef {
  public:
   IterVarAttr() {}
-  explicit IterVarAttr(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVarAttr(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index ddccfce2fefb..48d959301e63 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -6,7 +6,6 @@
 #ifndef TVM_TENSOR_H_
 #define TVM_TENSOR_H_
 
-#include <tvm/container.h>
 #include <ir/FunctionBase.h>
 #include <string>
 #include <vector>
@@ -15,6 +14,7 @@
 #include "base.h"
 #include "expr.h"
 #include "arithmetic.h"
+#include "node/container.h"
 
 namespace tvm {
 
@@ -33,7 +33,7 @@ class Tensor : public NodeRef {
  public:
   /*! \brief default constructor, used internally */
   Tensor() {}
-  explicit Tensor(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Tensor(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -118,7 +118,7 @@ class Operation : public FunctionRef {
  public:
   /*! \brief default constructor  */
   Operation() {}
-  explicit Operation(std::shared_ptr<Node> n) : FunctionRef(n) {}
+  explicit Operation(NodePtr<Node> n) : FunctionRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index fa8c895ccb08..944498d1e615 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -19,7 +19,7 @@ class TensorIntrinNode;
 class TensorIntrin : public NodeRef {
  public:
   TensorIntrin() {}
-  explicit TensorIntrin(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit TensorIntrin(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc
index b9b27621840c..6df70b53ccae 100644
--- a/nnvm/src/compiler/compile_engine.cc
+++ b/nnvm/src/compiler/compile_engine.cc
@@ -94,7 +94,7 @@ class CompileEngine {
       return it->second->graph_func;
     }
     GraphFunc f = DoLower(key->graph, key->inputs, key->target, master_idx);
-    std::shared_ptr<GraphCacheEntryNode> n = std::make_shared<GraphCacheEntryNode>();
+    auto n = tvm::make_node<GraphCacheEntryNode>();
     n->graph_func = f;
     n->use_count = 1;
     n->master_idx = master_idx;
@@ -107,8 +107,7 @@ class CompileEngine {
     Array<NodeRef> items;
     for (auto& kv : cache_) {
       items.push_back(kv.first);
-      std::shared_ptr<GraphCacheEntryNode> n =
-          std::make_shared<GraphCacheEntryNode>(*(kv.second.operator->()));
+      auto n = tvm::make_node<GraphCacheEntryNode>(*(kv.second.operator->()));
       items.push_back(GraphCacheEntry(n));
     }
     return items;
@@ -126,7 +125,7 @@ class CompileEngine {
   // Set the given function on given graph key.
   void Set(const GraphKey& key, GraphFunc func) {
     std::lock_guard<std::mutex> lock(mutex_);
-    std::shared_ptr<GraphCacheEntryNode> n = std::make_shared<GraphCacheEntryNode>();
+    auto n = tvm::make_node<GraphCacheEntryNode>();
     n->graph_func = func;
     n->use_count = 1;
     cache_[key] = GraphCacheEntry(n);
@@ -265,7 +264,7 @@ class CompileEngine {
         graph, inputs, target, master_idx,
         &readable_name, &outputs);
 
-    std::shared_ptr<GraphFuncNode> gf = std::make_shared<GraphFuncNode>();
+    auto gf = tvm::make_node<GraphFuncNode>();
     gf->target = target;
     gf->func_name = GetUniqeName(readable_name);
     gf->inputs = inputs;
diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h
index 7696b3b5f4eb..23e5e1d1a49c 100644
--- a/nnvm/src/compiler/compile_engine.h
+++ b/nnvm/src/compiler/compile_engine.h
@@ -71,7 +71,7 @@ struct GraphCacheEntryNode : public tvm::Node {
 class GraphCacheEntry : public ::tvm::NodeRef {
  public:
   GraphCacheEntry() {}
-  explicit GraphCacheEntry(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+  explicit GraphCacheEntry(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {}
   GraphCacheEntryNode* operator->() {
     return static_cast<GraphCacheEntryNode*>(node_.get());
   }
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index ca68727ea067..f14a60e80d8c 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -74,8 +74,7 @@ bool GraphKeyEqual::Equal(const GraphKey& a,
 GraphKey GraphKeyNode::make(Graph graph,
                             tvm::Array<Tensor> inputs,
                             std::string target) {
-  std::shared_ptr<GraphKeyNode> n
-      = std::make_shared<GraphKeyNode>();
+  auto n = tvm::make_node<GraphKeyNode>();
   n->graph = std::move(graph);
   n->inputs = inputs;
   n->target = std::move(target);
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index c680e82dd936..e4865df3f9f0 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -91,8 +91,7 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     for (size_t i = 0; i < size; ++i) {
       tvm::runtime::NDArray temp;
       temp.Load(strm);
-      std::shared_ptr<NDArrayWrapperNode> n
-          = std::make_shared<NDArrayWrapperNode>();
+      auto n = tvm::make_node<NDArrayWrapperNode>();
       n->name = std::move(names[i]);
       n->array = temp;
       ret.push_back(NDArrayWrapper(n));
diff --git a/nnvm/src/compiler/graph_runtime.h b/nnvm/src/compiler/graph_runtime.h
index 272e2be7f251..e5ba3681d2bf 100644
--- a/nnvm/src/compiler/graph_runtime.h
+++ b/nnvm/src/compiler/graph_runtime.h
@@ -9,6 +9,7 @@
 #include <nnvm/graph.h>
 #include <tvm/base.h>
 #include <tvm/expr.h>
+#include <tvm/node/memory.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
 #include <vector>
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
index 64846fc8e247..1a19feabfe8a 100644
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -96,7 +96,7 @@ TVM_REGISTER_GLOBAL("nnvm._register_compute")
                         const Array<Tensor>& out_info)
         -> Array<Tensor> {
       TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, out_info);
-      if ((*ret.ptr<std::shared_ptr<tvm::Node> >())->derived_from<tvm::TensorNode>()) {
+      if ((*ret.ptr<::tvm::NodePtr<tvm::Node> >())->derived_from<tvm::TensorNode>()) {
         return {ret.operator Tensor()};
       } else {
         return ret;
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc
index 8c55684ed851..8ca49f19baec 100644
--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -45,11 +45,11 @@ TVM_REGISTER_API("_str")
 
 TVM_REGISTER_API("_Array")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
-    std::vector<std::shared_ptr<Node> > data;
+    std::vector<NodePtr<Node> > data;
     for (int i = 0; i < args.size(); ++i) {
       data.push_back(args[i].node_sptr());
     }
-    auto node = std::make_shared<ArrayNode>();
+    auto node = make_node<ArrayNode>();
     node->data = std::move(data);
     *ret = node;
   });
@@ -87,7 +87,7 @@ TVM_REGISTER_API("_Map")
         data.emplace(std::make_pair(args[i].operator std::string(),
                                     args[i + 1].node_sptr()));
       }
-      auto node = std::make_shared<StrMapNode>();
+      auto node = make_node<StrMapNode>();
       node->data = std::move(data);
       *ret = node;
     } else {
@@ -101,7 +101,7 @@ TVM_REGISTER_API("_Map")
         data.emplace(std::make_pair(args[i].node_sptr(),
                                     args[i + 1].node_sptr()));
       }
-      auto node = std::make_shared<MapNode>();
+      auto node = make_node<MapNode>();
       node->data = std::move(data);
       *ret = node;
     }
@@ -163,7 +163,7 @@ TVM_REGISTER_API("_MapItems")
     auto& sptr = args[0].node_sptr();
     if (sptr->is_type<MapNode>()) {
       auto* n = static_cast<const MapNode*>(sptr.get());
-      auto rkvs = std::make_shared<ArrayNode>();
+      auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
         rkvs->data.push_back(kv.first);
         rkvs->data.push_back(kv.second);
@@ -171,7 +171,7 @@ TVM_REGISTER_API("_MapItems")
       *ret = rkvs;
     } else {
       auto* n = static_cast<const StrMapNode*>(sptr.get());
-      auto rkvs = std::make_shared<ArrayNode>();
+      auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
         rkvs->data.push_back(ir::StringImm::make(kv.first).node_);
         rkvs->data.push_back(kv.second);
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
index 9157e62fda8a..1c2c294a5f30 100644
--- a/src/api/dsl_api.cc
+++ b/src/api/dsl_api.cc
@@ -28,7 +28,7 @@ struct TVMAPIThreadLocalEntry {
 /*! \brief Thread local store that can be used to hold return values. */
 typedef dmlc::ThreadLocalStore<TVMAPIThreadLocalEntry> TVMAPIThreadLocalStore;
 
-using TVMAPINode = std::shared_ptr<Node>;
+using TVMAPINode = NodePtr<Node>;
 
 struct APIAttrGetter : public AttrVisitor {
   std::string skey;
diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index 99f9f0c073c3..0fa7b846cf7e 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -48,7 +48,7 @@ struct ComExprEntry {
 };
 
 // canonical expression for communicative expression.
-struct ComExprNode {
+struct ComExprNode : public NodeBase {
   // base constant value.
   int64_t base{0};
   // The values to be sumed.
@@ -60,7 +60,7 @@ struct ComExpr {
  public:
   // constructor
   ComExpr() {}
-  explicit ComExpr(std::shared_ptr<ComExprNode> ptr) : ptr_(ptr) {}
+  explicit ComExpr(NodePtr<ComExprNode> ptr) : ptr_(ptr) {}
   // get member
   ComExprNode* operator->() const {
     return ptr_.get();
@@ -106,7 +106,7 @@ struct ComExpr {
   }
 
  private:
-  std::shared_ptr<ComExprNode> ptr_;
+  NodePtr<ComExprNode> ptr_;
 };
 
 // binary comparison op.
@@ -173,7 +173,7 @@ class Canonical::Internal : public IRMutator {
       if (sum.defined()) return sum;
       const int64_t *v1 = as_const_int(value);
       const uint64_t *v2 = as_const_uint(value);
-      std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+      auto n = make_node<ComExprNode>();
       if (v1) {
         n->base = *v1;
       } else if (v2) {
@@ -471,8 +471,8 @@ class Canonical::Internal : public IRMutator {
     Type type = coeff.type();
     int64_t value = GetConstIntValue(coeff);
     if (value < 0) return {};
-    std::shared_ptr<ComExprNode> xnode = std::make_shared<ComExprNode>();
-    std::shared_ptr<ComExprNode> ynode = std::make_shared<ComExprNode>();
+    auto xnode = make_node<ComExprNode>();
+    auto ynode = make_node<ComExprNode>();
     if (a->base % value == 0) {
       xnode->base = a->base;
     } else {
@@ -507,7 +507,7 @@ class Canonical::Internal : public IRMutator {
     std::vector<ComExpr> pair = TryLinearEquation(a, v);
     if (pair.size() == 0) {
       int64_t value = GetConstIntValue(v);
-      std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+      auto n = make_node<ComExprNode>();
       n->base = a->base % value;
       for (auto e : a->elem) {
         if (e.scale % value == 0) continue;
@@ -554,8 +554,7 @@ class Canonical::Internal : public IRMutator {
     if (value == 0) {
       return make_zero(v.type());
     }
-    std::shared_ptr<ComExprNode> vsum =
-        std::make_shared<ComExprNode>(*a.operator->());
+    auto vsum = make_node<ComExprNode>(*a.operator->());
     vsum->base *= value;
     for (auto& e : vsum->elem) {
       e.scale *= value;
@@ -576,7 +575,7 @@ class Canonical::Internal : public IRMutator {
   ComExpr SumAdd_(const ComExpr& suma,
                   const ComExpr& sumb,
                   int bscale) {
-    std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+    auto n = make_node<ComExprNode>();
     n->base = suma->base + sumb->base * bscale;
     // merge of suma and sumb;
     size_t i = 0, j = 0;
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index c1b68fddd0e9..78c592471a1a 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -329,7 +329,7 @@ inline IntSet AsStrideSet(IntSet a) {
   if (a.as<StrideSet>()) return a;
   const IntervalSet* s = a.as<IntervalSet>();
   CHECK(s->i.is_bounded());
-  std::shared_ptr<StrideSet> n = std::make_shared<StrideSet>();
+  NodePtr<StrideSet> n = make_node<StrideSet>();
   n->base = s->i;
   return IntSet(n);
 }
@@ -348,7 +348,7 @@ inline IntSet CombineSets<Add>(IntSet a, IntSet b) {
   b = AsStrideSet(b);
   const StrideSet* a_stride = a.as<StrideSet>();
   const StrideSet* b_stride = b.as<StrideSet>();
-  auto n = std::make_shared<StrideSet>(*a_stride);
+  auto n = make_node<StrideSet>(*a_stride);
   for (size_t i = 0; i < b_stride->extents.size(); ++i) {
     n->extents.push_back(b_stride->extents[i]);
     n->strides.push_back(b_stride->strides[i]);
diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h
index 9284e6e016e0..e28fe2a9d958 100644
--- a/src/arithmetic/int_set_internal.h
+++ b/src/arithmetic/int_set_internal.h
@@ -21,14 +21,14 @@ struct IntervalSet : public IntSetNode {
   Interval i;
 
   static IntSet make(Interval i) {
-    std::shared_ptr<IntervalSet> n =
-        std::make_shared<IntervalSet>();
+    NodePtr<IntervalSet> n =
+        make_node<IntervalSet>();
     n->i = i;
     return IntSet(n);
   }
   static IntSet make(Expr min, Expr max) {
-    std::shared_ptr<IntervalSet> n =
-        std::make_shared<IntervalSet>();
+    NodePtr<IntervalSet> n =
+        make_node<IntervalSet>();
     n->i.min = min;
     n->i.max = max;
     return IntSet(n);
diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc
index 1c03d0f97485..d79300eb7782 100644
--- a/src/arithmetic/modular.cc
+++ b/src/arithmetic/modular.cc
@@ -159,7 +159,7 @@ IntSet EvalModular(const Expr& e,
     CHECK(m) << "Need to pass ModularSet for Modular Analysis";
     mmap[kv.first.get()] = m->e;
   }
-  std::shared_ptr<ModularSet> n = std::make_shared<ModularSet>();
+  NodePtr<ModularSet> n = make_node<ModularSet>();
   n->e = ModularEvaluator(mmap)(e);
   return IntSet(n);
 }
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index f35b09d1dfe6..5c0a5e07cd2a 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -32,7 +32,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 */
 Target CreateTarget(const std::string& target_name,
                     const std::vector<std::string>& options) {
-  auto target = Target(std::make_shared<TargetNode>());
+  auto target = Target(make_node<TargetNode>());
   auto t = static_cast<TargetNode*>(target.node_.get());
 
   t->target_name = target_name;
@@ -475,7 +475,7 @@ runtime::Module build(const Array<LoweredFunc>& funcs,
 }
 
 BuildConfig build_config() {
-  return BuildConfig(std::make_shared<BuildConfigNode>());
+  return BuildConfig(make_node<BuildConfigNode>());
 }
 
 /*! \brief Entry to hold the BuildConfig context stack. */
@@ -533,7 +533,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 });
 
 struct GenericFunc::Manager {
-  std::unordered_map<std::string, std::shared_ptr<Node> > fmap;
+  std::unordered_map<std::string, NodePtr<Node> > fmap;
   // mutex
   std::mutex mutex;
 
@@ -551,7 +551,7 @@ GenericFunc GenericFunc::Get(const std::string& name) {
   std::lock_guard<std::mutex>(m->mutex);
   auto it = m->fmap.find(name);
   if (it == m->fmap.end()) {
-    auto f = std::make_shared<GenericFuncNode>();
+    auto f = make_node<GenericFuncNode>();
     f->name_ = name;
     m->fmap[name] = f;
     return GenericFunc(f);
@@ -669,7 +669,7 @@ TVM_REGISTER_API("_BuildConfigGetAddLowerPassInfo")
 
 TVM_REGISTER_API("_GenericFuncCreate")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = GenericFunc(std::make_shared<GenericFuncNode>());
+  *ret = GenericFunc(make_node<GenericFuncNode>());
   });
 
 TVM_REGISTER_API("_GenericFuncGetGlobal")
diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc
index b7576c83dfa8..dea8ebaebb8d 100644
--- a/src/codegen/verilog/verilog_ir.cc
+++ b/src/codegen/verilog/verilog_ir.cc
@@ -17,14 +17,14 @@ using namespace ir;
 
 ControlSignal ControlSignalNode::make(
     ControlSignalType type, int advance_size) {
-  auto n = std::make_shared<ControlSignalNode>();
+  auto n = make_node<ControlSignalNode>();
   n->ctrl_type = type;
   n->advance_size = advance_size;
   return ControlSignal(n);
 }
 
 StageInput StageInputNode::make(Var var, StageInputType input_type) {
-  std::shared_ptr<StageInputNode> n = std::make_shared<StageInputNode>();
+  NodePtr<StageInputNode> n = make_node<StageInputNode>();
   n->var = var;
   n->input_type = input_type;
   return StageInput(n);
@@ -81,7 +81,7 @@ class PipelineExtractor: public IRVisitor {
         arg_handle_[arg.get()] = arg;
       }
     }
-    pipeline_ = std::make_shared<PipelineNode>();
+    pipeline_ = make_node<PipelineNode>();
     this->Visit(f->body);
     // setup channels
     for (const auto &kv : cmap_) {
@@ -113,7 +113,7 @@ class PipelineExtractor: public IRVisitor {
       if (cb.node != nullptr) {
         CHECK(cb.node->channel.same_as(ch));
       } else {
-        cb.node = std::make_shared<ChannelBlockNode>();
+        cb.node = make_node<ChannelBlockNode>();
         cb.node->channel = ch;
       }
       if (op->attr_key == attr::channel_read_scope) {
@@ -167,8 +167,8 @@ class PipelineExtractor: public IRVisitor {
     // The replace logic
     StageInputReplacer repl(var_info_);
     // Setup the compute block.
-    std::shared_ptr<ComputeBlockNode> compute =
-        std::make_shared<ComputeBlockNode>();
+    NodePtr<ComputeBlockNode> compute =
+        make_node<ComputeBlockNode>();
     compute->loop = Array<Stmt>(loop_);
     // setup the advance triggers
     for (const auto& e : trigger_) {
@@ -180,8 +180,8 @@ class PipelineExtractor: public IRVisitor {
       } else {
         ch = Channel(attr->node.node_);
       }
-      std::shared_ptr<SignalTriggerNode> trigger
-          = std::make_shared<SignalTriggerNode>();
+      NodePtr<SignalTriggerNode> trigger
+          = make_node<SignalTriggerNode>();
       trigger->channel_var = ch->handle_var;
       // predicate for the trigger
       Expr predicate = const_true();
@@ -249,7 +249,7 @@ class PipelineExtractor: public IRVisitor {
     CHECK(!cmap_.count(var))
         << "Multiple access to the same handle";
     ChannelEntry& cb = cmap_[var];
-    cb.node = std::make_shared<ChannelBlockNode>();
+    cb.node = make_node<ChannelBlockNode>();
     cb.node->channel = ChannelNode::make(arg_handle_.at(var), dtype);
     return cb.node->channel;
   }
@@ -257,7 +257,7 @@ class PipelineExtractor: public IRVisitor {
  private:
   // The channel information.
   struct ChannelEntry {
-    std::shared_ptr<ChannelBlockNode> node;
+    NodePtr<ChannelBlockNode> node;
     int read_ref_count{0};
     int write_ref_count{0};
   };
@@ -276,7 +276,7 @@ class PipelineExtractor: public IRVisitor {
   // The argument handle map
   std::unordered_map<const Variable*, Var> arg_handle_;
   // The result block.
-  std::shared_ptr<PipelineNode> pipeline_;
+  NodePtr<PipelineNode> pipeline_;
 };
 
 Pipeline MakePipeline(LoweredFunc f) {
diff --git a/src/codegen/verilog/vpi_session.cc b/src/codegen/verilog/vpi_session.cc
index ac2861e8f74f..36c08cac3f84 100644
--- a/src/codegen/verilog/vpi_session.cc
+++ b/src/codegen/verilog/vpi_session.cc
@@ -50,7 +50,7 @@ inline VPIHandleNode* VPIHandle::get() const {
 VPIHandle VPIHandleCreate(
     const std::shared_ptr<VPISessionEntry>& sess,
     VPIRawHandle handle) {
-  std::shared_ptr<VPIHandleNode> n = std::make_shared<VPIHandleNode>();
+  auto n = make_node<VPIHandleNode>();
   n->sess = sess;
   n->handle = handle;
   return VPIHandle(n);
@@ -102,7 +102,7 @@ int VPIGetIntProp(VPIHandleNode* h, int code) {
 }
 
 VPISession VPISession::make(int h_pipe_read, int h_pipe_write) {
-  std::shared_ptr<VPISessionNode> n = std::make_shared<VPISessionNode>();
+  auto n = make_node<VPISessionNode>();
   n->sess = std::make_shared<VPISessionEntry>(h_pipe_read, h_pipe_write);
   n->sess->in_control = true;
   VPISession sess(n);
diff --git a/src/codegen/verilog/vpi_session.h b/src/codegen/verilog/vpi_session.h
index 88a7f2f1906e..9fab0f173995 100644
--- a/src/codegen/verilog/vpi_session.h
+++ b/src/codegen/verilog/vpi_session.h
@@ -27,7 +27,7 @@ using runtime::PackedFunc;
 class VPISession : public NodeRef {
  public:
   VPISession() {}
-  explicit VPISession(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit VPISession(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get handle by name.
    * \param name The name of the handle.
@@ -63,7 +63,7 @@ class VPISession : public NodeRef {
 class VPIHandle : public NodeRef {
  public:
   VPIHandle() {}
-  explicit VPIHandle(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit VPIHandle(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get handle by name.
    * \param name The name of the handle.
diff --git a/src/lang/api_registry.cc b/src/lang/api_registry.cc
index 466ee1d3dd68..c9f84092f5da 100644
--- a/src/lang/api_registry.cc
+++ b/src/lang/api_registry.cc
@@ -11,10 +11,10 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
     p->stream << "EnvFunc(" << op->name << ")";
 });
 
-std::shared_ptr<EnvFuncNode> CreateEnvNode(const std::string& name) {
+NodePtr<EnvFuncNode> CreateEnvNode(const std::string& name) {
   auto* f = runtime::Registry::Get(name);
   CHECK(f != nullptr) << "Cannot find global function \'" << name << '\'';
-  std::shared_ptr<EnvFuncNode> n = std::make_shared<EnvFuncNode>();
+  NodePtr<EnvFuncNode> n = make_node<EnvFuncNode>();
   n->func = *f;
   n->name = name;
   return n;
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 360c5b2e9833..12ebbff4be74 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -30,7 +30,7 @@ Array<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
 }
 
 Attrs DictAttrsNode::make(Map<std::string, NodeRef> dict) {
-  std::shared_ptr<DictAttrsNode> n = std::make_shared<DictAttrsNode>();
+  NodePtr<DictAttrsNode> n = make_node<DictAttrsNode>();
   n->dict = std::move(dict);
   return Attrs(n);
 }
diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc
index 3f23c2d480bf..cb3194f8eb1d 100644
--- a/src/lang/buffer.cc
+++ b/src/lang/buffer.cc
@@ -289,7 +289,7 @@ Buffer Buffer::MakeStrideView() const {
   if ((*this)->strides.size() != 0) return *this;
   if ((*this)->shape.size() == 0) return *this;
   std::vector<Expr> temp;
-  auto n = std::make_shared<BufferNode>(*operator->());
+  auto n = make_node<BufferNode>(*operator->());
   Expr acc = make_const(n->DefaultIndexType(), 1);
   for (size_t i = n->shape.size(); i != 0 ; --i) {
     temp.push_back(acc);
@@ -373,7 +373,7 @@ Buffer BufferNode::make(Var data,
                         std::string scope,
                         int data_alignment,
                         int offset_factor) {
-  auto n = std::make_shared<BufferNode>();
+  auto n = make_node<BufferNode>();
   n->data = std::move(data);
   n->dtype = dtype;
   n->shape = std::move(shape);
diff --git a/src/lang/channel.cc b/src/lang/channel.cc
index dd850becf956..dcc44a0d0611 100644
--- a/src/lang/channel.cc
+++ b/src/lang/channel.cc
@@ -7,7 +7,7 @@
 namespace tvm {
 
 Channel ChannelNode::make(Var handle_var, Type dtype) {
-  auto n = std::make_shared<ChannelNode>();
+  auto n = make_node<ChannelNode>();
   n->handle_var = handle_var;
   n->dtype = dtype;
   return Channel(n);
diff --git a/src/lang/expr.cc b/src/lang/expr.cc
index c2dab10c26d5..062ea9217e63 100644
--- a/src/lang/expr.cc
+++ b/src/lang/expr.cc
@@ -13,18 +13,18 @@ namespace tvm {
 using HalideIR::IR::RangeNode;
 
 Range::Range(Expr begin, Expr end)
-    : Range(std::make_shared<RangeNode>(
+    : Range(make_node<RangeNode>(
           begin,
           is_zero(begin) ? end : (end - begin))) {
 }
 
 Range Range::make_by_min_extent(Expr min, Expr extent) {
-  return Range(std::make_shared<HalideIR::IR::RangeNode>(min, extent));
+  return Range(make_node<HalideIR::IR::RangeNode>(min, extent));
 }
 
 IterVar IterVarNode::make(Range dom, Var var,
                           IterVarType t, std::string thread_tag) {
-  std::shared_ptr<IterVarNode> n = std::make_shared<IterVarNode>();
+  NodePtr<IterVarNode> n = make_node<IterVarNode>();
   n->dom = dom;
   n->var = var;
   n->iter_type = t;
diff --git a/src/lang/ir.cc b/src/lang/ir.cc
index 1e0a6e5065f4..875258540584 100644
--- a/src/lang/ir.cc
+++ b/src/lang/ir.cc
@@ -52,7 +52,7 @@ CommReducer CommReducerNode::make(Array<Var> lhs,
                                   Array<Var> rhs,
                                   Array<Expr> result,
                                   Array<Expr> identity_element) {
-  auto node = std::make_shared<CommReducerNode>();
+  auto node = make_node<CommReducerNode>();
   node->lhs = lhs;
   node->rhs = rhs;
   node->result = result;
@@ -83,7 +83,7 @@ Expr Reduce::make(CommReducer combiner, Array<Expr> source,
   if (!condition.defined()) {
     condition = const_true();
   }
-  auto n = std::make_shared<Reduce>();
+  auto n = make_node<Reduce>();
   CHECK(source.defined());
   for (size_t i = 0; i < axis.size(); ++i) {
     CHECK(axis[i].defined());
diff --git a/src/lang/node.cc b/src/lang/node.cc
new file mode 100644
index 000000000000..f7043eaf7b2a
--- /dev/null
+++ b/src/lang/node.cc
@@ -0,0 +1,58 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ *  Implementation of IR Node API
+ * \file node.cc
+ */
+#include <tvm/node/node.h>
+#include <memory>
+#include <atomic>
+#include <mutex>
+#include <unordered_map>
+
+namespace tvm {
+
+namespace {
+// single manager of operator information.
+struct TypeManager {
+  // mutex to avoid registration from multiple threads.
+  // recursive is needed for trigger(which calls UpdateAttrMap)
+  std::mutex mutex;
+  std::atomic<uint32_t> type_counter{0};
+  std::unordered_map<std::string, uint32_t> key2index;
+  std::vector<std::string> index2key;
+  // get singleton of the
+  static TypeManager* Global() {
+    static TypeManager inst;
+    return &inst;
+  }
+};
+}  // namespace
+
+const bool Node::_DerivedFrom(uint32_t tid) const {
+  static uint32_t tindex = TypeKey2Index(Node::_type_key);
+  return tid == tindex;
+}
+
+// this is slow, usually caller always hold the result in a static variable.
+uint32_t Node::TypeKey2Index(const char* key) {
+  TypeManager *t = TypeManager::Global();
+  std::lock_guard<std::mutex>(t->mutex);
+  std::string skey = key;
+  auto it = t->key2index.find(skey);
+  if (it != t->key2index.end()) {
+    return it->second;
+  }
+  uint32_t tid = ++(t->type_counter);
+  t->key2index[skey] = tid;
+  t->index2key.push_back(skey);
+  return tid;
+}
+
+const char* Node::TypeIndex2Key(uint32_t index) {
+  TypeManager *t = TypeManager::Global();
+  std::lock_guard<std::mutex>(t->mutex);
+  internal_assert(index != 0);
+  return t->index2key.at(index - 1).c_str();
+}
+
+}  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index a33594107a69..497ec24f4129 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -6,7 +6,7 @@
 #include <tvm/base.h>
 #include <tvm/expr.h>
 #include <tvm/attrs.h>
-#include <tvm/container.h>
+#include <tvm/node/container.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
 #include <dmlc/json.h>
@@ -248,7 +248,7 @@ class JSONAttrGetter : public AttrVisitor {
 
 class JSONAttrSetter : public AttrVisitor {
  public:
-  const std::vector<std::shared_ptr<Node> >* node_list_;
+  const std::vector<NodePtr<Node> >* node_list_;
   const std::vector<runtime::NDArray>* tensor_list_;
   JSONNode* node_;
 
@@ -401,13 +401,13 @@ std::string SaveJSON(const NodeRef& n) {
   return os.str();
 }
 
-std::shared_ptr<Node> LoadJSON_(std::string json_str) {
+NodePtr<Node> LoadJSON_(std::string json_str) {
   std::istringstream is(json_str);
   dmlc::JSONReader reader(&is);
   JSONGraph jgraph;
   // load in json graph.
   jgraph.Load(&reader);
-  std::vector<std::shared_ptr<Node> > nodes;
+  std::vector<NodePtr<Node> > nodes;
   std::vector<runtime::NDArray> tensors;
   // load in tensors
   for (const std::string& blob : jgraph.b64ndarrays) {
@@ -427,7 +427,7 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
           << "Node type \'" << jnode.type_key << "\' is not registered in TVM";
       nodes.emplace_back(f->fcreator(jnode.global_key));
     } else {
-      nodes.emplace_back(std::shared_ptr<Node>());
+      nodes.emplace_back(NodePtr<Node>());
     }
   }
   CHECK_EQ(nodes.size(), jgraph.nodes.size());
@@ -526,7 +526,7 @@ void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
   TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);
   CHECK(f->fglobal_key == nullptr)
       << "Cannot make node type \'" << type_key << "\' with global_key.";
-  std::shared_ptr<Node> n = f->fcreator(empty_str);
+  NodePtr<Node> n = f->fcreator(empty_str);
   if (n->derived_from<BaseAttrsNode>()) {
     static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
   } else {
diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc
index 5db4f45e799f..4f9c3e9d1782 100644
--- a/src/lang/tensor.cc
+++ b/src/lang/tensor.cc
@@ -30,7 +30,7 @@ Tensor TensorNode::make(Array<Expr> shape,
                         Type dtype,
                         Operation op,
                         int value_index) {
-  auto n = std::make_shared<TensorNode>();
+  auto n = make_node<TensorNode>();
   n->shape = std::move(shape);
   n->dtype = dtype;
   n->op = op;
@@ -47,7 +47,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 TVM_REGISTER_NODE_TYPE(TensorNode);
 
 Tensor Operation::output(size_t i) const {
-  auto node = std::make_shared<TensorNode>();
+  auto node = make_node<TensorNode>();
   node->op = *this;
   node->value_index = i;
   node->dtype = (*this)->output_dtype(i);
@@ -62,7 +62,7 @@ TensorIntrin TensorIntrinNode::make(std::string name,
                                     Stmt body,
                                     Stmt reduce_init,
                                     Stmt reduce_update) {
-  auto n = std::make_shared<TensorIntrinNode>();
+  auto n = make_node<TensorIntrinNode>();
   n->name = std::move(name);
   n->op = std::move(op);
   n->inputs = std::move(inputs);
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 267a25ff372b..6100c957e473 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -69,7 +69,7 @@ Tensor compute(Array<Expr> shape,
                std::string name,
                std::string tag,
                Map<std::string, NodeRef> attrs) {
-  auto op_node = std::make_shared<ComputeOpNode>();
+  auto op_node = make_node<ComputeOpNode>();
   // compute dimension.
   size_t ndim = shape.size();
   std::vector<IterVar> axis;
@@ -91,7 +91,7 @@ Array<Tensor> compute(Array<Expr> shape,
                       std::string name,
                       std::string tag,
                       Map<std::string, NodeRef> attrs) {
-  auto op_node = std::make_shared<ComputeOpNode>();
+  auto op_node = make_node<ComputeOpNode>();
   // compute dimension.
   size_t ndim = shape.size();
   std::vector<IterVar> axis;
@@ -117,7 +117,7 @@ Operation ComputeOpNode::make(std::string name,
                               Map<std::string, NodeRef> attrs,
                               Array<IterVar> axis,
                               Array<Expr> body) {
-  auto n = std::make_shared<ComputeOpNode>();
+  auto n = make_node<ComputeOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
   n->attrs = std::move(attrs);
@@ -163,7 +163,7 @@ Operation ComputeOpNode::ReplaceInputs(
     if (!new_reduce.same_as(this->body[0])) {
       const ir::Reduce* r = new_reduce.as<ir::Reduce>();
       for (size_t k = 0; k < this->body.size(); ++k) {
-        std::shared_ptr<ir::Reduce> n = std::make_shared<ir::Reduce>(*r);
+        auto n = make_node<ir::Reduce>(*r);
         n->value_index = static_cast<int>(k);
         n->type = r->source[k].type();
         arr.push_back(Expr(n));
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc
index 86c1d5e74527..952e52a852bd 100644
--- a/src/op/extern_op.cc
+++ b/src/op/extern_op.cc
@@ -43,7 +43,7 @@ Operation ExternOpNode::make(std::string name,
                              Array<Buffer> input_placeholders,
                              Array<Buffer> output_placeholders,
                              Stmt body) {
-  auto n = std::make_shared<ExternOpNode>();
+  auto n = make_node<ExternOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
   n->attrs = std::move(attrs);
@@ -68,7 +68,7 @@ Operation ExternOpNode::ReplaceInputs(
     const Operation& self,
     const std::unordered_map<Tensor, Tensor>& rmap) const {
   CHECK_EQ(self.operator->(), this);
-  auto n = std::make_shared<ExternOpNode>(*this);
+  auto n = make_node<ExternOpNode>(*this);
   n->body = op::ReplaceTensor(this->body, rmap);
   for (size_t i = 0; i < n->inputs.size(); ++i) {
     Tensor t = n->inputs[i];
diff --git a/src/op/placeholder_op.cc b/src/op/placeholder_op.cc
index a2cd0eb2d81f..fcd5993dafa5 100644
--- a/src/op/placeholder_op.cc
+++ b/src/op/placeholder_op.cc
@@ -36,7 +36,7 @@ Array<Expr> PlaceholderOpNode::output_shape(size_t i) const {
 Operation PlaceholderOpNode::make(std::string name,
                                   Array<Expr> shape,
                                   Type dtype) {
-  auto n = std::make_shared<PlaceholderOpNode>();
+  auto n = make_node<PlaceholderOpNode>();
   n->name = name;
   n->shape = shape;
   n->dtype = dtype;
diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc
index d03601709ab4..60369aaabb33 100644
--- a/src/op/scan_op.cc
+++ b/src/op/scan_op.cc
@@ -51,7 +51,7 @@ Operation ScanOpNode::make(std::string name,
                            Array<Tensor> update,
                            Array<Tensor> state_placeholder,
                            Array<Tensor> inputs) {
-  auto n = std::make_shared<ScanOpNode>();
+  auto n = make_node<ScanOpNode>();
   CHECK_EQ(init.size(), update.size());
   CHECK_EQ(init.size(), state_placeholder.size());
 
@@ -135,7 +135,7 @@ Operation ScanOpNode::ReplaceInputs(
     const Operation& self,
     const std::unordered_map<Tensor, Tensor>& rmap) const {
   CHECK_EQ(self.operator->(), this);
-  std::shared_ptr<ScanOpNode> n = std::make_shared<ScanOpNode>(*this);
+  auto n = make_node<ScanOpNode>(*this);
   for (size_t i = 0; i < n->init.size(); ++i) {
     if (rmap.count(n->init[i])) {
       n->init.Set(i, rmap.at(n->init[i]));
diff --git a/src/pass/combine_context_call.cc b/src/pass/combine_context_call.cc
index dff91e6690f2..d60256bcfcf0 100644
--- a/src/pass/combine_context_call.cc
+++ b/src/pass/combine_context_call.cc
@@ -90,7 +90,7 @@ class ContextCallCombiner final : public IRMutator {
 };
 
 LoweredFunc CombineContextCall(LoweredFunc f) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ContextCallCombiner().Combine(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/ir_util.cc b/src/pass/ir_util.cc
index d06839beca33..89426f982ba8 100644
--- a/src/pass/ir_util.cc
+++ b/src/pass/ir_util.cc
@@ -13,38 +13,38 @@ Stmt MergeNest(const std::vector<Stmt>& nest, Stmt body) {
   for (auto ri = nest.rbegin(); ri != nest.rend(); ++ri) {
     Stmt s = *ri;
     if (s.as<For>()) {
-      auto n = std::make_shared<For>(*s.as<For>());
+      auto n = make_node<For>(*s.as<For>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<LetStmt>()) {
-      auto n = std::make_shared<LetStmt>(*s.as<LetStmt>());
+      auto n = make_node<LetStmt>(*s.as<LetStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<AttrStmt>()) {
-      auto n = std::make_shared<AttrStmt>(*s.as<AttrStmt>());
+      auto n = make_node<AttrStmt>(*s.as<AttrStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<IfThenElse>()) {
-      auto n = std::make_shared<IfThenElse>(*s.as<IfThenElse>());
+      auto n = make_node<IfThenElse>(*s.as<IfThenElse>());
       CHECK(is_no_op(n->then_case));
       CHECK(!n->else_case.defined());
       n->then_case = body;
       body = Stmt(n);
     } else if (s.as<Block>()) {
-      auto n = std::make_shared<Block>(*s.as<Block>());
+      auto n = make_node<Block>(*s.as<Block>());
       CHECK(is_no_op(n->rest));
       n->rest = body;
       body = Stmt(n);
     } else if (s.as<AssertStmt>()) {
-      auto n = std::make_shared<AssertStmt>(*s.as<AssertStmt>());
+      auto n = make_node<AssertStmt>(*s.as<AssertStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<Allocate>()) {
-      auto n = std::make_shared<Allocate>(*s.as<Allocate>());
+      auto n = make_node<Allocate>(*s.as<Allocate>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc
index b38051326d1d..1a9caf4b591e 100644
--- a/src/pass/lower_intrin.cc
+++ b/src/pass/lower_intrin.cc
@@ -104,7 +104,7 @@ class IntrinInjecter : public IRMutator {
 
 LoweredFunc
 LowerIntrin(LoweredFunc f, const std::string& target) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = IntrinInjecter(target).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_thread_allreduce.cc b/src/pass/lower_thread_allreduce.cc
index 4d7f086d0534..2f700ed9112d 100644
--- a/src/pass/lower_thread_allreduce.cc
+++ b/src/pass/lower_thread_allreduce.cc
@@ -317,7 +317,7 @@ class ThreadAllreduceBuilder final : public IRMutator {
 LoweredFunc
 LowerThreadAllreduce(LoweredFunc f, int warp_size) {
   CHECK_NE(f->func_type, kHostFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ThreadAllreduceBuilder(warp_size).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index 46686a65803a..cf3d9f7eeeb1 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -288,7 +288,7 @@ class BuiltinLower : public IRMutator {
 };
 
 LoweredFunc LowerTVMBuiltin(LoweredFunc f) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = BuiltinLower().Build(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc
index 85ae365f2a82..01ab2b51752e 100644
--- a/src/pass/lower_warp_memory.cc
+++ b/src/pass/lower_warp_memory.cc
@@ -93,7 +93,7 @@ class WarpStoreCoeffFinder : private IRVisitor {
         arith::DetectLinearEquation(index, {warp_index_});
     CHECK_EQ(m.size(), 2U)
         << "LowerWarpMemory failed due to store index=" << index;
-    int coeff;
+    int coeff = 0;
     Expr mcoeff = ir::Simplify(m[0]);
 
     CHECK(arith::GetConstInt(mcoeff, &coeff) && coeff > 0)
@@ -317,7 +317,7 @@ class WarpMemoryRewriter : private IRMutator {
 LoweredFunc
 LowerWarpMemory(LoweredFunc f, int warp_size) {
   CHECK_EQ(f->func_type, kDeviceFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = WarpMemoryRewriter(warp_size).Rewrite(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 8113c58f3f78..41f92ad24085 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -132,7 +132,7 @@ LoweredFunc MakeAPI(Stmt body,
     }
   }
 
-  std::shared_ptr<LoweredFuncNode> n = std::make_shared<LoweredFuncNode>();
+  NodePtr<LoweredFuncNode> n = make_node<LoweredFuncNode>();
   n->name = name;
   n->args = args;
   n->handle_data_type = binder.def_handle_dtype();
@@ -197,7 +197,7 @@ class DeviceTypeBinder: public IRMutator {
 
 LoweredFunc BindDeviceType(LoweredFunc f,
                            int device_type) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = DeviceTypeBinder(device_type).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/remap_thread_axis.cc b/src/pass/remap_thread_axis.cc
index 94e4819a1d71..08a62b25e2c4 100644
--- a/src/pass/remap_thread_axis.cc
+++ b/src/pass/remap_thread_axis.cc
@@ -67,7 +67,7 @@ RemapThreadAxis(LoweredFunc f, Map<Expr, IterVar> thread_map) {
   }
 
   CHECK_EQ(f->func_type, kDeviceFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   // replace the thread axis
   for (size_t i = 0; i < n->thread_axis.size(); ++i) {
     auto it = tmap.find(n->thread_axis[i]->thread_tag);
diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc
index c7b20e137638..112c2c173df1 100644
--- a/src/pass/split_host_device.cc
+++ b/src/pass/split_host_device.cc
@@ -165,8 +165,8 @@ class HostDeviceSplitter : public IRMutator {
       handle_data_type_[kv.first.get()] = kv.second;
     }
     name_ = f->name;
-    std::shared_ptr<LoweredFuncNode> n =
-        std::make_shared<LoweredFuncNode>(*f.operator->());
+    NodePtr<LoweredFuncNode> n =
+        make_node<LoweredFuncNode>(*f.operator->());
     n->body = this->Mutate(f->body);
     n->func_type = kHostFunc;
     Array<LoweredFunc> ret{LoweredFunc(n)};
@@ -180,7 +180,7 @@ class HostDeviceSplitter : public IRMutator {
   Stmt SplitDeviceFunc(Stmt body) {
     std::ostringstream os;
     os << name_ << "_kernel" << device_funcs_.size();
-    std::shared_ptr<LoweredFuncNode> n = std::make_shared<LoweredFuncNode>();
+    NodePtr<LoweredFuncNode> n = make_node<LoweredFuncNode>();
     // isolate the device function.
     IRUseDefAnalysis m;
     m.visit_thread_extent_ = false;
diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 58b62f291d39..2bab21d85737 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -950,8 +950,7 @@ class VectorAllocRewriter : public IRMutator {
 
 
 LoweredFunc PointerValueTypeRewrite(LoweredFunc f) {
-  std::shared_ptr<LoweredFuncNode> n =
-      std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   VectorAllocRewriter rewriter;
   n->body = rewriter.Mutate(n->body);
   for (Var arg : f->args) {
diff --git a/src/pass/storage_sync.cc b/src/pass/storage_sync.cc
index 43f3b94d114f..6f7fc886fd8c 100644
--- a/src/pass/storage_sync.cc
+++ b/src/pass/storage_sync.cc
@@ -329,7 +329,7 @@ Stmt ThreadSync(Stmt stmt, std::string storage_scope) {
 
 LoweredFunc ThreadSync(LoweredFunc f, std::string storage_scope) {
   CHECK_NE(f->func_type, kHostFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ThreadSync(f->body, storage_scope);
   return LoweredFunc(n);
 }
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
index 7e7fb71f6d6c..97ac9e52a4c2 100644
--- a/src/relay/ir/base.cc
+++ b/src/relay/ir/base.cc
@@ -12,50 +12,39 @@ namespace relay {
 using tvm::IRPrinter;
 using namespace tvm::runtime;
 
-SourceName SourceNameNode::make(std::string name) {
-  std::shared_ptr<SourceNameNode> n = std::make_shared<SourceNameNode>();
-  n->name = std::move(name);
-  return SourceName(n);
-}
-
-std::shared_ptr<SourceNameNode> CreateSourceName(const std::string& name) {
-  SourceName sn = SourceName::Get(name);
-  CHECK(!sn.defined()) << "Cannot find source name \'" << name << '\'';
-  std::shared_ptr<Node> node = sn.node_;
-  return std::dynamic_pointer_cast<SourceNameNode>(node);
-}
-
-const SourceName& SourceName::Get(const std::string& name) {
-  static std::unordered_map<std::string, SourceName> source_map;
+NodePtr<SourceNameNode> GetSourceNameNode(const std::string& name) {
+  // always return pointer as the reference can change as map re-allocate.
+  // or use another level of indirection by creating a unique_ptr
+  static std::unordered_map<std::string, NodePtr<SourceNameNode> > source_map;
 
   auto sn = source_map.find(name);
   if (sn == source_map.end()) {
-    auto source_name = SourceNameNode::make(name);
-    source_map.insert({name, source_name});
-    return source_map.at(name);
+    NodePtr<SourceNameNode> n = make_node<SourceNameNode>();
+    n->name = std::move(name);
+    source_map[name] = n;
+    return n;
   } else {
     return sn->second;
   }
 }
 
-TVM_REGISTER_API("relay._make.SourceName")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue *ret) {
-      *ret = SourceNameNode::make(args[0]);
-    });
+SourceName SourceName::Get(const std::string& name) {
+  return SourceName(GetSourceNameNode(name));
+}
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<SourceNameNode>([](const SourceNameNode *node, tvm::IRPrinter *p) {
-      p->stream << "SourceNameNode(" << node->name << ", " << node << ")";
-    });
+.set_dispatch<SourceNameNode>([](const SourceNameNode *node, tvm::IRPrinter *p) {
+    p->stream << "SourceName(" << node->name << ", " << node << ")";
+  });
 
 TVM_REGISTER_NODE_TYPE(SourceNameNode)
-.set_creator(CreateSourceName)
+.set_creator(GetSourceNameNode)
 .set_global_key([](const Node* n) {
     return static_cast<const SourceNameNode*>(n)->name;
   });
 
 Span SpanNode::make(SourceName source, int lineno, int col_offset) {
-  std::shared_ptr<SpanNode> n = std::make_shared<SpanNode>();
+  auto n = make_node<SpanNode>();
   n->source = std::move(source);
   n->lineno = lineno;
   n->col_offset = col_offset;
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index 47c9789ab5ae..16b0314507cf 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -15,7 +15,7 @@ using tvm::IRPrinter;
 using namespace runtime;
 
 Environment EnvironmentNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
-  std::shared_ptr<EnvironmentNode> n = std::make_shared<EnvironmentNode>();
+  auto n = make_node<EnvironmentNode>();
   n->functions = std::move(global_funcs);
   return Environment(n);
 }
@@ -31,20 +31,22 @@ GlobalVar EnvironmentNode::GetGlobalVar(const std::string &str) {
   }
 }
 
-/*! \brief Add a new item to the global environment
+/*!
+ * \brief Add a new item to the global environment
  * \note if the update flag is not set adding a duplicate
  * definition will trigger an exception, otherwise we will
  * update the definition if and only if it is type compatible.
  */
-void EnvironmentNode::Add(const GlobalVar &var, const Function &func,
+void EnvironmentNode::Add(const GlobalVar &var,
+                          const Function &func,
                           bool update) {
   // Type check the item before we add it to the environment.
-  auto env = GetRef<Environment>(this);
+  auto env = relay::GetRef<Environment>(this);
 
   Expr checked_expr = InferType(env, var, func);
 
   if (const FunctionNode *func_node = checked_expr.as<FunctionNode>()) {
-    auto checked_func = GetRef<Function>(func_node);
+    auto checked_func = relay::GetRef<Function>(func_node);
     auto type = checked_func->checked_type();
 
     CHECK(IsFullyResolved(type));
@@ -100,46 +102,46 @@ void EnvironmentNode::Merge(const Environment &env) {
 }
 
 TVM_REGISTER_API("relay._make.Environment")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      *ret = EnvironmentNode::make(args[0]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = EnvironmentNode::make(args[0]);
+  });
 
 TVM_REGISTER_API("relay._env.Environment_Add")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      Environment env = args[0];
-      env->Add(args[1], args[2], false);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Environment env = args[0];
+    env->Add(args[1], args[2], false);
+  });
 
 TVM_REGISTER_API("relay._env.Environment_GetGlobalVar")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      Environment env = args[0];
-      *ret = env->GetGlobalVar(args[1]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Environment env = args[0];
+    *ret = env->GetGlobalVar(args[1]);
+  });
 
 TVM_REGISTER_API("relay._env.Environment_Lookup")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      Environment env = args[0];
-      GlobalVar var = args[1];
-      *ret = env->Lookup(var);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Environment env = args[0];
+    GlobalVar var = args[1];
+    *ret = env->Lookup(var);
+  });
 
 TVM_REGISTER_API("relay._env.Environment_Lookup_str")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      Environment env = args[0];
-      std::string var_name = args[1];
-      auto var = env->GetGlobalVar(var_name);
-      *ret = env->Lookup(var);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Environment env = args[0];
+    std::string var_name = args[1];
+    auto var = env->GetGlobalVar(var_name);
+    *ret = env->Lookup(var);
+  });
 
 TVM_REGISTER_API("relay._env.Environment_Merge")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      Environment env = args[0];
-      env->Merge(args[1]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Environment env = args[0];
+    env->Merge(args[1]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<EnvironmentNode>([](const EnvironmentNode *node,
-                                      tvm::IRPrinter *p) {
+.set_dispatch<EnvironmentNode>(
+    [](const EnvironmentNode *node, tvm::IRPrinter *p) {
       p->stream << "EnvironmentNode( " << node->functions << ")";
     });
 
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index f4363f5312c4..241ccc0b85c3 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -3,7 +3,6 @@
  * \file src/tvm/ir/expr.cc
  * \brief The expression AST nodes of Relay.
  */
-#include <tvm/ir_functor.h>
 #include <tvm/relay/expr.h>
 
 namespace tvm {
@@ -13,21 +12,20 @@ using tvm::IRPrinter;
 using namespace tvm::runtime;
 
 Constant ConstantNode::make(runtime::NDArray data) {
-  std::shared_ptr<ConstantNode> n = std::make_shared<ConstantNode>();
+  NodePtr<ConstantNode> n = make_node<ConstantNode>();
   n->data = std::move(data);
   return Constant(n);
 }
 
 TVM_REGISTER_API("relay._make.Constant")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      *ret = ConstantNode::make(args[0]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = ConstantNode::make(args[0]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<ConstantNode>([](const ConstantNode *node,
-                                   tvm::IRPrinter *p) {
-      p->stream << "ConstantNode(TODO)";
-    });
+.set_dispatch<ConstantNode>([](const ConstantNode *node, tvm::IRPrinter *p) {
+    p->stream << "Constant(TODO)";
+  });
 
 TensorType ConstantNode::tensor_type() const {
   auto dtype = TVMType2Type(data->dtype);
@@ -41,57 +39,55 @@ TensorType ConstantNode::tensor_type() const {
 }
 
 Tuple TupleNode::make(tvm::Array<relay::Expr> fields) {
-  std::shared_ptr<TupleNode> n = std::make_shared<TupleNode>();
+  NodePtr<TupleNode> n = make_node<TupleNode>();
   n->fields = std::move(fields);
   return Tuple(n);
 }
 
 TVM_REGISTER_API("relay._make.Tuple")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      *ret = TupleNode::make(args[0]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = TupleNode::make(args[0]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<TupleNode>([](const TupleNode *node, tvm::IRPrinter *p) {
-      p->stream << "TupleNode(" << node->fields << ")";
-    });
+.set_dispatch<TupleNode>([](const TupleNode *node, tvm::IRPrinter *p) {
+    p->stream << "Tuple(" << node->fields << ")";
+  });
 
 Var VarNode::make(std::string name_hint) {
-  std::shared_ptr<VarNode> n = std::make_shared<VarNode>();
+  NodePtr<VarNode> n = make_node<VarNode>();
   n->name_hint = std::move(name_hint);
   return Var(n);
 }
 
 TVM_REGISTER_API("relay._make.Var")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      *ret = VarNode::make(args[0]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = VarNode::make(args[0]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<VarNode>([](const VarNode *node,
-                                   tvm::IRPrinter *p) {
-      p->stream << "VarNode(" << node->name_hint << ")";
-    });
+.set_dispatch<VarNode>([](const VarNode *node, tvm::IRPrinter *p) {
+    p->stream << "Var(" << node->name_hint << ")";
+  });
 
 GlobalVar GlobalVarNode::make(std::string name_hint) {
-  std::shared_ptr<GlobalVarNode> n = std::make_shared<GlobalVarNode>();
+  NodePtr<GlobalVarNode> n = make_node<GlobalVarNode>();
   n->name_hint = std::move(name_hint);
   return GlobalVar(n);
 }
 
 TVM_REGISTER_API("relay._make.GlobalVar")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      *ret = GlobalVarNode::make(args[0]);
-    });
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = GlobalVarNode::make(args[0]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<GlobalVarNode>([](const GlobalVarNode *node,
-                                    tvm::IRPrinter *p) {
-      p->stream << "GlobalVarNode(" << node->name_hint << ")";
-    });
+.set_dispatch<GlobalVarNode>([](const GlobalVarNode *node, tvm::IRPrinter *p) {
+    p->stream << "GlobalVar(" << node->name_hint << ")";
+  });
 
 Param ParamNode::make(Var var, Type type) {
-  std::shared_ptr<ParamNode> n = std::make_shared<ParamNode>();
+  NodePtr<ParamNode> n = make_node<ParamNode>();
   n->var = std::move(var);
   n->type = std::move(type);
   return Param(n);
@@ -104,12 +100,12 @@ TVM_REGISTER_API("relay._make.Param")
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<ParamNode>([](const ParamNode *node, tvm::IRPrinter *p) {
-  p->stream << "ParamNode(" << node->var << ", " << node->type << ")";
+    p->stream << "Param(" << node->var << ", " << node->type << ")";
 });
 
 Function FunctionNode::make(tvm::Array<Param> params, Type ret_type, Expr body,
                             tvm::Array<TypeParam> type_params) {
-  std::shared_ptr<FunctionNode> n = std::make_shared<FunctionNode>();
+  NodePtr<FunctionNode> n = make_node<FunctionNode>();
   n->params = std::move(params);
   n->ret_type = std::move(ret_type);
   n->body = std::move(body);
@@ -140,7 +136,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 
 Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
                     Array<Type> type_args) {
-  std::shared_ptr<CallNode> n = std::make_shared<CallNode>();
+  NodePtr<CallNode> n = make_node<CallNode>();
   n->op = std::move(op);
   n->args = std::move(args);
   n->attrs = std::move(attrs);
@@ -160,7 +156,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 Let LetNode::make(Var var, Expr value, Expr body, Type value_type) {
-  std::shared_ptr<LetNode> n = std::make_shared<LetNode>();
+  NodePtr<LetNode> n = make_node<LetNode>();
   n->var = std::move(var);
   n->value = std::move(value);
   n->body = std::move(body);
@@ -180,7 +176,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
-  std::shared_ptr<IfNode> n = std::make_shared<IfNode>();
+  NodePtr<IfNode> n = make_node<IfNode>();
   n->cond = std::move(cond);
   n->true_branch = std::move(true_branch);
   n->false_branch = std::move(false_branch);
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
index d1a9dd072d31..4826aed54ba5 100644
--- a/src/relay/ir/op.cc
+++ b/src/relay/ir/op.cc
@@ -51,7 +51,7 @@ const Op& Op::Get(const std::string& name) {
 
 OpRegistry::OpRegistry() {
   OpManager* mgr = OpManager::Global();
-  std::shared_ptr<OpNode> n = std::make_shared<OpNode>();
+  NodePtr<OpNode> n = make_node<OpNode>();
   n->index_ = mgr->op_counter++;
   op_ = Op(n);
 }
@@ -90,14 +90,14 @@ void OpRegistry::UpdateAttr(const std::string& key, TVMRetValue value,
 
 // Frontend APIs
 TVM_REGISTER_API("relay.op._ListOpNames")
-    .set_body_typed<Array<tvm::Expr>()>([]() {
-      Array<tvm::Expr> ret;
-      for (const std::string& name :
-           dmlc::Registry<OpRegistry>::ListAllNames()) {
-        ret.push_back(tvm::Expr(name));
-      }
-      return ret;
-    });
+.set_body_typed<Array<tvm::Expr>()>([]() {
+    Array<tvm::Expr> ret;
+    for (const std::string& name :
+             dmlc::Registry<OpRegistry>::ListAllNames()) {
+      ret.push_back(tvm::Expr(name));
+    }
+    return ret;
+  });
 
 TVM_REGISTER_API("relay.op._GetOp").set_body_typed<Op(std::string)>(Op::Get);
 
@@ -138,11 +138,10 @@ TVM_REGISTER_API("relay.op._Register")
       }
     });
 
-std::shared_ptr<OpNode> CreateOp(const std::string& name) {
+NodePtr<Node> CreateOp(const std::string& name) {
   auto op = Op::Get(name);
   CHECK(!op.defined()) << "Cannot find op \'" << name << '\'';
-  std::shared_ptr<Node> node = op.node_;
-  return std::dynamic_pointer_cast<OpNode>(node);
+  return op.node_;
 }
 
 TVM_REGISTER_NODE_TYPE(OpNode)
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index c13fea26dacd..fce01390fa94 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -3,7 +3,6 @@
  * \file src/tvm/ir/type.cc
  * \brief The type system AST nodes of Relay.
  */
-#include <tvm/ir_functor.h>
 #include <tvm/relay/type.h>
 
 namespace tvm {
@@ -13,7 +12,7 @@ using tvm::IRPrinter;
 using namespace tvm::runtime;
 
 TensorType TensorTypeNode::make(Array<ShapeExpr> shape, DataType dtype) {
-  std::shared_ptr<TensorTypeNode> n = std::make_shared<TensorTypeNode>();
+  NodePtr<TensorTypeNode> n = make_node<TensorTypeNode>();
   n->shape = std::move(shape);
   n->dtype = std::move(dtype);
   return TensorType(n);
@@ -36,7 +35,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 TypeParam TypeParamNode::make(std::string name, TypeParamNode::Kind kind) {
-  std::shared_ptr<TypeParamNode> n = std::make_shared<TypeParamNode>();
+  NodePtr<TypeParamNode> n = make_node<TypeParamNode>();
   n->var = tvm::Var(name);
   n->kind = std::move(kind);
   return TypeParam(n);
@@ -59,7 +58,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 FuncType FuncTypeNode::make(tvm::Array<Type> arg_types, Type ret_type,
                             tvm::Array<TypeParam> type_params,
                             tvm::Array<TypeConstraint> type_constraints) {
-  std::shared_ptr<FuncTypeNode> n = std::make_shared<FuncTypeNode>();
+  NodePtr<FuncTypeNode> n = make_node<FuncTypeNode>();
   n->arg_types = std::move(arg_types);
   n->ret_type = std::move(ret_type);
   n->type_params = std::move(type_params);
@@ -81,7 +80,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 TypeRelation TypeRelationNode::make(std::string name, TypeRelationFn func, Array<Type> args) {
-  std::shared_ptr<TypeRelationNode> n = std::make_shared<TypeRelationNode>();
+  NodePtr<TypeRelationNode> n = make_node<TypeRelationNode>();
   n->name = std::move(name);
   n->func_ = std::move(func);
   n->args = std::move(args);
@@ -101,7 +100,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 TupleType TupleTypeNode::make(Array<Type> fields) {
-  std::shared_ptr<TupleTypeNode> n = std::make_shared<TupleTypeNode>();
+  NodePtr<TupleTypeNode> n = make_node<TupleTypeNode>();
   n->fields = std::move(fields);
   return TupleType(n);
 }
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 522eb93483fb..91d2d5822110 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -10,10 +10,9 @@
  *
  * For example tensors are not allowed to contain functions in Relay.
  *
- * We check this by ensuring the `dtype` field of a Tensor always 
+ * We check this by ensuring the `dtype` field of a Tensor always
  * contains a data type such as `int`, `float`, `uint`.
  */
-#include <tvm/ir_functor.h>
 #include <tvm/relay/pass.h>
 #include "./type_visitor.h"
 
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index 339552108af4..cccde62625ea 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -6,7 +6,7 @@
 #ifndef TVM_RELAY_PASS_TYPE_FUNCTOR_H_
 #define TVM_RELAY_PASS_TYPE_FUNCTOR_H_
 
-#include <tvm/ir_functor.h>
+#include <tvm/node/ir_functor.h>
 #include <tvm/relay/expr.h>
 #include "./incomplete_type.h"
 
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index f4f6d82eb5e1..deed982acbc6 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -137,7 +137,7 @@ class TypeInferencer : private ExprFunctor<CheckedExpr(const Expr&)> {
   void Solve(TypeRelationData& ty_rel);
 
   /*! \brief Attempt to solve all pending relations.
-   * 
+   *
    * If the solver
    */
   SolverResult Solve(std::vector<TypeRelationData>& rels);
@@ -607,8 +607,7 @@ TVM_REGISTER_API("relay._ir_pass._get_checked_type")
 /* Incomplete Type */
 
 IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
-  std::shared_ptr<IncompleteTypeNode> n =
-      std::make_shared<IncompleteTypeNode>();
+  auto n = make_node<IncompleteTypeNode>();
   n->kind = std::move(kind);
   return IncompleteType(n);
 }
diff --git a/src/relay/pass/unifier.cc b/src/relay/pass/unifier.cc
index b0ed71d17911..67cc58ffc0a3 100644
--- a/src/relay/pass/unifier.cc
+++ b/src/relay/pass/unifier.cc
@@ -21,7 +21,7 @@ using tvm::IRPrinter;
 using namespace tvm::runtime;
 
 UnionFind UnionFindNode::make(tvm::Map<IncompleteType, Type> uf_map) {
-  std::shared_ptr<UnionFindNode> n = std::make_shared<UnionFindNode>();
+  auto n = make_node<UnionFindNode>();
   n->uf_map = uf_map;
   return UnionFind(n);
 }
@@ -130,7 +130,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     });
 
 TypeUnifier TypeUnifierNode::make(UnionFind union_find) {
-  std::shared_ptr<TypeUnifierNode> n = std::make_shared<TypeUnifierNode>();
+  auto n = make_node<TypeUnifierNode>();
   n->union_find = union_find;
   return TypeUnifier(n);
 }
diff --git a/src/relay/pass/unifier.h b/src/relay/pass/unifier.h
index 4e939cc26bca..feda644cdd1d 100644
--- a/src/relay/pass/unifier.h
+++ b/src/relay/pass/unifier.h
@@ -67,7 +67,7 @@ class UnionFindNode : public Node {
 class UnionFind : public NodeRef {
  public:
   UnionFind() {}
-  explicit UnionFind(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+  explicit UnionFind(NodePtr<tvm::Node> p) : NodeRef(p) {}
 
   // The union find structure is mutable so we do not use the standard macros
   // and expose the pointer via `->`.
@@ -126,7 +126,7 @@ class TypeUnifierNode : public Node,
 class TypeUnifier : public NodeRef {
  public:
   TypeUnifier() {}
-  explicit TypeUnifier(std::shared_ptr<tvm::Node> p) : NodeRef(p) {}
+  explicit TypeUnifier(NodePtr<tvm::Node> p) : NodeRef(p) {}
 
   // no const so that unifier can be mutable as a member of typechecker
   inline TypeUnifierNode* operator->() const {
diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc
index fa26aea51a2b..8591c77bd7cc 100644
--- a/src/schedule/schedule_dataflow_rewrite.cc
+++ b/src/schedule/schedule_dataflow_rewrite.cc
@@ -46,7 +46,7 @@ Expr InjectPredicate(const Array<Expr>& predicates,
   if (predicates.size() == 0) return body;
   const Reduce* reduce = body.as<Reduce>();
   if (reduce) {
-    std::shared_ptr<Reduce> n = std::make_shared<Reduce>(*reduce);
+    auto n = make_node<Reduce>(*reduce);
     n->condition = n->condition && arith::ComputeReduce<ir::And>(predicates, Expr());
     return Expr(n);
   }
@@ -400,7 +400,7 @@ void InjectInline(ScheduleNode* sch) {
               CHECK_EQ(new_body[j].size(), r->source.size());
               CHECK(r != nullptr);
               for (size_t k = 0; k < new_body[j].size(); ++k) {
-                std::shared_ptr<ir::Reduce> n = std::make_shared<ir::Reduce>(*r);
+                auto n = make_node<ir::Reduce>(*r);
                 n->value_index = static_cast<int>(k);
                 n->type = r->source[k].type();
                 new_body[j].Set(k, Expr(n));
@@ -520,11 +520,11 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
   const int factor_axis_pos = \
       factor_axis >= 0 ? factor_axis : static_cast<int>(compute_op->axis.size() + 1) + factor_axis;
   CHECK_LE(factor_axis_pos, compute_op->axis.size());
-  auto n = std::make_shared<ComputeOpNode>();
+  auto n = make_node<ComputeOpNode>();
   n->name = compute_op->name + ".rf";
   {
     // axis relacement.
-    auto iv_node = std::make_shared<IterVarNode>();
+    auto iv_node = make_node<IterVarNode>();
     iv_node->dom = dom_map.at(axis);
     CHECK(is_zero(iv_node->dom->min))
         << "Can only factor reduction domain starting from 0";
@@ -565,7 +565,7 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
   for (IterVar iv : reduce_stage->leaf_iter_vars) {
     if (touch_map.count(iv) && !iv.same_as(axis)) {
       CHECK_EQ(iv->iter_type, kCommReduce);
-      auto ncpy = std::make_shared<IterVarNode>(*iv.operator->());
+      auto ncpy = make_node<IterVarNode>(*iv.operator->());
       ncpy->dom = dom_map.at(iv);
       n->reduce_axis.push_back(IterVar(ncpy));
     }
diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index 1490c85ff786..d503e978887e 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -70,7 +70,7 @@ void Split(StageNode* self,
 }  // namespace
 
 Stage::Stage(Operation op) {
-  auto n = std::make_shared<StageNode>();
+  auto n = make_node<StageNode>();
   n->op = op;
   n->origin_op = op;
   n->all_iter_vars = op->root_iter_vars();
@@ -164,16 +164,16 @@ Stage& Stage::bind(IterVar ivar, IterVar thread_ivar) {   // NOLINT(*)
   FindLeafVar(all_vars, leaf_vars, ivar);
 
   auto it = self->iter_var_attrs.find(ivar);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
     if (n->bind_thread.defined() &&
         !n->bind_thread.same_as(thread_ivar)) {
       LOG(WARNING) << "Axis " << ivar
                    << " is already bind to another thread " << n->bind_thread;
     }
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   n->bind_thread = thread_ivar;
   self->iter_var_attrs.Set(ivar, IterVarAttr(n));
@@ -188,7 +188,7 @@ Stage& Stage::env_threads(Array<IterVar> threads) {
       << "Already set env_threads";
   ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
   ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-  std::vector<std::shared_ptr<Node> > temp;
+  std::vector<NodePtr<Node> > temp;
   for (IterVar iv : threads) {
     temp.push_back(iv.node_);
   }
@@ -303,7 +303,7 @@ Stage& Stage::reorder(const Array<IterVar>& order) {  // NOLINT(*)
   for (size_t i = 0; i < order.size(); ++i) {
     pos.push_back(FindLeafVar(all_vars, leaf_vars, order[i]));
   }
-  std::vector<std::shared_ptr<Node> > temp;
+  std::vector<NodePtr<Node> > temp;
   for (size_t i = 0; i < pos.size(); ++i) {
     temp.emplace_back(leaf_vars->data[pos[i]]);
   }
@@ -335,11 +335,11 @@ inline void UpdateIterVarAttr(StageNode* self,
     FindLeafVar(all_vars, leaf_vars, var);
   }
   auto it = self->iter_var_attrs.find(var);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   fupdate(n.get());
   self->iter_var_attrs.Set(var, IterVarAttr(n));
@@ -397,11 +397,11 @@ Stage& Stage::prefetch(const Tensor &tensor, IterVar var, Expr offset) {
   ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
   FindLeafVar(all_vars, leaf_vars, var);
   auto it = self->iter_var_attrs.find(var);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   n->prefetch_data.push_back(tensor);
   n->prefetch_offset.push_back(offset);
@@ -468,8 +468,8 @@ Stage& Stage::opengl() {
 }
 
 Stage CopyStage(const Stage& s) {
-  std::shared_ptr<StageNode> n =
-      std::make_shared<StageNode>(*s.operator->());
+  NodePtr<StageNode> n =
+      make_node<StageNode>(*s.operator->());
   return Stage(n);
 }
 
@@ -477,7 +477,7 @@ Schedule Schedule::copy() const {
   // map of stages.
   const ScheduleNode* self = operator->();
   std::unordered_map<Stage, Stage, NodeHash, NodeEqual> smap;
-  std::shared_ptr<ScheduleNode> n = std::make_shared<ScheduleNode>();
+  NodePtr<ScheduleNode> n = make_node<ScheduleNode>();
   n->outputs = self->outputs;
   // Copy the stages.
   for (Stage s : self->stages) {
@@ -599,7 +599,7 @@ Stage Schedule::create_group(const Array<Tensor>& outputs,
     }
   }
   // Create the new group stage.
-  Stage gstage(std::make_shared<StageNode>());
+  Stage gstage(make_node<StageNode>());
   gstage->group = parent_group;
   if (parent_group.defined()) {
     ++parent_group->num_child_stages;
@@ -687,7 +687,7 @@ void ScheduleNode::InitCache() {
 }
 
 Schedule ScheduleNode::make(Array<Operation> ops) {
-  auto n = std::make_shared<ScheduleNode>();
+  auto n = make_node<ScheduleNode>();
   Schedule sch(n);
   n->outputs = ops;
   auto g = schedule::CreateReadGraph(n->outputs);
@@ -731,7 +731,7 @@ IterVarRelation SplitNode::make(IterVar parent,
                                 IterVar inner,
                                 Expr factor,
                                 Expr nparts) {
-  auto n = std::make_shared<SplitNode>();
+  auto n = make_node<SplitNode>();
   n->parent = parent;
   n->outer = outer;
   n->inner = inner;
@@ -742,7 +742,7 @@ IterVarRelation SplitNode::make(IterVar parent,
 
 IterVarRelation FuseNode::make(
     IterVar outer, IterVar inner, IterVar fused) {
-  auto n = std::make_shared<FuseNode>();
+  auto n = make_node<FuseNode>();
   n->outer = outer;
   n->inner = inner;
   n->fused = fused;
@@ -750,14 +750,14 @@ IterVarRelation FuseNode::make(
 }
 
 IterVarRelation RebaseNode::make(IterVar parent, IterVar rebased) {
-  auto n = std::make_shared<RebaseNode>();
+  auto n = make_node<RebaseNode>();
   n->parent = parent;
   n->rebased = rebased;
   return IterVarRelation(n);
 }
 
 IterVarRelation SingletonNode::make(IterVar iter) {
-  auto n = std::make_shared<SingletonNode>();
+  auto n = make_node<SingletonNode>();
   n->iter = iter;
   return IterVarRelation(n);
 }
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index f87924d84619..db140f240344 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -1,7 +1,7 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
 #include <tvm/tvm.h>
-#include <tvm/ir_functor.h>
+#include <tvm/node/ir_functor.h>
 #include <tvm/ir_functor_ext.h>
 
 TEST(IRF, Basic) {
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 7dcd5c921905..818376717176 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -10,6 +10,7 @@ make cython3 || exit -1
 
 # Test extern package package
 cd apps/extension
+rm -rf lib
 make || exit -1
 cd ../..
 python -m nose -v apps/extension/tests || exit -1

From 5ff53c21cb5b46574e213e98bc8cd152fd8585a4 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Thu, 20 Sep 2018 08:37:01 +0530
Subject: [PATCH 110/529] [FRONTEND][TENSORFLOW] GPU support for tensorflow
 models. (#1718)

---
 nnvm/python/nnvm/frontend/tensorflow.py       | 90 +++++++++++++------
 .../frontend/tensorflow/test_forward.py       | 28 ++++--
 tutorials/nnvm/from_tensorflow.py             | 18 ++--
 3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index ab5664678fb6..9c9fac897ab2 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -35,6 +35,7 @@ def __call__(self, inputs, attrs, *args):
         self._ignores.append('use_cudnn_on_gpu')
         self._ignores.append('_node_name')
         self._ignores.append('is_training')
+        self._ignores.append('_target_layout')
         # Retain the names
         try:
             attrs['name'] = attrs['_node_name']
@@ -121,6 +122,9 @@ def _pooling(name):
     def _impl(inputs, attr, params):
 
         attr['data_format'] = attr['data_format'].decode("utf-8")
+        flip_layout = False
+
+        input_shape = attr['_input_shapes'][inputs[0]][0]
 
         if attr['data_format'] == 'NHWC':
             attr['kernel_shape'] = (attr['ksize'][1], attr['ksize'][2])
@@ -129,11 +133,17 @@ def _impl(inputs, attr, params):
         else:
             raise TypeError("Unsupported data_format type : {}".format(attr['data_format']))
 
+        if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC":
+            tmp_shape = attr['_input_shapes'][inputs[0]][0]
+            input_shape = [tmp_shape[ii] for ii in (0, 3, 1, 2)]
+            inputs[0] = _sym.transpose(inputs[0], axes=(0, 3, 1, 2))
+            attr['data_format'] = "NCHW"
+            flip_layout = True
+
         # Fix strides
         attr['strides'] = (attr['strides'][1], attr['strides'][2])
 
         # Fix padding
-        input_shapes = attr['_input_shapes'][inputs[0]]
         attr['padding'] = attr['padding'].decode("utf-8")
 
         if attr['padding'] == 'VALID':
@@ -142,11 +152,11 @@ def _impl(inputs, attr, params):
             stride_h, stride_w = attr['strides']
             kernel_h, kernel_w = attr['kernel_shape']
             if attr['data_format'] == 'NHWC':
-                in_h = input_shapes[0][1]
-                in_w = input_shapes[0][2]
+                in_h = input_shape[1]
+                in_w = input_shape[2]
             else:
-                in_h = input_shapes[0][2]
-                in_w = input_shapes[0][3]
+                in_h = input_shape[2]
+                in_w = input_shape[3]
 
             pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
             pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
@@ -158,7 +168,7 @@ def _impl(inputs, attr, params):
         if name == "avg_pool":
             attr['count_include_pad'] = False
 
-        return AttrCvt(
+        out = AttrCvt(
             op_name=_dimension_picker(name),
             transforms={
                 'kernel_shape':'pool_size',
@@ -166,33 +176,53 @@ def _impl(inputs, attr, params):
             ignores=['ksize'],
             extras={'ceil_mode': False},
             custom_check=_dimension_constraint())(inputs, attr)
+
+        if flip_layout:
+            out = _sym.transpose(out, axes=(0, 2, 3, 1))
+
+        return out
     return _impl
 
 def _conv(opname):
     def _impl(inputs, attr, params):
         attr['data_format'] = attr['data_format'].decode("utf-8")
-        input_shapes = attr['_input_shapes'][inputs[0]]
+        flip_layout = False
 
-        # Extract kernel shape from params
-        conv_param_weights = params[inputs[1].list_output_names()[0]]
+        input_shape = attr['_input_shapes'][inputs[0]][0]
+        weights_shape = params[inputs[1].list_output_names()[0]].shape
+
+        if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC":
+            input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
+            inputs[0] = _sym.transpose(inputs[0], axes=(0, 3, 1, 2))
+            if opname == 'conv':
+                weights_shape = [weights_shape[ii] for ii in (3, 2, 0, 1)]
+                inputs[1] = _sym.transpose(inputs[1], axes=(3, 2, 0, 1))
+            else:
+                weights_shape = [weights_shape[ii] for ii in (2, 3, 0, 1)]
+                inputs[1] = _sym.transpose(inputs[1], axes=(2, 3, 0, 1))
+
+            attr['data_format'] = "NCHW"
+            flip_layout = True
 
         if attr['data_format'] == 'NHWC':
-            kernel_h, kernel_w, _, depth_mult = conv_param_weights.shape
-            attr['kernel_shape'] = (conv_param_weights.shape[0], conv_param_weights.shape[1])
+            kernel_h, kernel_w, _, depth_mult = weights_shape
+            attr['kernel_shape'] = (weights_shape[0], weights_shape[1])
             if opname == 'conv':
-                attr['channels'] = conv_param_weights.shape[3]
+                attr['channels'] = weights_shape[3]
             else:
-                attr['channels'] = input_shapes[0][3] * depth_mult
+                attr['channels'] = input_shape[3] * depth_mult
 
             if 'dilations' in attr:
                 attr['dilations'] = (attr['dilations'][0], attr['dilations'][1])
         elif attr['data_format'] == 'NCHW':
-            depth_mult, _, kernel_h, kernel_w = conv_param_weights.shape
-            attr['kernel_shape'] = (conv_param_weights.shape[2], conv_param_weights.shape[3])
+            depth_mult, _, kernel_h, kernel_w = weights_shape
+            attr['kernel_shape'] = (weights_shape[2], weights_shape[3])
             if opname == 'conv':
-                attr['channels'] = conv_param_weights.shape[1]
+                attr['channels'] = weights_shape[0]
             else:
-                attr['channels'] = input_shapes[0][1] * depth_mult
+                attr['channels'] = input_shape[0] * depth_mult
+                if attr['channels'] < 0:
+                    attr['channels'] *= -1
 
             if 'dilations' in attr:
                 attr['dilations'] = (attr['dilations'][2], attr['dilations'][3])
@@ -215,11 +245,11 @@ def _impl(inputs, attr, params):
             stride_h, stride_w = attr['strides']
             kernel_h, kernel_w = attr['kernel_shape']
             if attr['data_format'] == 'NHWC':
-                in_h = input_shapes[0][1]
-                in_w = input_shapes[0][2]
+                in_h = input_shape[1]
+                in_w = input_shape[2]
             else:
-                in_h = input_shapes[0][2]
-                in_w = input_shapes[0][3]
+                in_h = input_shape[2]
+                in_w = input_shape[3]
 
             pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
             pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
@@ -248,7 +278,7 @@ def _impl(inputs, attr, params):
             else:
                 attr['kernel_layout'] = 'HWOI' if attr['data_format'] == 'NHWC' else 'OIHW'
 
-        return AttrCvt(
+        out = AttrCvt(
             op_name=_dimension_picker('conv'),
             transforms={
                 'kernel_shape': 'kernel_size',
@@ -257,6 +287,11 @@ def _impl(inputs, attr, params):
                 'group': ('groups', 1)},
             extras={'use_bias': len(inputs) == 3},
             custom_check=_dimension_constraint())(inputs, attr)
+
+        if flip_layout:
+            out = _sym.transpose(out, axes=(0, 2, 3, 1))
+
+        return out
     return _impl
 
 def _decode_image():
@@ -305,7 +340,7 @@ def _matmul():
     def _impl(inputs, attr, params):
         channels = _infer_channels(inputs[1], params, not attr['transpose_b'])
         if attr['transpose_a']:
-            inputs[0] = _sym.transpose(inputs[0], axis(1, 0))
+            inputs[0] = _sym.transpose(inputs[0], axes(1, 0))
         if not attr['transpose_b']:
             inputs[1] = _sym.transpose(inputs[1], axes=(1, 0))
         return AttrCvt(op_name="dense",
@@ -948,7 +983,7 @@ def __init__(self):
         self._num_param = 0
         self._num_rnn_layer = False
 
-    def from_tensorflow(self, graph):
+    def from_tensorflow(self, graph, layout="NHWC"):
         """Construct nnvm nodes from tensorflow  graph definition - GraphDef.
 
         Follow the tensorflow graph definition to parse and convert it to NNVM.
@@ -1036,6 +1071,9 @@ def from_tensorflow(self, graph):
                 # Pass the node name too in attr
                 attr["_node_name"] = node.name
 
+                # Pass the target layout
+                attr["_target_layout"] = layout
+
                 #ToDo: Some of the tensorflow operators internaly maintain
                 #execution layers and its output name will the layer number along with
                 #graph node name.eg: Node name:- 'Model/RNN/cell_0/RnnCell', but the
@@ -1265,7 +1303,7 @@ def _fix_extranodes(self, op_name, attr, inputs):
 
         return inputs
 
-def from_tensorflow(graph):
+def from_tensorflow(graph, layout="NHWC"):
     """  Load tensorflow graph which is a python tensorflow graph object into nnvm graph.
     The companion parameters will be handled automatically.
 
@@ -1283,5 +1321,5 @@ def from_tensorflow(graph):
         Dict of converted parameters stored in tvm.ndarray format
     """
     g = GraphProto()
-    sym, params = g.from_tensorflow(graph)
+    sym, params = g.from_tensorflow(graph, layout)
     return sym, params
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 616259504a67..ad7f41a83e62 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -26,11 +26,15 @@
 #######################################################################
 # Generic run functions for TVM & tensorflow
 # ------------------------------------------
-def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype):
+def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype, target='llvm'):
     """ Generic function to compile on nnvm and execute on tvm """
 
-    sym, params = nnvm.frontend.from_tensorflow(graph_def)
-    target = 'llvm'
+    layout = None
+    if target == "cuda":
+        layout = "NCHW"
+
+    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout)
+    target_host = 'llvm'
     if isinstance(input_data, list):
         shape_dict = {}
         dtype_dict = {}
@@ -41,10 +45,10 @@ def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype)
         shape_dict = {input_node: input_data.shape}
         dtype_dict = {input_node: input_data.dtype}
 
-    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+    graph, lib, params = nnvm.compiler.build(sym, target=target, target_host=target_host, shape=shape_dict,
                                              dtype=dtype_dict, params=params)
 
-    ctx = tvm.cpu(0)
+    ctx = tvm.context(target, 0)
     from tvm.contrib import graph_runtime
     m = graph_runtime.create(graph, lib, ctx)
     # set inputs
@@ -106,9 +110,17 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False)
             )
 
         tf_output = run_tf_graph(sess, in_data, in_name, out_name)
-        tvm_output = run_tvm_graph(final_graph_def, in_data,
-                                   in_node, tf_output.shape, tf_output.dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+
+        for device in ["llvm", "cuda"]:
+            ctx = tvm.context(device, 0)
+            if not ctx.exist:
+                print("Skip because %s is not enabled" % device)
+                continue
+
+            tvm_output = run_tvm_graph(final_graph_def, in_data,
+                                       in_node, tf_output.shape, tf_output.dtype, target=device)
+            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+
         sess.close()
 
 #######################################################################
diff --git a/tutorials/nnvm/from_tensorflow.py b/tutorials/nnvm/from_tensorflow.py
index 033cdd8a4cab..7cd7e784e7c4 100644
--- a/tutorials/nnvm/from_tensorflow.py
+++ b/tutorials/nnvm/from_tensorflow.py
@@ -50,6 +50,16 @@
 lable_map = 'imagenet_synset_to_human_label_map.txt'
 lable_map_url = os.path.join(repo_base, lable_map)
 
+# Target settings
+# Use these commented settings to build for cuda.
+#target = 'cuda'
+#target_host = 'llvm'
+#layout = "NCHW"
+#ctx = tvm.gpu(0)
+target = 'llvm'
+target_host = 'llvm'
+layout = None
+ctx = tvm.cpu(0)
 
 ######################################################################
 # Download required files
@@ -99,7 +109,7 @@
 # Results:
 #   sym: nnvm graph for given tensorflow protobuf.
 #   params: params converted from tensorflow params (tensor protobuf).
-sym, params = nnvm.frontend.from_tensorflow(graph_def)
+sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout)
 
 print ("Tensorflow protobuf imported as nnvm graph")
 ######################################################################
@@ -113,18 +123,16 @@
 #   lib: target library which can be deployed on target with tvm runtime.
 
 import nnvm.compiler
-target = 'llvm'
 shape_dict = {'DecodeJpeg/contents': x.shape}
 dtype_dict = {'DecodeJpeg/contents': 'uint8'}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, dtype=dtype_dict, params=params)
+graph, lib, params = nnvm.compiler.build(sym, shape=shape_dict, target=target, target_host=target_host, dtype=dtype_dict, params=params)
 
 ######################################################################
 # Execute the portable graph on TVM
 # ---------------------------------
-# Now we can try deploying the NNVM compiled model on cpu target.
+# Now we can try deploying the NNVM compiled model on target.
 
 from tvm.contrib import graph_runtime
-ctx = tvm.cpu(0)
 dtype = 'uint8'
 m = graph_runtime.create(graph, lib, ctx)
 # set inputs

From b108079574f06abbd9b307b4c3a7776ca9d23b23 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Sep 2018 20:43:26 -0700
Subject: [PATCH 111/529] [CI] always rebuild sphinx-gallery docs from scratch
 (#1742)

---
 tests/scripts/task_python_docs.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 4ff7c490935e..72c9fbf6c0dd 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 mkdir -p docs/_build/html
 rm -rf docs/_build/html/jsdoc
+
+# remove stale tutorials and always build from scratch.
+rm -rf docs/tutorials
+
 # C++ doc
 make doc
 

From 0e16d230e073ae06a9d88549b4309de5633e594e Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Fri, 21 Sep 2018 01:05:35 +0900
Subject: [PATCH 112/529] [TEST][KERAS] convert tvm output to channels_last
 format (#1733)

---
 .../python/frontend/keras/test_forward.py     | 36 +++++++------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index b1c3730820d4..459be8737658 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -21,15 +21,6 @@ def verify_keras_frontend(keras_model, need_transpose=True):
     for layer in keras_model._input_layers:
         in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
 
-    #keras_model._output_coordinates contains the output_node, node_index and tensor_index
-    #get the outshapes from combining output node and tensor index
-    out_shapes = []
-    for layer, node_index, tensor_index in keras_model._output_coordinates:
-        layer_out = layer.output
-        if isinstance(layer.output, list):#if multiple outputs are there
-            layer_out = layer.output[tensor_index]
-        out_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer_out.shape))
-
     def get_keras_output(xs, dtype='float32'):
         return keras_model.predict(xs)
 
@@ -44,20 +35,24 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
         m.set_input(**params)
         m.run()
 
-        out = [m.get_output(i).asnumpy()
-                   for i, shape in enumerate(out_shapes)]
-        return out if len(out) > 1 else out[0]
+        return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())]
+
+    def to_channels_first(arr):
+        return arr.transpose([0, -1] + list(range(1, arr.ndim - 1)))
+
+    def to_channels_last(arr):
+        return arr.transpose([0] + list(range(2, arr.ndim)) + [1])
 
     xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
 
+    keras_out = keras_out if isinstance(keras_out, list) else [keras_out]
     for target, ctx in ctx_list():
-        tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs ] if need_transpose else xs, target, ctx)
-        if isinstance (keras_out, list):
-            for kout, tout in zip(keras_out, tvm_out):
-                np.testing.assert_allclose(kout, tout.reshape(kout.shape), rtol=1e-5, atol=1e-5)
-        else:
-            np.testing.assert_allclose(keras_out, tvm_out.reshape(keras_out.shape), rtol=1e-5, atol=1e-5)
+        tvm_out = get_tvm_output([to_channels_first(x) for x in xs] if need_transpose else xs, target, ctx)
+        for kout, tout in zip(keras_out, tvm_out):
+            if need_transpose:
+                tout = to_channels_last(tout)
+            np.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
 
 def test_forward_elemwise_add():
     r = []
@@ -111,7 +106,6 @@ def test_forward_conv():
                   keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')]
     for conv_func in conv_funcs:
         x = conv_func(data)
-        x = keras.layers.GlobalAveragePooling2D()(x)
         keras_model = keras.models.Model(data, x)
         verify_keras_frontend(keras_model)
 
@@ -119,7 +113,6 @@ def test_forward_conv():
 def test_forward_upsample():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.UpSampling2D(size=(3,3))(data)
-    x = keras.layers.GlobalAveragePooling2D()(x)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
@@ -127,7 +120,6 @@ def test_forward_upsample():
 def test_forward_reshape():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.Reshape(target_shape=(32,32,3))(data)
-    x = keras.layers.GlobalAveragePooling2D()(x)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
@@ -141,7 +133,6 @@ def test_forward_crop():
     x = keras.layers.Cropping2D(cropping=(1, 0))(x)
     x = keras.layers.Cropping2D(cropping=0)(x)
     x = keras.layers.Add()([x, x])
-    x = keras.layers.GlobalAveragePooling2D()(x)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
@@ -189,7 +180,6 @@ def test_forward_activations():
                  keras.layers.Activation('linear')]
     for act_func in act_funcs:
         x = act_func(data)
-        x = keras.layers.GlobalAveragePooling2D()(x)
         keras_model = keras.models.Model(data, x)
         verify_keras_frontend(keras_model)
 

From 7df3b78602ee353864dfe43ca734018adddcf06a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 20 Sep 2018 12:37:18 -0700
Subject: [PATCH 113/529] [TEAM] jroesch -> Reviewer (#1746)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index c220196717c7..42e1468053d8 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -25,6 +25,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Kazutaka Morita](https://github.com/kazum)
 - [Tatsuya Nishiyama](https://github.com/nishi-t)
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
+- [Jared Roesch](https://github.com/jroesch)
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Yao Wang](https://github.com/kevinthesun)

From 981bae8f4513c34b2b2c2c0938e0dbc494cbde61 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 20 Sep 2018 12:37:27 -0700
Subject: [PATCH 114/529] [TEAM] siju-samuel -> Reviewer (#1745)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 42e1468053d8..e06ba5055672 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -27,6 +27,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
 - [Jared Roesch](https://github.com/jroesch)
 - [Siva](https://github.com/srkreddy1238)
+- [Siju Samuel](https://github.com/siju-samuel)
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Yao Wang](https://github.com/kevinthesun)
 - [Jian Weng](https://github.com/were)

From 9015056097cc12aea7fa7b63906bb694e0fbcf6c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 20 Sep 2018 20:17:24 -0700
Subject: [PATCH 115/529] [NODE][RELAY] Move most of the reference related code
 to node (#1747)

---
 include/tvm/node/node.h          | 52 +++++++++++++++--
 include/tvm/relay/base.h         | 37 ------------
 include/tvm/relay/expr.h         |  6 +-
 include/tvm/relay/expr_functor.h | 24 ++++----
 src/relay/ir/environment.cc      |  4 +-
 src/relay/ir/expr_functor.cc     | 97 +++++++++++++++++---------------
 src/relay/pass/type_visitor.h    |  3 +-
 tests/cpp/expr_test.cc           |  2 +-
 8 files changed, 120 insertions(+), 105 deletions(-)

diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h
index d726b1dab660..efa930568c48 100644
--- a/include/tvm/node/node.h
+++ b/include/tvm/node/node.h
@@ -102,10 +102,10 @@ class TVM_DLL Node : public NodeBase {
   template<typename T>
   inline bool is_type() const;
   /*!
-   * \brief Get a NodeRef that holds reference to this Node.
-   * \return the NodeRef
+   * \brief Get a NodePtr that holds reference to this Node.
+   * \return the NodePtr
    */
-  inline NodeRef GetNodeRef() const;
+  inline NodePtr<Node> GetNodePtr() const;
   // node ref can see this
   friend class NodeRef;
   static constexpr const char* _type_key = "Node";
@@ -176,6 +176,32 @@ class NodeRef {
   NodePtr<Node> node_;
 };
 
+/*!
+ * \brief Get a reference type from a Node ptr type
+ *
+ *  It is always important to get a reference type
+ *  if we want to return a value as reference or keep
+ *  the node alive beyond the scope of the function.
+ *
+ * \param ptr The node pointer
+ * \tparam RefType The reference type
+ * \tparam NodeType The node type
+ * \return The corresponding RefType
+ */
+template <typename RefType, typename NodeType>
+inline RefType GetRef(const NodeType* ptr);
+
+/*!
+ * \brief Downcast a base reference type to a more specific type.
+ *
+ * \param ref The inptut reference
+ * \return The corresponding SubRef.
+ * \tparam SubRef The target specific reference type.
+ * \tparam BaseRef the current reference type.
+ */
+template <typename SubRef, typename BaseRef>
+inline SubRef Downcast(BaseRef ref);
+
 /*!
  * \brief helper macro to declare type information in a base node.
  */
@@ -218,8 +244,24 @@ inline bool Node::derived_from() const {
   return this->_DerivedFrom(type_id);
 }
 
-inline NodeRef Node::GetNodeRef() const {
-  return NodeRef(NodePtr<Node>(const_cast<Node*>(this)));
+inline NodePtr<Node> Node::GetNodePtr() const {
+  return NodePtr<Node>(const_cast<Node*>(this));
+}
+
+template <typename RefType, typename NodeType>
+inline RefType GetRef(const NodeType* ptr) {
+  static_assert(std::is_base_of<typename RefType::ContainerType, NodeType>::value,
+                "Can only cast to the ref of same container type");
+  return RefType(ptr->GetNodePtr());
+}
+
+template <typename SubRef, typename BaseRef>
+inline SubRef Downcast(BaseRef ref) {
+  CHECK(ref->template is_type<typename SubRef::ContainerType>() ||
+        ref->template derived_from<typename SubRef::ContainerType>())
+      << "Downcast from " << ref->type_key() << " to "
+      << SubRef::ContainerType::_type_key << " failed.";
+  return SubRef(std::move(ref.node_));
 }
 
 inline const Node* NodeRef::get() const {
diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index ecf45353af67..ab55f6f3965f 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -158,43 +158,6 @@ class RelayNode : public Node {
   TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
 };
 
-/*!
- * \brief Get a reference type from a Node ptr type
- *
- *  It is always important to get a reference type
- *  if we want to return a value as reference or keep
- *  the node alive beyond the scope of the function.
- *
- * \param ptr The node pointer
- * \tparam RefType The reference type
- * \tparam NodeType The node type
- * \return The corresponding RefType
- */
-template <typename RefType, typename NodeType>
-RefType GetRef(const NodeType* ptr) {
-  static_assert(std::is_same<typename RefType::ContainerType, NodeType>::value,
-                "Can only cast to the ref of same container type");
-  return RefType(std::move(ptr->GetNodeRef().node_));
-}
-
-// TODO(@tqchen, @jroesch): can we move these semantics to HalideIR
-template <typename T>
-inline const T* As(const NodeRef& node) {
-  const Node* ptr = static_cast<const Node*>(node.get());
-  if (ptr && (ptr->is_type<T>() || ptr->derived_from<T>())) {
-    return static_cast<const T*>(ptr);
-  }
-  return nullptr;
-}
-
-template <typename SubRef, typename BaseRef>
-SubRef Downcast(BaseRef ref) {
-  CHECK(ref->template is_type<typename SubRef::ContainerType>())
-      << "Downcast from " << ref->type_key() << " to "
-      << SubRef::ContainerType::_type_key << " failed.";
-  return SubRef(ref.node_);
-}
-
 }  // namespace relay
 }  // namespace tvm
 
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 6388e8367bf6..0dc2ff6fce2d 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -65,7 +65,9 @@ class ConstantNode : public ExprNode {
   TensorType tensor_type() const;
 
   /*! \return Whether it is scalar(rank-0 tensor) */
-  bool is_scalar() const { return data->ndim == 0; }
+  bool is_scalar() const {
+    return data->ndim == 0;
+  }
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("data", &data);
@@ -341,7 +343,7 @@ RELAY_DEFINE_NODE_REF(Let, LetNode, Expr);
  *
  * let x = if (true) { 1 } else { 0 }; // x is 1
  * let y = if (false) { 1 } else { 0 }; // y is 0
- * 
+ *
  * \note This is similar to C's ternary operator.
  */
 class If;
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 27bb464b98a3..e79535a5034b 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -139,19 +139,19 @@ class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
 * the cost of using functional updates.
 */
 class ExprMutator
-    : public ::tvm::relay::ExprFunctor<Expr(const Expr&, const Expr&)> {
+    : public ::tvm::relay::ExprFunctor<Expr(const Expr&)> {
  public:
   Expr Mutate(const Expr& expr);
-  Expr VisitExpr_(const VarNode* op, const Expr& e) override;
-  Expr VisitExpr_(const ConstantNode* op, const Expr& e) override;
-  Expr VisitExpr_(const GlobalVarNode* op, const Expr& e) override;
-  Expr VisitExpr_(const OpNode* op, const Expr& expr) override;
-  Expr VisitExpr_(const TupleNode* op, const Expr& e) override;
-  Expr VisitExpr_(const ParamNode* op, const Expr& e) override;
-  Expr VisitExpr_(const FunctionNode* op, const Expr& e) override;
-  Expr VisitExpr_(const CallNode* call_node, const Expr& e) override;
-  Expr VisitExpr_(const LetNode* op, const Expr& e) override;
-  Expr VisitExpr_(const IfNode* op, const Expr& e) override;
+  Expr VisitExpr_(const VarNode* op) override;
+  Expr VisitExpr_(const ConstantNode* op) override;
+  Expr VisitExpr_(const GlobalVarNode* op) override;
+  Expr VisitExpr_(const OpNode* op) override;
+  Expr VisitExpr_(const TupleNode* op) override;
+  Expr VisitExpr_(const ParamNode* op) override;
+  Expr VisitExpr_(const FunctionNode* op) override;
+  Expr VisitExpr_(const CallNode* call_node) override;
+  Expr VisitExpr_(const LetNode* op) override;
+  Expr VisitExpr_(const IfNode* op) override;
   /*! \brief Used to visit the types inside of expressions.
    *
    * Can be overloaded to transform the types in arbitrary
@@ -162,7 +162,7 @@ class ExprMutator
 
  private:
   /*! \brief Internal map used for memoization. */
-  tvm::Map<Expr, Expr> memo_;
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> memo_;
 };
 
 }  // namespace relay
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index 16b0314507cf..d7a28231ceac 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -41,12 +41,12 @@ void EnvironmentNode::Add(const GlobalVar &var,
                           const Function &func,
                           bool update) {
   // Type check the item before we add it to the environment.
-  auto env = relay::GetRef<Environment>(this);
+  auto env = GetRef<Environment>(this);
 
   Expr checked_expr = InferType(env, var, func);
 
   if (const FunctionNode *func_node = checked_expr.as<FunctionNode>()) {
-    auto checked_func = relay::GetRef<Function>(func_node);
+    auto checked_func = GetRef<Function>(func_node);
     auto type = checked_func->checked_type();
 
     CHECK(IsFullyResolved(type));
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 85ae5ffa694e..e3393bdb039b 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -13,33 +13,33 @@ namespace tvm {
 namespace relay {
 
 Expr ExprMutator::Mutate(const Expr& expr) {
-  auto cached_expr = this->memo_.find(expr);
-  if (cached_expr != this->memo_.end()) {
-    return (*cached_expr).second;
+  auto it = this->memo_.find(expr);
+  if (it != this->memo_.end()) {
+    return it->second;
   } else {
-    auto new_expr = this->ExprMutator::VisitExpr(expr, expr);
-    this->memo_.Set(expr, new_expr);
+    Expr new_expr = ExprMutator::VisitExpr(expr);
+    memo_[expr] = new_expr;
     return new_expr;
   }
 }
 
-Expr ExprMutator::VisitExpr_(const VarNode* op, const Expr& expr) {
-  return expr;
+Expr ExprMutator::VisitExpr_(const VarNode* op) {
+  return GetRef<Expr>(op);
 }
 
-Expr ExprMutator::VisitExpr_(const ConstantNode* op, const Expr& expr) {
-  return expr;
+Expr ExprMutator::VisitExpr_(const ConstantNode* op) {
+  return GetRef<Expr>(op);
 }
 
-Expr ExprMutator::VisitExpr_(const GlobalVarNode* op, const Expr& expr) {
-  return expr;
+Expr ExprMutator::VisitExpr_(const GlobalVarNode* op) {
+  return GetRef<Expr>(op);
 }
 
-Expr ExprMutator::VisitExpr_(const OpNode* op, const Expr& expr) {
-  return expr;
+Expr ExprMutator::VisitExpr_(const OpNode* op) {
+  return GetRef<Expr>(op);
 }
 
-Expr ExprMutator::VisitExpr_(const TupleNode* op, const Expr& e) {
+Expr ExprMutator::VisitExpr_(const TupleNode* op) {
   tvm::Array<Expr> fields;
   bool all_fields_unchanged = true;
   for (auto field : op->fields) {
@@ -49,23 +49,23 @@ Expr ExprMutator::VisitExpr_(const TupleNode* op, const Expr& e) {
   }
 
   if (all_fields_unchanged) {
-    return e;
+    return GetRef<Expr>(op);
   } else {
     return TupleNode::make(fields);
   }
 }
 
-Expr ExprMutator::VisitExpr_(const ParamNode* op, const Expr& e) {
+Expr ExprMutator::VisitExpr_(const ParamNode* op) {
   Var var = Downcast<Var>(this->Mutate(op->var));
   auto type = this->VisitType(op->type);
-  if (var == op->var && type == op->type) {
-    return e;
+  if (op->var.same_as(var) && op->type.same_as(type)) {
+    return GetRef<Expr>(op);
   } else {
     return ParamNode::make(var, type);
   }
 }
 
-Expr ExprMutator::VisitExpr_(const FunctionNode* op, const Expr& e) {
+Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
   tvm::Array<TypeParam> ty_params;
   bool all_ty_params_changed = true;
 
@@ -86,74 +86,82 @@ Expr ExprMutator::VisitExpr_(const FunctionNode* op, const Expr& e) {
   auto ret_type = this->VisitType(op->ret_type);
   auto body = this->Mutate(op->body);
 
-  if (ty_params.same_as(op->type_params) && params.same_as(op->params) &&
-      ret_type.same_as(op->ret_type) && body.same_as(op->body)) {
-    return e;
+  if (ty_params.same_as(op->type_params) &&
+      params.same_as(op->params) &&
+      ret_type.same_as(op->ret_type) &&
+      body.same_as(op->body)) {
+    return GetRef<Expr>(op);
   } else {
     return FunctionNode::make(params, ret_type, body, ty_params);
   }
 }
 
-Expr ExprMutator::VisitExpr_(const CallNode* call_node, const Expr& e) {
-  auto op = this->Mutate(call_node->op);
+Expr ExprMutator::VisitExpr_(const CallNode* call_node) {
+  auto new_op = this->Mutate(call_node->op);
+  bool unchanged = call_node->op.same_as(new_op);
 
   tvm::Array<Type> ty_args;
-  bool all_ty_args_unchanged = true;
   for (auto ty_arg : call_node->type_args) {
     auto new_ty_arg = this->VisitType(ty_arg);
     ty_args.push_back(new_ty_arg);
-    all_ty_args_unchanged &= new_ty_arg.same_as(ty_arg);
+    unchanged &= new_ty_arg.same_as(ty_arg);
   }
 
   tvm::Array<Expr> call_args;
-  bool all_args_unchanged = true;
   for (auto arg : call_node->args) {
     auto new_arg = this->Mutate(arg);
     call_args.push_back(new_arg);
-    all_args_unchanged &= new_arg.same_as(arg);
+    unchanged &= new_arg.same_as(arg);
   }
 
-  if (all_ty_args_unchanged && all_args_unchanged &&
-      call_node->op.same_as(op)) {
-    return e;
+  if (unchanged) {
+    return GetRef<Expr>(call_node);
   } else {
-    return CallNode::make(op, call_args, call_node->attrs, ty_args);
+    return CallNode::make(new_op, call_args, call_node->attrs, ty_args);
   }
 }
 
-Expr ExprMutator::VisitExpr_(const LetNode* op, const Expr& e) {
+Expr ExprMutator::VisitExpr_(const LetNode* op) {
   Var var = Downcast<Var>(this->Mutate(op->var));
   auto type = this->VisitType(op->value_type);
   auto value = this->Mutate(op->value);
   auto body = this->Mutate(op->body);
 
-  if (var.same_as(op->var) && type.same_as(op->value_type) &&
-      value.same_as(op->value) && body.same_as(op->body)) {
-    return e;
+  if (var.same_as(op->var) &&
+      type.same_as(op->value_type) &&
+      value.same_as(op->value) &&
+      body.same_as(op->body)) {
+    return GetRef<Expr>(op);
   } else {
     return LetNode::make(var, value, body, type);
   }
 }
 
-Expr ExprMutator::VisitExpr_(const IfNode* op, const Expr& e) {
+Expr ExprMutator::VisitExpr_(const IfNode* op) {
   auto guard = this->Mutate(op->cond);
   auto true_b = this->Mutate(op->true_branch);
   auto false_b = this->Mutate(op->false_branch);
-  if (op->cond == guard && true_b == op->true_branch &&
-      false_b == op->false_branch) {
-    return e;
+  if (op->cond.same_as(guard) &&
+      op->true_branch.same_as(true_b) &&
+      op->false_branch.same_as(false_b)) {
+    return GetRef<Expr>(op);;
   } else {
     return IfNode::make(guard, true_b, false_b);
   }
 }
 
-Type ExprMutator::VisitType(const Type& t) { return t; }
+Type ExprMutator::VisitType(const Type& t) {
+  return t;
+}
 
-void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) { return; }
+void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
+}
 
-void ExprVisitor::ExprVisitor::VisitExpr_(const GlobalVarNode* op) { return; }
+void ExprVisitor::ExprVisitor::VisitExpr_(const GlobalVarNode* op) {
+}
 
-void ExprVisitor::ExprVisitor::VisitExpr_(const ConstantNode* op) { return; }
+void ExprVisitor::ExprVisitor::VisitExpr_(const ConstantNode* op) {
+}
 
 void ExprVisitor::ExprVisitor::VisitExpr_(const TupleNode* op) {
   for (auto field : op->fields) {
@@ -202,4 +210,3 @@ void ExprVisitor::VisitType(const Type& t) { return; }
 
 }  // namespace relay
 }  // namespace tvm
-
diff --git a/src/relay/pass/type_visitor.h b/src/relay/pass/type_visitor.h
index 725e3d9b3846..c37b536ce0d0 100644
--- a/src/relay/pass/type_visitor.h
+++ b/src/relay/pass/type_visitor.h
@@ -78,7 +78,8 @@ struct TypeMutator : TypeFunctor<Type(const Type& n)> {
     Array<TypeConstraint> type_constraints;
     for (auto type_cs : op->type_constraints) {
       auto new_type_cs = VisitType(type_cs);
-      if (const TypeConstraintNode* tin = As<TypeConstraintNode>(new_type_cs)) {
+      if (const TypeConstraintNode* tin =
+          new_type_cs.as_derived<TypeConstraintNode>()) {
         type_constraints.push_back(GetRef<TypeConstraint>(tin));
       } else {
         CHECK(false) << new_type_cs << std::endl;
diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc
index 9cdfef7f6a01..dca76205d79f 100644
--- a/tests/cpp/expr_test.cc
+++ b/tests/cpp/expr_test.cc
@@ -20,7 +20,7 @@ TEST(ExprNodeRef, Basic) {
   Var x("x");
   Expr z = max(x + 1 + 2, 100);
   const ir::Max* op = z.as<ir::Max>();
-  CHECK(op->GetNodeRef().same_as(z));
+  CHECK(NodeRef(op->GetNodePtr()).same_as(z));
 }
 
 

From 133274ab1f20eee9df6f13eb1b4e62129b738fc0 Mon Sep 17 00:00:00 2001
From: hlu1 <14827759+hlu1@users.noreply.github.com>
Date: Fri, 21 Sep 2018 11:03:27 -0700
Subject: [PATCH 116/529] Fix Softmax in onnx frontend (#1642)

---
 nnvm/python/nnvm/frontend/onnx.py             |  16 ++-
 .../frontend/onnx/model_zoo/squeezenet.py     | 118 ++++++++++++++++++
 .../python/frontend/onnx/test_forward.py      |  32 ++++-
 nnvm/tests/python/frontend/onnx/test_graph.py |  11 +-
 4 files changed, 173 insertions(+), 4 deletions(-)
 create mode 100644 nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py
 mode change 100644 => 100755 nnvm/tests/python/frontend/onnx/test_graph.py

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 22602d7483f0..1584c960aeb4 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -597,6 +597,20 @@ def _impl_v1(cls, inputs, attr, params):
         attr = {'axis':axis, 'keepdims':keepdims}
         return AttrCvt(op_name='argmin')(inputs, attr)
 
+class Softmax(OnnxOpConverter):
+    """ Operator converter for Softmax.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        # set default value when axis is not set in the model
+        if 'axis' not in attr:
+            attr['axis'] = 1
+        return AttrCvt(
+            op_name='softmax',
+            transforms={
+                'axis': ('axis', 1),
+            })(inputs, attr, params)
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -664,7 +678,7 @@ def _get_convert_map(opset):
         'Mean': Mean.get_converter(opset),
         'Clip': AttrCvt('clip', transforms={'min': 'a_min', 'max': 'a_max'}),
         # softmax default axis is different in onnx
-        'Softmax': AttrCvt('softmax', {'axis': ('axis', 1)}),
+        'Softmax': Softmax.get_converter(opset),
         'LogSoftmax': AttrCvt('log_softmax', {'axis': ('axis', 1)}),
         # 'Hardmax'
         'Softsign': Softsign.get_converter(opset),
diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..2de2d1075494
--- /dev/null
+++ b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from nnvm import symbol as sym
+from nnvm.testing.utils import create_workload
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = sym.concatenate(left, right, axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                     padding=(padding, padding))
+    net = sym.relu(net)
+    return net
+
+# Net
+def get_symbol(num_classes, version, **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version == '1.1', ("Unsupported SqueezeNet version {version}:"
+                              "1.1 expected".format(version=version))
+    net = sym.Variable("data")
+
+    net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2))
+    net = sym.relu(net)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 16, 64, 64)
+    net = _make_fire(net, 16, 64, 64)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 32, 128, 128)
+    net = _make_fire(net, 32, 128, 128)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 64, 256, 256)
+    net = _make_fire(net, 64, 256, 256)
+
+    net = sym.dropout(net, rate=0.5)
+    net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1))
+    net = sym.relu(net)
+    net = sym.global_avg_pool2d(net)
+    return sym.softmax(net, axis=1)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.0',
+                 image_shape=(3, 224, 224), dtype="float32", **kwargs):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, version=version, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 7939796ae683..645174d04fe2 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -387,7 +387,7 @@ def _test_upsample_nearest():
     in_shape = (1, 1, 3, 3)
     out_shape = (1, 1, 3*scale, 3*scale)
     y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0])
-    
+
     in_array = np.random.uniform(size=in_shape).astype(np.float32)
     out_array = topi.testing.upsampling_python(in_array, scale, "NCHW")
 
@@ -407,7 +407,7 @@ def _test_upsample_bilinear():
     in_shape = (1, 1, 3, 3)
     out_shape = (1, 1, 3*scale, 3*scale)
     y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0])
-    
+
     in_array = np.random.uniform(size=in_shape).astype(np.float32)
     out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW")
 
@@ -426,6 +426,33 @@ def test_upsample():
     _test_upsample_nearest()
     _test_upsample_bilinear()
 
+def _test_softmax(inshape, axis):
+    opname = 'Softmax'
+    indata = np.random.uniform(size=inshape).astype(np.float32)
+    outshape = inshape
+    outdata = topi.testing.softmax_python(indata)
+    if isinstance(axis, int):
+        y = helper.make_node(opname, ['in'], ['out'], axis = axis)
+    elif axis is None:
+        y = helper.make_node(opname, ['in'], ['out'])
+
+    graph = helper.make_graph([y],
+                              opname+'_test',
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+
+    model = helper.make_model(graph, producer_name=opname+'_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outshape, 'float32')
+        np.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_softmax():
+    _test_softmax((1, 10), None)
+    _test_softmax((1, 10), 1)
+
 def verify_min(input_dim):
     dtype = 'float32'
 
@@ -676,3 +703,4 @@ def test_forward_arg_min_max():
     test_forward_mean()
     test_forward_hardsigmoid()
     test_forward_arg_min_max()
+    test_softmax()
diff --git a/nnvm/tests/python/frontend/onnx/test_graph.py b/nnvm/tests/python/frontend/onnx/test_graph.py
old mode 100644
new mode 100755
index 7fa705ef4c65..0aad9d22f1be
--- a/nnvm/tests/python/frontend/onnx/test_graph.py
+++ b/nnvm/tests/python/frontend/onnx/test_graph.py
@@ -3,6 +3,7 @@
 import onnx
 from nnvm.compiler import graph_util, graph_attr
 from model_zoo import super_resolution, super_resolution_sym
+from model_zoo import squeezenet as squeezenet
 
 def compare_graph(onnx_file, nnvm_sym, ishape):
     onnx_model = onnx.load(onnx_file)
@@ -18,8 +19,16 @@ def compare_graph(onnx_file, nnvm_sym, ishape):
     graph_util.check_graph_equal(g1, g2)
 
 def test_super_resolution_example():
-    fname, symbol = super_resolution, super_resolution_sym
+    fname, symbol = "super_resolution.onnx", super_resolution_sym
     compare_graph(fname, symbol, ishape=(1, 1, 224, 224))
 
+def test_squeeze_net():
+    # Only works for model downloaded from
+    # https://github.com/onnx/models/tree/master/squeezenet
+    fname = "squeezenet1_1.onnx"
+    symbol, params = squeezenet.get_workload(version='1.1')
+    compare_graph(fname, symbol, ishape=(1, 3, 224, 224))
+
 if __name__ == '__main__':
     test_super_resolution_example()
+    test_squeeze_net()

From ff696617c00f89f61e8401a84dcb65778c66eade Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Sat, 22 Sep 2018 03:41:39 +0300
Subject: [PATCH 117/529] [TVM] Fix operator!= for Tensor (#1753)

---
 include/tvm/tensor.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index 48d959301e63..e205f6b9ff5e 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -45,6 +45,12 @@ class Tensor : public NodeRef {
    * \return whether the two tensors equals each other.
    */
   inline bool operator==(const Tensor& other) const;
+  /*!
+   * \brief check if two tensors are different.
+   * \param other tensor to be checked.
+   * \return whether the two tensors are different.
+   */
+  inline bool operator!=(const Tensor& other) const;
   /*! \return The dimension of the tensor */
   inline size_t ndim() const;
   /*!
@@ -184,6 +190,10 @@ inline bool Tensor::operator==(const Tensor& other) const {
   }
 }
 
+inline bool Tensor::operator!=(const Tensor& other) const {
+  return !(*this == other);
+}
+
 // macro to turn every operation of slice to expression
 #define DEFINE_OVERLOAD_SLICE_UNARY_OP(Op)                              \
   inline Expr operator Op (const Tensor::Slice& a) {                    \

From ad394a589161bbe537d14b00fa3e7862f3b56870 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Fri, 21 Sep 2018 17:42:40 -0700
Subject: [PATCH 118/529] [RELAY] IR Wellform Checker (#1748)

---
 include/tvm/relay/error.h              | 12 +++--
 include/tvm/relay/expr.h               |  1 +
 include/tvm/relay/pass.h               | 12 +++++
 python/tvm/relay/_ir_pass.pyi          |  1 +
 python/tvm/relay/ir_pass.py            |  2 +
 src/relay/pass/well_formed.cc          | 61 ++++++++++++++++++++++++++
 tests/python/relay/test_relay_op.py    |  1 -
 tests/python/relay/test_well_formed.py | 18 ++++++++
 8 files changed, 100 insertions(+), 8 deletions(-)
 create mode 100644 src/relay/pass/well_formed.cc
 create mode 100644 tests/python/relay/test_well_formed.py

diff --git a/include/tvm/relay/error.h b/include/tvm/relay/error.h
index 8ce73a027ca0..1c2b90611bbd 100644
--- a/include/tvm/relay/error.h
+++ b/include/tvm/relay/error.h
@@ -12,21 +12,19 @@
 namespace tvm {
 namespace relay {
 
-struct Error : dmlc::Error {
+struct Error : public dmlc::Error {
   explicit Error(const std::string &msg) : dmlc::Error(msg) {}
 };
 
-struct InternalError : Error {
+struct InternalError : public Error {
   explicit InternalError(const std::string &msg) : Error(msg) {}
 };
 
-// TODO(@jroesch): we should change spanned errors to report
-// errors against the Environment, inverting control to error definition.
-struct FatalTypeError : dmlc::Error {
-  explicit FatalTypeError(const std::string &s) : dmlc::Error(s) {}
+struct FatalTypeError : public Error {
+  explicit FatalTypeError(const std::string &s) : Error(s) {}
 };
 
-struct TypecheckerError : public dmlc::Error {
+struct TypecheckerError : public Error {
   explicit TypecheckerError(const std::string &msg) : Error(msg) {}
 };
 
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 0dc2ff6fce2d..02753a76b0da 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -8,6 +8,7 @@
 
 #include <tvm/attrs.h>
 #include <string>
+#include <functional>
 #include "./base.h"
 #include "./type.h"
 
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index e956097780bb..b49d53815f62 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -80,6 +80,18 @@ bool AlphaEqual(const Expr& e1, const Expr& e2);
  */
 bool AlphaEqual(const Type& t1, const Type& t2);
 
+/*! brief Check that each Var is only bind once.
+ *
+ * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
+ *
+ * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice, although x is not shadowed.
+ *
+ * \param e the expression to check.
+ *
+ * \return true iff all Var in e is bind at most once.
+ */
+bool WellFormed(const Expr & e);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_H_
diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi
index 1bb42ab854c2..f321083aa443 100644
--- a/python/tvm/relay/_ir_pass.pyi
+++ b/python/tvm/relay/_ir_pass.pyi
@@ -4,3 +4,4 @@ from . import ir
 def check_expr(env: Environment, expr: ir.Expr) -> ir.Type: ...
 def generalize(env: Environment, expr: ir.Expr) -> ir.Expr: ...
 def _get_checked_type(expr: ir.Expr) -> ir.Type: ...
+def well_formed(expr: ir.Expr) -> bool: ...
\ No newline at end of file
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index bbc294b59f5b..84189c840d71 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -10,3 +10,5 @@
 # Expose checking expression, should rename to infer_type.
 # pylint: disable=invalid-name
 check_expr = _ir_pass.check_expr
+
+well_formed = _ir_pass.well_formed
diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc
new file mode 100644
index 000000000000..a9bce74926bf
--- /dev/null
+++ b/src/relay/pass/well_formed.cc
@@ -0,0 +1,61 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file well_formed.cc
+ * \brief check that expression is well formed.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <unordered_set>
+
+namespace tvm {
+namespace relay {
+
+struct NotWellFormed { };
+
+//! brief make sure each Var is bind at most once.
+class WellFormedChecker : private ExprVisitor {
+  bool well_formed = true;
+
+  std::unordered_set<Var, NodeHash, NodeEqual> s;
+
+  void Check(const Var & v) {
+    if (s.count(v) != 0) {
+      well_formed = false;
+    }
+    s.insert(v);
+  }
+
+  void VisitExpr_(const LetNode * l) final {
+    // we do letrec only for FunctionNode,
+    // but shadowing let in let binding is likely programming error, and we should forbidden it.
+    Check(l->var);
+    CheckWellFormed(l->value);
+    CheckWellFormed(l->body);
+  }
+
+  void VisitExpr_(const FunctionNode * f) final {
+    for (const Param & p : f->params) {
+      Check(p->var);
+    }
+    CheckWellFormed(f->body);
+  }
+
+ public:
+  bool CheckWellFormed(const Expr & e) {
+    this->VisitExpr(e);
+    return well_formed;
+  }
+};
+
+bool WellFormed(const Expr & e) {
+  return WellFormedChecker().CheckWellFormed(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.well_formed")
+  .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Expr e = args[0];
+      *ret = WellFormed(e);
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_relay_op.py b/tests/python/relay/test_relay_op.py
index 1f95a3f72c15..3b1d914fe02c 100644
--- a/tests/python/relay/test_relay_op.py
+++ b/tests/python/relay/test_relay_op.py
@@ -24,4 +24,3 @@ def test_op_level1():
 if __name__ == "__main__":
     test_op_attr()
     test_op_level1()
-
diff --git a/tests/python/relay/test_well_formed.py b/tests/python/relay/test_well_formed.py
new file mode 100644
index 000000000000..8bdef4d0edb5
--- /dev/null
+++ b/tests/python/relay/test_well_formed.py
@@ -0,0 +1,18 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import well_formed
+
+def test_well_formed():
+    x = relay.Var("x")
+    assert well_formed(x)
+    v = relay.Constant(tvm.nd.array(10))
+    ty = None
+    let = relay.Let(x, v, x, ty)
+    assert well_formed(let)
+    assert not well_formed(relay.Let(x, v, let, ty))
+    f = relay.Function([relay.Param(x, ty)], ty, x)
+    assert well_formed(f)
+    # this test should pass in case of weak uniqueness (only test for shadowing)
+    # but we want all binder to be distinct from each other.
+    assert not well_formed(relay.Let(relay.Var("y"), f,
+                                     relay.Let(relay.Var("z"), f, v, ty), ty))

From 6d44dbd426623add135e6d02ec22d21659c80dc7 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Fri, 21 Sep 2018 20:32:33 -0700
Subject: [PATCH 119/529] Heterogeneous Runtime (#1695)

---
 python/tvm/build_module.py                    |  32 +-
 python/tvm/contrib/graph_runtime.py           |  74 ++--
 src/runtime/graph/graph_runtime.cc            | 216 +++++++---
 .../unittest/test_runtime_heterogeneous.py    | 405 ++++++++++++++++++
 4 files changed, 626 insertions(+), 101 deletions(-)
 create mode 100644 tests/python/unittest/test_runtime_heterogeneous.py

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 70935cde1816..8e0d16286d6a 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -384,8 +384,14 @@ def build(sch,
           target=None,
           target_host=None,
           name="default_function",
-          binds=None):
-    """Build a function with arguments as signiture.
+          binds=None,
+          postpone_host_codegen=False):
+    """Build a function with arguments as signature. Code will be generated
+    for a device specified by the target. For homogeneous execution, a module
+    that contains both host and device code is returned. For heterogeneous
+    execution, a list of lowered functions for the host and a module containing
+    device code are returned, but actual code generation for the host module is
+    postponed after code generation is finished for all devices.
 
     Parameters
     ----------
@@ -414,10 +420,18 @@ def build(sch,
         Dictionary that maps the binding of symbolic buffer to Tensor.
         By default, a new buffer is created for each tensor in the argument.
 
+    postpone_host_codegen : bool, optional
+        A bool value that indicates if code generation for the host module
+        should be postponed. This variable is set to be true for heterogeneous
+        execution. Otherwise, it is defaulted to false.
+
     Returns
     -------
-    f : Function, or pair of functions
-       The result function.
+    ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple
+        A module that combines both host and device code is returned when
+        postpone_host_codegen is not set. Otherwise, a list of lowered
+        functions for the host and a module contains only device code are
+        returned.
 
     Note
     ----
@@ -498,9 +512,15 @@ def build(sch,
     fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice]
     fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost]
     fhost = [ir_pass.CombineContextCall(x) for x in fhost]
-    mhost = codegen.build_module(fhost, str(target_host))
 
+    # Append fhost to the device module and return the updated module. All
+    # device modules will be imported to the host module after all of them are
+    # collected.
+    mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
+    if postpone_host_codegen:
+        return fhost, mdev
+
+    mhost = codegen.build_module(fhost, str(target_host))
     if fdevice:
-        mdev = codegen.build_module(fdevice, str(target_device))
         mhost.import_module(mdev)
     return mhost
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index e49b966e6a1e..f0e83eec0bb8 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -3,26 +3,24 @@
 
 from .._ffi.base import string_types
 from .._ffi.function import get_global_func
+from .._ffi.runtime_ctypes import TVMContext
 from ..rpc import base as rpc_base
-from .. import ndarray as nd
-
 
 def create(graph_json_str, libmod, ctx):
     """Create a runtime executor module given a graph and module.
-
     Parameters
     ----------
     graph_json_str : str or graph class
         The graph to be deployed in json format output by nnvm graph.
         The graph can only contain one operator(tvm_op) that
         points to the name of PackedFunc in the libmod.
-
     libmod : tvm.Module
         The module of the corresponding function
-
-    ctx : TVMContext
-        The context to deploy the module, can be local or remote.
-
+    ctx : TVMContext or list of TVMContext
+        The context to deploy the module. It can be local or remote when there
+        is only one TVMContext. Otherwise, the first context in the list will
+        be used as this purpose. All context should be given for heterogeneous
+        execution.
     Returns
     -------
     graph_module : GraphModule
@@ -33,17 +31,42 @@ def create(graph_json_str, libmod, ctx):
             graph_json_str = graph_json_str._tvm_graph_json()
         except AttributeError:
             raise ValueError("Type %s is not supported" % type(graph_json_str))
-    device_type = ctx.device_type
-    device_id = ctx.device_id
-    if device_type >= rpc_base.RPC_SESS_MASK:
-        assert libmod.type_key == "rpc"
-        assert rpc_base._SessTableIndex(libmod) == ctx._rpc_sess._tbl_index
+    if isinstance(ctx, TVMContext):
+        ctx = [ctx]
+    elif not isinstance(ctx, (list, tuple)):
+        raise ValueError("ctx has to be the type of TVMContext or a list of "
+                         "TVMCTVMContext")
+    for cur_ctx in ctx:
+        if not isinstance(cur_ctx, TVMContext):
+            raise ValueError("ctx has to be the type of TVMContext or a list "
+                             "of TVMContext")
+
+    # device_type_id[0], device_type_id[1] are used as the primary/fallback
+    # context type and id. All other ones are used as device context for
+    # heterogeneous execution.
+    num_rpc_ctx = 0
+    device_type_id = []
+    for cur_ctx in ctx:
+        device_type = cur_ctx.device_type
+        if device_type >= rpc_base.RPC_SESS_MASK:
+            assert libmod.type_key == "rpc"
+            assert rpc_base._SessTableIndex(
+                libmod) == cur_ctx._rpc_sess._tbl_index
+            num_rpc_ctx += 1
+            device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK
+        device_type_id.append(device_type)
+        device_type_id.append(cur_ctx.device_id)
+
+    if 0 < num_rpc_ctx < len(ctx):
+        raise ValueError("Either all or none of the contexts should be rpc.")
+
+    if num_rpc_ctx == len(ctx):
         hmod = rpc_base._ModuleHandle(libmod)
-        fcreate = ctx._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        device_type = device_type % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, device_type, device_id), ctx)
+        fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
+        return GraphModule(fcreate(graph_json_str, hmod, *device_type_id))
+
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, device_type, device_id), ctx)
+    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
 
 
 class GraphModule(object):
@@ -58,18 +81,13 @@ class GraphModule(object):
     module : Module
         The interal tvm module that holds the actual graph functions.
 
-    ctx : TVMContext
-        The context this module is under
-
     Attributes
     ----------
     module : Module
         The interal tvm module that holds the actual graph functions.
-
-    ctx : TVMContext
-        The context this module is under
     """
-    def __init__(self, module, ctx):
+
+    def __init__(self, module):
         self.module = module
         self._set_input = module["set_input"]
         self._run = module["run"]
@@ -81,7 +99,6 @@ def __init__(self, module, ctx):
         except AttributeError:
             pass
         self._load_params = module["load_params"]
-        self.ctx = ctx
 
     def set_input(self, key=None, value=None, **params):
         """Set inputs to the module via kwargs
@@ -98,14 +115,14 @@ def set_input(self, key=None, value=None, **params):
            Additonal arguments
         """
         if key:
-            self._set_input(key, nd.array(value, ctx=self.ctx))
+            self._get_input(key).copyfrom(value)
 
         if params:
             # upload big arrays first to avoid memory issue in rpc mode
             keys = list(params.keys())
             keys.sort(key=lambda x: -np.prod(params[x].shape))
             for k in keys:
-                self._set_input(k, nd.array(params[k], ctx=self.ctx))
+                self._get_input(k).copyfrom(params[k])
 
     def run(self, **input_dict):
         """Run forward execution of the graph
@@ -177,7 +194,8 @@ def debug_get_output(self, node, out):
         if hasattr(self, '_debug_get_output'):
             self._debug_get_output(node, out)
         else:
-            raise RuntimeError("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
+            raise RuntimeError(
+                "Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
         return out
 
     def load_params(self, params_bytes):
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 162d616dea8a..a48047fe369c 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -2,22 +2,26 @@
  *  Copyright (c) 2017 by Contributors
  * \file graph_runtime.cc
  */
+#include "graph_runtime.h"
+
+#include <dlpack/dlpack.h>
+#include <dmlc/json.h>
+#include <dmlc/memory_io.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/device_api.h>
-#include <dmlc/memory_io.h>
-#include <dmlc/json.h>
-#include <numeric>
+#include <tvm/runtime/serializer.h>
+
 #include <algorithm>
-#include <vector>
 #include <functional>
-#include "graph_runtime.h"
+#include <numeric>
+#include <vector>
 
 namespace tvm {
 namespace runtime {
 
-/*! \brief macro to do C API call */
+/*! \brief Macro to do C API call. */
 #define TVM_CCALL(func)                                            \
   {                                                                \
     int ret = (func);                                              \
@@ -34,7 +38,7 @@ namespace runtime {
 class GraphRuntime : public ModuleNode {
  public:
   /*!
-   * \brief Get member function to front-end
+   * \brief Get member function to front-end.
    * \param name The name of the function.
    * \param sptr_to_self The pointer to the module node.
    * \return The corresponding member function.
@@ -58,12 +62,13 @@ class GraphRuntime : public ModuleNode {
   /*!
    * \brief Initialize the graph executor with graph and context.
    * \param graph_json The execution graph.
-   * \param module The module containing the compiled functions.
-   * \param ctx The context where the graph should sit on
+   * \param module The module containing the compiled functions for the host
+   * processor.
+   * \param ctxs The context of the host and devices where graph nodes will be
+   * executed on.
    */
-  void Init(const std::string& graph_json,
-            tvm::runtime::Module module,
-            TVMContext ctx) {
+  void Init(const std::string& graph_json, const tvm::runtime::Module& module,
+            const std::vector<TVMContext>& ctxs) {
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
     std::istringstream is(graph_json);
 #else
@@ -72,10 +77,11 @@ class GraphRuntime : public ModuleNode {
     dmlc::JSONReader reader(&is);
     this->Load(&reader);
     module_ = module;
-    ctx_ = ctx;
+    ctxs_ = ctxs;
     this->SetupStorage();
     this->SetupOpExecs();
   }
+
   /*!
    * \brief Get the input index given the name of input.
    * \param name The name of the input.
@@ -92,7 +98,7 @@ class GraphRuntime : public ModuleNode {
     return -1;
   }
   /*!
-   * \brief set index-th input to the graph.
+   * \brief Set index-th input to the graph.
    * \param index The input index.
    * \param data_in The input data.
    */
@@ -134,7 +140,7 @@ class GraphRuntime : public ModuleNode {
   /*!
    * \brief Copy index-th output to data_out.
    * \param index The output index.
-   * \param data_out the output data.
+   * \param data_out The output data.
    */
   void CopyOutputTo(int index, DLTensor* data_out) {
     CHECK_LT(static_cast<size_t>(index), outputs_.size());
@@ -172,8 +178,8 @@ class GraphRuntime : public ModuleNode {
    * from begining upto the index-th node and return output of index-th node.
    * This is costly operation and suggest to use only for debug porpose.
    *
-   * \param index: The  index of the node.
-   * \param data_out the node data.
+   * \param index The index of the node.
+   * \param data_out The node data.
    */
   void DebugGetNodeOutput(int index, DLTensor* data_out) {
     CHECK_LT(static_cast<size_t>(index), nodes_.size());
@@ -188,7 +194,7 @@ class GraphRuntime : public ModuleNode {
   }
 #endif
   /*!
-   * \brief Load parameters from binary stream
+   * \brief Load parameters from binary stream.
    * \param strm The input stream.
    */
   void LoadParams(dmlc::Stream* strm);
@@ -202,6 +208,12 @@ class GraphRuntime : public ModuleNode {
   }
 
  private:
+  // Memory pool entry.
+  struct PoolEntry {
+    size_t size;
+    int device_type;
+    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+  };
   // Node entry
   struct NodeEntry {
     uint32_t node_id;
@@ -260,7 +272,6 @@ class GraphRuntime : public ModuleNode {
     // JSON Loader
     void Load(dmlc::JSONReader *reader) {
       reader->BeginObject();
-      std::unordered_map<std::string, std::string> dict;
       int bitmask = 0;
       std::string key;
       while (reader->NextObjectItem(&key)) {
@@ -287,6 +298,7 @@ class GraphRuntime : public ModuleNode {
   struct GraphAttr {
     size_t storage_num_not_alloctaed{0};
     std::vector<int> storage_id;
+    std::vector<int> device_index;
     std::vector<std::string> dltype;
     std::vector<std::vector<int64_t> > shape;
     // The graph attribute fields.
@@ -322,6 +334,14 @@ class GraphRuntime : public ModuleNode {
           reader->Read(&shape);
           CHECK(!reader->NextArrayItem());
           bitmask |= 4;
+        } else if (key == "device_index") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&device_index);
+          CHECK(!reader->NextArrayItem());
         } else {
           reader->BeginArray();
           CHECK(reader->NextArrayItem());
@@ -372,13 +392,14 @@ class GraphRuntime : public ModuleNode {
   }
   /*! \brief Setup the temporal storage */
   void SetupStorage();
-  /*! \brief Setup the executors */
+  /*! \brief Setup the executors. */
   void SetupOpExecs();
   /*!
    * \brief Create a executtion function given input.
-   * \param attrs The node attributes
+   * \param attrs The node attributes.
    * \param args The arguments to the functor, including inputs and outputs.
-   * \param num_inputs Number of inputs
+   * \param num_inputs Number of inputs.
+   * \param dev_type The device type of the tvm_op.
    * \return The created executor.
    */
   std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
@@ -392,7 +413,7 @@ class GraphRuntime : public ModuleNode {
   uint32_t entry_id(const NodeEntry& e) const {
     return entry_id(e.node_id, e.index);
   }
-  // Number of node entries
+  // Number of node entries.
   uint32_t num_node_entries() const {
     return node_row_ptr_.back();
   }
@@ -400,25 +421,25 @@ class GraphRuntime : public ModuleNode {
   uint32_t num_nodes() const {
     return static_cast<uint32_t>(nodes_.size());
   }
-  // The graph nodes.
+  /*! \brief The graph nodes. */
   std::vector<Node> nodes_;
-  // The argument nodes.
+  /*! \brief The argument nodes. */
   std::vector<uint32_t> input_nodes_;
-  // used or quick entry indexing
+  /*! \brief Used for quick entry indexing. */
   std::vector<uint32_t> node_row_ptr_;
-  // output entries
+  /*! \brief Output entries. */
   std::vector<NodeEntry> outputs_;
-  // Additional graph attributes
+  /*! \brief Additional graph attributes. */
   GraphAttr attrs_;
-  /*! \brief The code module */
+  /*! \brief The code module that contains both host and device code. */
   tvm::runtime::Module module_;
-  /*! \brief execution context */
-  TVMContext ctx_;
-  /*! \brief common storage pool */
+  /*! \brief Execution context of all devices including the host. */
+  std::vector<TVMContext> ctxs_;
+  /*! \brief Common storage pool for all devices. */
   std::vector<NDArray> storage_pool_;
-  /*! \brief data entry of each node */
+  /*! \brief Data entry of each node. */
   std::vector<NDArray> data_entry_;
-  /*! \brief operator on each node */
+  /*! \brief Operator on each node. */
   std::vector<std::function<void()> > op_execs_;
 };
 
@@ -458,12 +479,17 @@ void GraphRuntime::SetupStorage() {
   for (const std::string& s_type : attrs_.dltype) {
     vtype.push_back(tvm::runtime::String2TVMType(s_type));
   }
-  data_entry_.resize(num_node_entries());
-  // size of each storage pool entry
-  std::vector<size_t> pool_entry_bytes;
+
+  // Size and device type of each storage pool entry.
+  std::vector<PoolEntry> pool_entry;
   // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    // Use the fallback device if no device index is available.
+    int device_type = static_cast<int>(ctxs_[0].device_type);
+    if (!attrs_.device_index.empty()) {
+      device_type = attrs_.device_index[i];
+    }
     size_t size = 1;
     for (int64_t sz : attrs_.shape[i]) {
       size *= static_cast<size_t>(sz);
@@ -474,23 +500,42 @@ void GraphRuntime::SetupStorage() {
     CHECK_EQ(bits % 8U, 0U);
     size_t bytes = (bits / 8U) * size;
 
-    size_t sid = static_cast<size_t>(storage_id);
-    if (sid >= pool_entry_bytes.size()) {
-      pool_entry_bytes.resize(sid + 1, 0);
+    uint32_t sid = static_cast<uint32_t>(storage_id);
+    if (sid >= pool_entry.size()) {
+      pool_entry.resize(sid + 1, {0, -1});
+    } else {
+      CHECK(pool_entry[sid].device_type == -1 ||
+            pool_entry[sid].device_type == device_type)
+          << "The same pool entry cannot be assigned to multiple devices";
     }
-    pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes);
+    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
+    pool_entry[sid].device_type = device_type;
   }
+
   // Allocate the space.
-  for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
+  for (const auto& pit : pool_entry) {
     std::vector<int64_t> shape;
-    shape.push_back(static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4);
-    storage_pool_.push_back(NDArray::Empty(shape, DLDataType {kDLFloat, 32, 1}, ctx_));
+    // This for loop is very fast since there are usually only a couple of
+    // devices available on the same hardware.
+    const auto& cit =
+        std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) {
+          return pit.device_type == static_cast<int>(c.device_type);
+        });
+    TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit;
+    shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
+    storage_pool_.push_back(
+        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
   }
-  // Assign the pooled entries.
+
+  // Assign the pooled entries. A unified memory pool is used to simplifiy
+  // memory assignment for each node entry. The allocated memory on each device
+  // is mapped to this pool.
+  data_entry_.resize(num_node_entries());
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     CHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    data_entry_[i] =
+        storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
   }
 }
 
@@ -508,8 +553,8 @@ void GraphRuntime::SetupOpExecs() {
       uint32_t eid = this->entry_id(nid, index);
       args.push_back(*(data_entry_[eid].operator->()));
     }
-    CHECK_EQ(inode.op_type, "tvm_op")
-        << "Can only take tvm_op as op";
+    CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op";
+
     op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size());
   }
 }
@@ -543,13 +588,26 @@ std::function<void()> GraphRuntime::CreateTVMOp(
       t->shape = &(arg_ptr->shape_data[i]);
     }
   }
+
   if (param.func_name == "__nop") {
     return [](){};
+  } else if (param.func_name == "__copy") {
+    // Perform cross device data copy.
+    // Directly copy data from the input to the output.
+    auto fexec = [arg_ptr]() {
+      DLTensor* from = static_cast<DLTensor*>(arg_ptr->arg_values[0].v_handle);
+      DLTensor* to = static_cast<DLTensor*>(arg_ptr->arg_values[1].v_handle);
+      TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr));
+    };
+    return fexec;
   }
-  // get compiled function from module.
+
+  // Get compiled function from the module that contains both host and device
+  // code.
   tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false);
   CHECK(pf != nullptr) << "no such function in module: " << param.func_name;
-  auto fexec = [arg_ptr, pf] () {
+
+  auto fexec = [arg_ptr, pf]() {
     TVMRetValue rv;
     TVMArgs targs(arg_ptr->arg_values.data(),
                   arg_ptr->arg_tcodes.data(),
@@ -562,7 +620,7 @@ std::function<void()> GraphRuntime::CreateTVMOp(
 PackedFunc GraphRuntime::GetFunction(
     const std::string& name,
     const std::shared_ptr<ModuleNode>& sptr_to_self) {
-  // return member functions during query.
+  // Return member functions during query.
   if (name == "set_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         if (args[0].type_code() == kStr) {
@@ -618,29 +676,53 @@ PackedFunc GraphRuntime::GetFunction(
   }
 }
 
-Module GraphRuntimeCreate(std::string sym_json,
-                          tvm::runtime::Module m,
-                          int device_type,
-                          int device_id) {
-  TVMContext ctx;
-  ctx.device_type = static_cast<DLDeviceType>(device_type);
-  ctx.device_id   = device_id;
+Module GraphRuntimeCreate(const std::string& sym_json,
+                          const tvm::runtime::Module& m,
+                          const std::vector<TVMContext>& ctxs) {
   std::shared_ptr<GraphRuntime> exec = std::make_shared<GraphRuntime>();
-  exec->Init(sym_json, m, ctx);
+  exec->Init(sym_json, m, ctxs);
   return Module(exec);
 }
 
+// Get all context for the host and other runtime devices.
+std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
+  // Reserve the first item as the fallback device.
+  std::vector<TVMContext> ret;
+  TVMContext ctx;
+  for (int i = 2; i < args.num_args; i += 2) {
+    int dev_type = args[i];
+    ctx.device_type = static_cast<DLDeviceType>(dev_type);
+    ctx.device_id = args[i + 1];
+    ret.push_back(ctx);
+  }
+  return ret;
+}
+
+// 4-argument version is currently reserved to keep support of calling
+// from tvm4j and javascript, since they don't have heterogeneous
+// execution support yet. For heterogenenous execution, at least 5 arguments will
+// be passed in. The third one is the number of devices.
+// Eventually, we will only probably pass TVMContext for all the languages.
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-    *rv = GraphRuntimeCreate(args[0], args[1], args[2], args[3]);
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4)
+        << "The expected number of arguments for graph_runtime.create is "
+           "at least 4, but it has "
+        << args.num_args;
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(args[0], args[1], contexts);
   });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                  "graph_runtime.remote_create is "
+                                  "at least 4, but it has "
+                               << args.num_args;
     void* mhandle = args[1];
-    *rv = GraphRuntimeCreate(args[0],
-                             *static_cast<tvm::runtime::Module*>(mhandle),
-                             args[2], args[3]);
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(
+        args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
   });
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
new file mode 100644
index 000000000000..b916ee285717
--- /dev/null
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -0,0 +1,405 @@
+# pylint: disable=too-many-locals
+"""Unit tests for heterogeneous runtime"""
+import json
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime, util
+import topi
+
+def get_simplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where only one copy node is
+    inserted. This node copies data from the target device to cpu.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy      C
+                       \      /
+                     elemwise_sub  (cpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
+    elemwise_add = {
+        "op": "tvm_op", "name": "elemwise_add",
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy = {
+        "op": "tvm_op",
+        "name": "__copy_add_to_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
+    arg_nodes = [0, 1, 4]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
+    heads = [[5, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type, host_dev_type,
+                                      host_dev_type, host_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_simplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring is from the target device to the host processor at runtime.
+    The host processor is always assumed to be cpu, and the device varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Create module for add whose target is the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                   + tensor_b(*i), name="elemwise_add")
+        target = topi.cpp.TEST_create_target(device)
+        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
+        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
+                              name="elemwise_add")
+        host_funcs_add, lib_add = tvm.build(lower_add, target=target_device,
+                                            name="elemwise_add",
+                                            postpone_host_codegen=True)
+
+        # Insert copy. Neither compute nor schedule is required for the copy
+        # node. The compute will be performed at runtime which is just data
+        # copy from the input to the output.
+        tensor_copy = tvm.placeholder(shape, name="__copy")
+
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        schedule_sub = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+
+        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
+                                            name="elemwise_sub",
+                                            postpone_host_codegen=True)
+        host_funcs = host_funcs_add + host_funcs_sub
+        mhost = tvm.codegen.build_module(host_funcs, target_host)
+        if lib_add:
+            mhost.import_module(lib_add)
+        if lib_sub:
+            mhost.import_module(lib_sub)
+
+        ctx = [host_ctx, device_ctx]
+        mod = graph_runtime.create(graph, mhost, ctx)
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        mod.set_input(**params)
+        mod.run()
+        out = mod.get_output(0, tvm.nd.empty(shape))
+        np.testing.assert_equal(
+            out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
+
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+
+def get_duplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where two copy nodes are inserted.
+    Data transferring happens back-and-forth between the target device and CPU.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy        C
+                       \        /
+                      elemwise_sub  (cpu)
+                         \
+                         copy          D
+                           \          /
+                           elemwise_add  (gpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
+    elemwise_add0 = {
+        "op": "tvm_op", "name": "elemwise_add0",
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add0",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy_add_sub = {
+        "op": "tvm_op",
+        "name": "__copy_add_to_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+    copy_sub_add = {
+        "op": "tvm_op",
+        "name": "__copy_sub_to_add",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[5, 0, 0]]
+    }
+    var_d = {"op": "null", "name": "D", "inputs": []}
+    elemwise_add1 = {
+        "op": "tvm_op", "name": "elemwise_add1",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_add1",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[6, 0, 0], [7, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add0, copy_add_sub, var_c, elemwise_sub,
+             copy_sub_add, var_d, elemwise_add1]
+    arg_nodes = [0, 1, 4, 7]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    heads = [[8, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape,
+                                 shape, shape, shape]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type,
+                                      host_dev_type, host_dev_type, host_dev_type,
+                                      device_dev_type, device_dev_type,
+                                      device_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_duplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring occurs back-and-forth between the target device and host
+    processor.
+    The host processor is always assumed to be cpu, and the target device
+    varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Insert copy nodes for data transferring between add and sub nodes.
+        # Transfers data from gpu to cpu.
+        copy_add_sub = tvm.placeholder(shape, name="__copy0")
+        # Transfers data from cpu to gpu.
+        copy_sub_add = tvm.placeholder(shape, name="__copy1")
+
+        # Create a module containing adds on the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        tensor_d = tvm.placeholder(shape, name="D")
+        elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                    + tensor_b(*i), name="elemwise_add0")
+        elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i)
+                                    + tensor_d(*i), name="elemwise_add1")
+        target = topi.cpp.TEST_create_target(device)
+        add_schedule0 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add0])
+        lower_add0 = tvm.lower(
+            add_schedule0, [tensor_a, tensor_b, elemwise_add0],
+            name="elemwise_add0")
+        add_schedule1 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add1])
+        lower_add1 = tvm.lower(
+            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
+            name="elemwise_add1")
+        host_funcs_add, lib_add = tvm.build([lower_add0, lower_add1],
+                                            target=target_device,
+                                            postpone_host_codegen=True)
+
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        sub_schedule = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
+                                            postpone_host_codegen=True)
+        host_funcs = host_funcs_add + host_funcs_sub
+        mhost = tvm.codegen.build_module(host_funcs, target_host)
+        if lib_add:
+            mhost.import_module(lib_add)
+        if lib_sub:
+            mhost.import_module(lib_sub)
+
+        ctx = [host_ctx, device_ctx]
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        params["D"] = tensor_d = np.random.uniform(
+            size=shape).astype(tensor_d.dtype)
+
+        def check_verify():
+            mod = graph_runtime.create(graph, mhost, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        def check_load_module():
+            temp = util.tempdir()
+            path_lib = temp.relpath("deploy.so")
+            mhost.export_library(path_lib)
+            with open(temp.relpath("deploy.json"), "w") as out_file:
+                out_file.write(graph)
+            loaded_lib = tvm.module.load(path_lib)
+            loaded_graph = open(temp.relpath("deploy.json")).read()
+            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        check_verify()
+        check_load_module()
+
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+if __name__ == "__main__":
+    test_simplex_data_transferring()
+    test_duplex_data_transferring()

From 4664b482e531c9107e2562d666c0e9293661b084 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sat, 22 Sep 2018 23:00:41 +0530
Subject: [PATCH 120/529] [NNVM]Keras SimpleRnn and GRU support (#1729)

---
 nnvm/python/nnvm/frontend/keras.py            | 98 +++++++++++++++++--
 .../python/frontend/keras/test_forward.py     | 54 ++++++++++
 2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 07f1ce5024e2..a1e089b210c5 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -28,6 +28,10 @@ def _get_elu(insym, alpha):
     """
     return -alpha * _sym.relu(1 - _sym.exp(insym)) + _sym.relu(insym)
 
+def _convert_recurrent_activation(insym, keras_layer):
+    act_type = keras_layer.recurrent_activation.__name__
+    return _convert_activation(insym, act_type, None)
+
 def _convert_activation(insym, keras_layer, _):
     if isinstance(keras_layer, str):
         act_type = keras_layer
@@ -420,16 +424,96 @@ def _convert_lstm(insym, keras_layer, symtab):
     ixh2 = _sym.dense(in_state_h, recurrent_wt, in_bias, use_bias=True, units=units)
     gate = ixh1 + ixh2
     gates = _sym.split(gate, indices_or_sections=4, axis=1)
-    in_gate = _sym.sigmoid(gates[0])
-    in_transform = _sym.sigmoid(gates[1])
-    next_c = in_transform * in_state_c + in_gate * _sym.tanh(gates[2])
-    out_gate = _sym.sigmoid(gates[3])
-    next_h = out_gate * _sym.tanh(next_c)
+    in_gate = _convert_recurrent_activation(gates[0], keras_layer)
+    in_transform = _convert_recurrent_activation(gates[1], keras_layer)
+    next_c = in_transform * in_state_c + in_gate * _convert_activation(gates[2], keras_layer, None)
+    out_gate = _convert_recurrent_activation(gates[3], keras_layer)
+    next_h = out_gate * _convert_activation(next_c, keras_layer, None)
 
     out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
     out = _sym.reshape(next_h, shape=out_shape)
     return [out, next_h, next_c]
 
+def _convert_simple_rnn(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        prev_sym = symtab.new_const(buffer)
+        insym = [insym, prev_sym]
+    in_data = insym[0]
+    prev_sym = insym[1]
+
+    weightList = keras_layer.get_weights()
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    ixh = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units)
+    prev_sym = _sym.flatten(prev_sym)
+    ixh2 = _sym.dense(prev_sym, recurrent_wt, use_bias=False, units=units)
+    output = ixh + ixh2
+    output = _convert_activation(output, keras_layer, None)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    output = _sym.reshape(output, shape=out_shape)
+
+    return [output, output]
+
+def _convert_gru(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        h_tm1 = symtab.new_const(buffer)
+        insym = [insym, h_tm1]
+    in_data = insym[0]
+    h_tm1_sym = insym[1]
+
+    weightList = keras_layer.get_weights()
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    matrix_x = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units)
+
+    # inputs projected by all gate matrices at once
+    split_indices = [keras_layer.units, 2 * keras_layer.units]
+    gates = _sym.split(matrix_x, indices_or_sections=split_indices, axis=1)
+    x_z = gates[0]
+    x_r = gates[1]
+    x_h = gates[2]
+
+    # hidden state projected separately for update/reset and new
+    units = 2 * keras_layer.units
+    split_indices = [units]
+    rec_wts = _sym.split(recurrent_wt, indices_or_sections=split_indices, axis=0)
+
+    h_tm1_sym = _sym.flatten(h_tm1_sym)
+    matrix_inner = _sym.dense(h_tm1_sym, rec_wts[0], use_bias=False, units=units)
+
+    split_indices = [keras_layer.units]
+    recurrent = _sym.split(matrix_inner, indices_or_sections=split_indices, axis=1)
+    recurrent_z = recurrent[0]
+    recurrent_r = recurrent[1]
+
+    rec_act_z = _convert_recurrent_activation(x_z + recurrent_z, keras_layer)
+    rec_act_r = _convert_recurrent_activation(x_r + recurrent_r, keras_layer)
+
+    units = keras_layer.units
+    recurrent_h = _sym.dense(rec_act_r * h_tm1_sym, rec_wts[1], use_bias=False, units=units)
+    act_hh = _convert_activation(x_h + recurrent_h, keras_layer, None)
+
+    # previous and candidate state mixed by update gate
+    output = rec_act_z * h_tm1_sym + (1 - rec_act_z) * act_hh
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    output = _sym.reshape(output, shape=out_shape)
+    return [output, output]
+
 def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     """Layers that can be skipped because they are train time only."""
     return insym
@@ -475,9 +559,9 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     # 'UpSampling3D'           : _convert_upsample,
     # 'Conv1D'                 : _convert_convolution1d,
 
-    # 'GRU'                    : _convert_gru,
+    'SimpleRNN'                : _convert_simple_rnn,
     'LSTM'                     : _convert_lstm,
-    # 'SimpleRNN'              : _convert_simple_rnn,
+    'GRU'                      : _convert_gru,
     # 'Bidirectional'          : _convert_bidirectional,
     # 'TimeDistributed'        : _default_skip,
 
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 459be8737658..2e1c378d27cd 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -254,6 +254,58 @@ def test_forward_LSTM():
     _test_LSTM(4, 4, return_state=False)
     _test_LSTM_MultiLayer(4, 4)
 
+def _test_RNN(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    rnn_out = keras.layers.SimpleRNN(units, return_state=True,
+                                 activation='tanh')
+    x = rnn_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_RNN_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.SimpleRNN(units, return_state=True, return_sequences=True,
+                                   activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.SimpleRNN(units, activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_RNN():
+    _test_RNN(2, 4)
+    _test_RNN(4, 3)
+    _test_RNN_MultiLayer(4, 12)
+
+def _test_GRU(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    gru_out = keras.layers.GRU(units,
+                               return_state=True,
+                               recurrent_activation='sigmoid',
+                               activation='tanh')
+    x = gru_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_GRU_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.GRU(units,
+                             return_state=True,
+                             return_sequences=True,
+                             recurrent_activation='sigmoid',
+                             activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.GRU(units, recurrent_activation='sigmoid',
+                              activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_GRU():
+    _test_GRU(2, 4)
+    _test_GRU(4, 3)
+    _test_GRU_MultiLayer(4, 4)
+
 if __name__ == '__main__':
     test_forward_elemwise_add()
     test_forward_activations()
@@ -272,3 +324,5 @@ def test_forward_LSTM():
     test_forward_multi_outputs()
     test_forward_reuse_layers()
     test_forward_LSTM()
+    test_forward_RNN()
+    test_forward_GRU()

From 934c60b61c28971c759f88bb056a22c363800f53 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 23 Sep 2018 09:55:06 +0530
Subject: [PATCH 121/529] [DOCKER] Golang CI recipe. (#1759)

---
 docker/Dockerfile.ci_cpu                | 4 ++++
 docker/README.md                        | 6 ++++++
 docker/install/ubuntu_install_golang.sh | 4 ++++
 3 files changed, 14 insertions(+)
 create mode 100644 docker/install/ubuntu_install_golang.sh

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 0f0fc6f04d4c..a0313a566b11 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -18,3 +18,7 @@ RUN bash /install/ubuntu_install_llvm.sh
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
+
+# Golang environment
+COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
+RUN bash /install/ubuntu_install_golang.sh
diff --git a/docker/README.md b/docker/README.md
index e9b8b503062f..213c84cc6e5e 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -85,3 +85,9 @@ Here are some common use examples to perform CI tasks.
   ```bash
   ./docker/ci_build.sh ci_gpu make -C docs html
   ```
+
+- build golang test suite.
+
+  ```bash
+  ./docker/build.sh ci_cpu make -C golang tests
+  ```
diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh
new file mode 100644
index 000000000000..9585824091a7
--- /dev/null
+++ b/docker/install/ubuntu_install_golang.sh
@@ -0,0 +1,4 @@
+#install the necessary dependancies for golang build
+apt-get update && apt-get install -y golang-go
+apt-get update && apt-get install -y godoc
+apt-get update && apt-get install -y golint

From ee5550fe544eff1c7080d4441f479c02f41c730b Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Sun, 23 Sep 2018 17:23:52 -0700
Subject: [PATCH 122/529] fix buffer elem_offset calculation (#1762)

---
 src/lang/buffer.cc                        | 14 +++++---------
 tests/python/unittest/test_lang_buffer.py |  9 +++++++++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc
index cb3194f8eb1d..69967c55a7ff 100644
--- a/src/lang/buffer.cc
+++ b/src/lang/buffer.cc
@@ -226,16 +226,12 @@ inline Expr ElemOffset(const BufferNode* n, Array<Expr> index) {
   Expr base = n->elem_offset;
   if (n->strides.size() == 0) {
     CHECK_EQ(n->shape.size(), index.size());
-    if (n->shape.size() != 0) {
-      if (is_zero(base)) {
-        base = index[0];
-      } else {
-        base = base + index[0];
+    if (index.size() > 0) {
+      Expr offset = index[0];
+      for (size_t i = 1; i < index.size(); ++i) {
+        offset = MergeMulMod(offset * n->shape[i] + index[i]);
       }
-    }
-    base = MergeMulMod(base);
-    for (size_t i = 1; i < index.size(); ++i) {
-      base = MergeMulMod(base * n->shape[i] + index[i]);
+      base = base + offset;
     }
   } else {
     CHECK_EQ(n->strides.size(), index.size());
diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py
index a5a8f5d065a6..51f1e3abb7e9 100644
--- a/tests/python/unittest/test_lang_buffer.py
+++ b/tests/python/unittest/test_lang_buffer.py
@@ -41,6 +41,14 @@ def test_buffer_access_ptr_offset():
     assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v))
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
 
+def test_buffer_vload():
+    m = tvm.var('m')
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100)
+    load = Ab.vload([2, 3])
+    offset = tvm.ir_pass.Simplify(load.index)
+    assert tvm.ir_pass.Equal(offset, n * 2 + 103)
+
 def test_buffer_index_merge_mult_mod():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -76,4 +84,5 @@ def assert_simplified_equal(index_simplified, index_direct):
     test_buffer()
     test_buffer_access_ptr()
     test_buffer_access_ptr_offset()
+    test_buffer_vload()
     test_buffer_index_merge_mult_mod()

From ee8911f2eee407f9f15b43a3209a983fcdeb8415 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Mon, 24 Sep 2018 07:25:01 +0530
Subject: [PATCH 123/529] [FRONTEND][TENSORFLOW] NCHW layout support (Resnet
 V1/V2). (#1743)

---
 nnvm/python/nnvm/frontend/tensorflow.py       |  70 +++++--
 .../frontend/tensorflow/test_forward.py       | 193 +++++++++++-------
 2 files changed, 174 insertions(+), 89 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 9c9fac897ab2..3bd3ee079ee0 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -110,11 +110,6 @@ def _elemwise(name):
     def _impl(inputs, attr, *args):
         assert len(inputs) == 2, "Math op take 2 inputs, {} given".format(len(inputs))
         op_name = _math_name_picker(name)(attr)
-        axis = int(attr.get('axis', 0))
-        conv_ops = ["conv2d", "conv2d_transpose"]
-        if op_name == 'broadcast_add' and inputs[0].attr('op_name') in conv_ops:
-            # TODO: remove hard coded infershape
-            inputs[1] = _sym.expand_dims(inputs[1], axis=axis, num_newaxis=2)
         return get_nnvm_op(op_name)(*inputs)
     return _impl
 
@@ -128,8 +123,10 @@ def _impl(inputs, attr, params):
 
         if attr['data_format'] == 'NHWC':
             attr['kernel_shape'] = (attr['ksize'][1], attr['ksize'][2])
+            attr['strides'] = (attr['strides'][1], attr['strides'][2])
         elif attr['data_format'] == 'NCHW':
             attr['kernel_shape'] = (attr['ksize'][2], attr['ksize'][3])
+            attr['strides'] = (attr['strides'][2], attr['strides'][3])
         else:
             raise TypeError("Unsupported data_format type : {}".format(attr['data_format']))
 
@@ -140,9 +137,6 @@ def _impl(inputs, attr, params):
             attr['data_format'] = "NCHW"
             flip_layout = True
 
-        # Fix strides
-        attr['strides'] = (attr['strides'][1], attr['strides'][2])
-
         # Fix padding
         attr['padding'] = attr['padding'].decode("utf-8")
 
@@ -188,8 +182,15 @@ def _impl(inputs, attr, params):
         attr['data_format'] = attr['data_format'].decode("utf-8")
         flip_layout = False
 
+        # NCHW Layout require weights transpose
+        if attr['data_format'] == 'NCHW':
+            tmp_shape = attr['_input_shapes'][inputs[1]][0]
+            tmp_shape = [tmp_shape[ii] for ii in (3, 2, 0, 1)]
+            inputs[1] = _sym.transpose(inputs[1], axes=(3, 2, 0, 1))
+            attr['_input_shapes'][inputs[1]] = [tmp_shape]
+
         input_shape = attr['_input_shapes'][inputs[0]][0]
-        weights_shape = params[inputs[1].list_output_names()[0]].shape
+        weights_shape = attr['_input_shapes'][inputs[1]][0]
 
         if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC":
             input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
@@ -202,6 +203,7 @@ def _impl(inputs, attr, params):
                 inputs[1] = _sym.transpose(inputs[1], axes=(2, 3, 0, 1))
 
             attr['data_format'] = "NCHW"
+            attr['strides'] = [attr['strides'][ii] for ii in (0, 3, 1, 2)]
             flip_layout = True
 
         if attr['data_format'] == 'NHWC':
@@ -214,6 +216,7 @@ def _impl(inputs, attr, params):
 
             if 'dilations' in attr:
                 attr['dilations'] = (attr['dilations'][0], attr['dilations'][1])
+            attr['strides'] = (attr['strides'][1], attr['strides'][2])
         elif attr['data_format'] == 'NCHW':
             depth_mult, _, kernel_h, kernel_w = weights_shape
             attr['kernel_shape'] = (weights_shape[2], weights_shape[3])
@@ -226,6 +229,7 @@ def _impl(inputs, attr, params):
 
             if 'dilations' in attr:
                 attr['dilations'] = (attr['dilations'][2], attr['dilations'][3])
+            attr['strides'] = (attr['strides'][2], attr['strides'][3])
         else:
             raise TypeError("Unsupported data format type : {}".format(attr['data_format']))
 
@@ -233,9 +237,6 @@ def _impl(inputs, attr, params):
         if opname == 'depthwise':
             attr['groups'] = attr['channels']
 
-        # Fix strides
-        attr['strides'] = (attr['strides'][1], attr['strides'][2])
-
         # Fix padding
         attr['padding'] = attr['padding'].decode("utf-8")
 
@@ -416,12 +417,27 @@ def _fused_batch_norm():
     def _impl(inputs, attr, params):
         # Tensorflow: (data, gamma, beta, moving_mean, moving_variance)
         # NNVM:       (data, gamma, beta, moving_mean, moving_varience)
-        return AttrCvt(
-            op_name='batch_norm',
-            transforms={'scale_after_normalization':'scale', 'variance_epsilon':'epsilon'},
-            extras={'axis': 3}, # Fix axis
-            ignores=['data_format'],
-            disables=['momentum'])(inputs, attr)
+        axis = 3
+        need_cast = False
+
+        if 'data_format' in attr:
+            attr['data_format'] = attr['data_format'].decode("utf-8")
+            if attr['data_format'] == 'NCHW':
+                axis = 1
+        if 'U' in attr:
+            need_cast = True
+            inputs[0] = _sym.cast(inputs[0], dtype=attr['U'].name)
+
+        out = AttrCvt(op_name='batch_norm',
+                      transforms={'scale_after_normalization':'scale',
+                                  'variance_epsilon':'epsilon'},
+                      extras={'axis': axis},
+                      ignores=['data_format', 'U'],
+                      disables=['momentum'])(inputs, attr)
+
+        if need_cast:
+            out = _sym.cast(out, dtype=attr['T'].name)
+        return out
     return _impl
 
 def _batch_norm():
@@ -432,10 +448,16 @@ def _impl(inputs, attr, params):
         # (data, gamma, beta, moving_mean, moving_var)
         new_inputs = [inputs[0], inputs[4], inputs[3], inputs[1], inputs[2]]
 
+        axis = 3
+        if 'data_format' in attr:
+            attr['data_format'] = attr['data_format'].decode("utf-8")
+            if attr['data_format'] == 'NCHW':
+                axis = 1
+
         return AttrCvt(
             op_name='batch_norm',
             transforms={'scale_after_normalization':'scale', 'variance_epsilon':'epsilon'},
-            extras={'axis': 3}, # Fix axis
+            extras={'axis': axis},
             ignores=['data_format'],
             disables=['momentum'])(new_inputs, attr)
     return _impl
@@ -729,6 +751,14 @@ def _impl(inputs, attr, params):
         return gamma * (-alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0]))
     return _impl
 
+def _mean():
+    def _impl(inputs, attr, params):
+        axis = params.pop(inputs[1].list_output_names()[0])
+        return AttrCvt(op_name="mean", ignores=['Tdim', 'Tidx'],
+                       transforms={'keep_dims': 'keepdims'},
+                       extras={'axis': tuple(axis.asnumpy())})(inputs[0], attr)
+    return _impl
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -773,6 +803,7 @@ def _impl(inputs, attr, params):
     'Rsqrt'                             : _rsqrt(),
     'Squeeze'                           : _squeeze(),
     'FusedBatchNorm'                    : _fused_batch_norm(),
+    'FusedBatchNormV2'                  : _fused_batch_norm(),
     'Relu6'                             : _relu6(),
     'DepthwiseConv2dNative'             : _conv('depthwise'),
     'Shape'                             : _shape(),
@@ -787,6 +818,7 @@ def _impl(inputs, attr, params):
     'Rank'                              : _rank(),
     'Transpose'                         : _transpose(),
     'Tanh'                              : AttrCvt('tanh'),
+    'Mean'                              : _mean(),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index ad7f41a83e62..2d965a6540a1 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -88,7 +88,7 @@ def run_tf_graph(sess, input_data, input_node, output_node):
     return output_data
 
 
-def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False):
+def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, no_gpu=False):
     """Generic function to generate and compare tensorflow and TVM output"""
 
     out_node = out_name.split(':')[0] if ":" in out_name else out_name
@@ -116,6 +116,8 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False)
             if not ctx.exist:
                 print("Skip because %s is not enabled" % device)
                 continue
+            if no_gpu and device == 'cuda':
+                continue
 
             tvm_output = run_tvm_graph(final_graph_def, in_data,
                                        in_node, tf_output.shape, tf_output.dtype, target=device)
@@ -123,10 +125,20 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False)
 
         sess.close()
 
+def is_gpu_available():
+    from tensorflow.python.client import device_lib
+    local_device_protos = device_lib.list_local_devices()
+    gpu_list = [x.name for x in local_device_protos if x.device_type == 'GPU']
+    if len(gpu_list) < 0:
+        print("Tensorflow GPU:", gpu_list)
+        return True
+    else:
+        return False
+
 #######################################################################
 # Pooling
 # -------
-def _test_pooling(input_shape, **kwargs):
+def _test_pooling_iteration(input_shape, **kwargs):
     """ One iteration of pool operation with given shapes and attributes """
 
     x = -np.arange(
@@ -143,61 +155,45 @@ def _test_pooling(input_shape, **kwargs):
 
         compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
+def _test_pooling(input_shape, **kwargs):
+    _test_pooling_iteration(input_shape, **kwargs)
+
+    if is_gpu_available():
+        input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
+        kwargs['data_layout'] = 'NCHW'
+        _test_pooling_iteration(input_shape, **kwargs)
+
 def test_forward_pooling():
     """ Pooling """
 
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[2, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[2, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[2, 1])
-
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[2, 3],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[2, 1])
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[2, 3],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 2])
-
+    for pool_type in ['AVG', 'MAX']:
+            _test_pooling(input_shape=[2, 9, 10, 2],
+                         window_shape=[1, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 10, 9, 2],
+                         window_shape=[1, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 9, 10, 2],
+                         window_shape=[2, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 10, 9, 2],
+                         window_shape=[2, 3],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[2, 1])
 
 #######################################################################
 # Convolution
@@ -234,6 +230,12 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes,
                             'Placeholder:0', 'Conv2D:0')
 
 def test_forward_convolution():
+    if is_gpu_available():
+        _test_convolution([4, 176, 8, 8], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NCHW')
+        _test_convolution([4, 19, 17, 17], [3, 3, 19, 19], [1, 1], [2, 2], 'VALID', 'NCHW')
+        _test_convolution([4, 124, 17, 17], [1, 1, 124, 19], [1, 1], [1, 1], 'SAME', 'NCHW')
+        _test_convolution([4, 12, 17, 17], [3, 3, 12, 32], [1, 1], [2, 2], 'VALID', 'NCHW')
+
     _test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC')
     _test_convolution([4, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2], 'VALID', 'NHWC')
     _test_convolution([4, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], 'SAME', 'NHWC')
@@ -711,6 +713,25 @@ def test_forward_mobilenet():
             tvm_output = run_tvm_graph(graph_def, data, 'input', tf_output.shape, 'float32')
             np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
 
+#######################################################################
+# ResnetV2
+# ---------
+def test_forward_resnetv2():
+    '''test resnet model'''
+    if is_gpu_available():
+        with tf.Graph().as_default():
+            graph_def = nnvm.testing.tf.get_workload("ResnetV2/resnet-20180601_resnet_v2_imagenet-shapes.pb")
+            # Call the utility to import the graph definition into default graph.
+            graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
+
+            data = np.random.uniform(size=(128, 224, 224, 3)).astype('float32')
+            out_node = 'ArgMax'
+
+            with tf.Session() as sess:
+                tf_output = run_tf_graph(sess, data, 'input_tensor:0', out_node + ':0')
+                tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', tf_output.shape, 'float32')
+                np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+
 #######################################################################
 # PTB
 # ---
@@ -946,38 +967,70 @@ def test_forward_tanh():
         tf.nn.tanh(in1)
         compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Tanh:0')
 
+#######################################################################
+# Mean
+# ----
+def test_forward_mean():
+    def check_mean(ishape, **kwargs):
+        inp_array = np.random.uniform(size=ishape).astype(np.float32)
+        with tf.Graph().as_default():
+            in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+            tf.keras.backend.mean(in1, **kwargs)
+            compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Mean:0', no_gpu=True)
+
+    check_mean((10, 8, 16, 32))
+    check_mean((10, 8, 16, 32), axis=(2,3))
+    check_mean((10, 8, 16, 32), axis=(1,2), keepdims=True)
+
 #######################################################################
 # Main
 # ----
 if __name__ == '__main__':
+    # Transforms
     test_forward_transpose()
-    test_forward_convolution()
-    test_forward_pooling()
     test_forward_reshape()
     test_forward_squeeze()
+    test_forward_pack()
+    test_forward_resize_bilinear()
+    test_forward_pad()
+    test_forward_gather()
+    #test_forward_stridedslice()
+
+    # Activations
     test_forward_sigmoid()
+    test_forward_relu()
+    test_forward_leaky_relu()
+    test_forward_elu()
+    test_forward_selu()
+    test_forward_tanh()
+
+    # Reductions
     test_forward_argminmax()
     test_forward_reduce()
+    test_forward_mean()
+
+    # NN
+    test_forward_convolution()
+    test_forward_pooling()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
+    test_forward_lrn()
+    test_forward_l2_normalize()
+
+    # General
     test_forward_multi_input()
-    test_forward_pack()
+    test_forward_variable()
+
+    # End to End
     test_forward_inception_v3()
     test_forward_inception_v1()
     test_forward_mobilenet()
-    test_forward_variable()
-    test_forward_resize_bilinear()
-    test_forward_pad()
-    #test_forward_lstm()
-    #test_forward_stridedslice()
-    test_forward_gather()
+    test_forward_resnetv2()
     test_forward_ptb()
-    test_forward_lrn()
-    test_forward_l2_normalize()
+
+    # RNN
+    #test_forward_lstm()
+
+    # Elementwise
     test_forward_ceil()
     test_forward_floor()
-    test_forward_relu()
-    test_forward_leaky_relu()
-    test_forward_elu()
-    test_forward_selu()
-    test_forward_tanh()

From 8683c3169e3c1153dc3a87c660bd976e869f1096 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Mon, 24 Sep 2018 09:46:28 -0700
Subject: [PATCH 124/529] Update SGX cmake (#1763)

---
 .gitignore                              |  1 +
 CMakeLists.txt                          |  5 ++++
 cmake/config.cmake                      | 11 ++++++++
 cmake/modules/SGX.cmake                 | 37 +++++++++++++++++++++++++
 src/runtime/sgx/tvm.edl                 |  2 ++
 src/runtime/sgx/untrusted/sgx_module.cc |  3 +-
 6 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 cmake/modules/SGX.cmake

diff --git a/.gitignore b/.gitignore
index 3c968eb3ed47..01f91d69874f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -182,6 +182,7 @@ cat.jpg
 docs.tgz
 cat.png
 *.mlmodel
+tvm_u.*
 # Mac OS X
 .DS_Store
 build*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65a7d9e36e2d..1f03b9f64ab9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O
 tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
 tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
+tvm_option(USE_SGX "Build with SGX" OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
 tvm_option(USE_MSVC_MT "Build with MT" OFF)
 tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
@@ -170,6 +171,7 @@ include(cmake/modules/OpenGL.cmake)
 include(cmake/modules/Vulkan.cmake)
 include(cmake/modules/Metal.cmake)
 include(cmake/modules/ROCM.cmake)
+include(cmake/modules/SGX.cmake)
 include(cmake/modules/LLVM.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
 include(cmake/modules/contrib/Random.cmake)
@@ -179,6 +181,9 @@ include(cmake/modules/contrib/NNPack.cmake)
 add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
 add_library(tvm_topi SHARED ${TOPI_SRCS})
 add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
+if(NOT USE_SGX STREQUAL "OFF")
+  add_dependencies(tvm_runtime sgx_edl)
+endif()
 add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
 
 target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
diff --git a/cmake/config.cmake b/cmake/config.cmake
index c364a88cce11..a92be7ce3008 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -62,6 +62,17 @@ set(USE_VULKAN OFF)
 # Whether enable OpenGL runtime
 set(USE_OPENGL OFF)
 
+# Whether to enable SGX runtime
+#
+# Possible values for USE_SGX:
+# - /path/to/sgxsdk: path to Intel SGX SDK
+# - OFF: disable SGX
+#
+# SGX_MODE := HW|SIM
+set(USE_SGX OFF)
+set(SGX_MODE "SIM")
+set(RUST_SGX_SDK "/path/to/rust-sgx-sdk")
+
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
diff --git a/cmake/modules/SGX.cmake b/cmake/modules/SGX.cmake
new file mode 100644
index 000000000000..d7b8546d5d41
--- /dev/null
+++ b/cmake/modules/SGX.cmake
@@ -0,0 +1,37 @@
+if(NOT USE_SGX STREQUAL "OFF")
+  message(STATUS "Build with SGX support")
+
+  set(_sgx_src ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/sgx)
+  set(_tvm_u_h ${_sgx_src}/untrusted/tvm_u.h)
+  set(_tvm_edl ${_sgx_src}/tvm.edl)
+  set(_sgx_ustdc ${RUST_SGX_SDK}/sgx_ustdc)
+
+  set(_urts_lib "sgx_urts")
+  if(SGX_MODE STREQUAL "SIM")
+    set(_urts_lib "${_urts_lib}_sim")
+  endif()
+
+  add_custom_command(
+    OUTPUT ${_tvm_u_h}
+    COMMAND ${USE_SGX}/bin/x64/sgx_edger8r --untrusted
+      --untrusted-dir ${_sgx_src}/untrusted
+      --search-path ${USE_SGX}/include --search-path ${RUST_SGX_SDK}/edl
+      ${_tvm_edl}
+    COMMAND sed -i "4i '#include <tvm/runtime/c_runtime_api.h>'" ${_tvm_u_h}
+    DEPENDS ${_tvm_edl}
+  )
+  add_custom_command(
+    OUTPUT ${_sgx_ustdc}/libsgx_ustdc.a
+    COMMAND make
+    WORKING_DIRECTORY ${_sgx_ustdc}
+  )
+  add_custom_target(sgx_edl DEPENDS ${_tvm_u_h} ${_sgx_ustdc}/libsgx_ustdc.a)
+
+  include_directories(${USE_SGX}/include)
+  file(GLOB RUNTIME_SGX_SRCS ${_sgx_src}/untrusted/*.c*)
+  list(APPEND TVM_RUNTIME_LINKER_LIBS
+    -lpthread
+    -L${USE_SGX}/lib64 -l${_urts_lib}
+    -L${RUST_SGX_SDK}/sgx_ustdc -lsgx_ustdc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_SGX_SRCS})
+endif()
diff --git a/src/runtime/sgx/tvm.edl b/src/runtime/sgx/tvm.edl
index b4d9852f8499..55c8a878d766 100644
--- a/src/runtime/sgx/tvm.edl
+++ b/src/runtime/sgx/tvm.edl
@@ -1,5 +1,7 @@
 enclave {
     from "sgx_tstdc.edl" import *;
+    from "sgx_stdio.edl" import *;
+    from "sgx_backtrace.edl" import *;
 
     trusted {
         public void tvm_ecall_init([isptr, user_check] TVMRetValueHandle ret);
diff --git a/src/runtime/sgx/untrusted/sgx_module.cc b/src/runtime/sgx/untrusted/sgx_module.cc
index 8dd696349b05..b1c1692de398 100644
--- a/src/runtime/sgx/untrusted/sgx_module.cc
+++ b/src/runtime/sgx/untrusted/sgx_module.cc
@@ -4,11 +4,11 @@
  * \brief SGX enclave module.
  */
 #include <dmlc/logging.h>
+#include <sgx_urts.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/threading_backend.h>
-#include <sgx_urts.h>
 #include <algorithm>
 #include <fstream>
 #include <iostream>
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include "../common.h"
 #include "../../file_util.h"
+#include "./tvm_u.h"
 
 namespace tvm {
 namespace runtime {

From 9edb9216ac18266fcabf219107b1ba193d8e8c57 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Mon, 24 Sep 2018 22:57:12 +0530
Subject: [PATCH 125/529] [DOC] Argument name correction. (#1765)

---
 nnvm/src/top/tensor/transform.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index b1485438ca50..40c8c930a029 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -171,7 +171,7 @@ Example::
    y = [[3,3],[4,4],[5,5]]
    z = [[6,6], [7,7],[8,8]]
 
-   concatenate(x,y,z,dim=0) = [[ 1.,  1.],
+   concatenate(x,y,z,axis=0) = [[ 1.,  1.],
                                [ 2.,  2.],
                                [ 3.,  3.],
                                [ 4.,  4.],
@@ -183,7 +183,7 @@ Example::
    Note that you cannot concat x,y,z along dimension 1 since dimension
    0 is not the same for all the input arrays.
 
-   concatenate(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
+   concatenate(y,z,axis=1) = [[ 3.,  3.,  6.,  6.],
                              [ 4.,  4.,  7.,  7.],
                              [ 5.,  5.,  8.,  8.]]
 

From 17232fc903707cb067d80f8aa299da1132db76a5 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 25 Sep 2018 08:39:48 +0530
Subject: [PATCH 126/529] [DOC]Errors corrected (#1767)

---
 include/tvm/ir_pass.h                  | 2 +-
 python/tvm/_ffi/runtime_ctypes.py      | 2 +-
 python/tvm/schedule.py                 | 2 +-
 python/tvm/tensor_intrin.py            | 2 +-
 src/codegen/codegen_c.cc               | 2 +-
 src/codegen/verilog/codegen_verilog.cc | 4 ++--
 src/op/tensorize.cc                    | 4 ++--
 src/runtime/pack_args.h                | 2 +-
 src/runtime/rpc/rpc_session.cc         | 4 ++--
 vta/python/vta/ir_pass.py              | 2 +-
 10 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index ab42cfc9625f..9403a2e6151b 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -217,7 +217,7 @@ Stmt NarrowChannelAccess(Stmt stmt);
  * \param auto_max_step The maximum step before stop attach automatic unroll
  * \param auto_max_depth The maximum depth before stop attach automatic unroll
  * \param auto_max_extent The maximum extent of the loop we can unroll,
- *                        this is an legacy option that donot take the loop total steps into account.
+ *                     this is an legacy option that do not take the loop total steps into account.
  * \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen.
  * \return Transformed stmt.
  */
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 4f94e0e62d0a..2aced1aef7d2 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -67,7 +67,7 @@ def __init__(self, type_str):
             bits = 64
             head = ""
         else:
-            raise ValueError("Donot know how to handle type %s" % type_str)
+            raise ValueError("Do not know how to handle type %s" % type_str)
         bits = int(head) if head else bits
         self.bits = bits
 
diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py
index 594c2f2dc8bd..6c261a453457 100644
--- a/python/tvm/schedule.py
+++ b/python/tvm/schedule.py
@@ -362,7 +362,7 @@ def split(self, parent, factor=None, nparts=None):
         """
         if nparts is not None:
             if factor is not None:
-                raise ValueError("Donot need to provide both outer and nparts")
+                raise ValueError("Do not need to provide both outer and nparts")
             outer, inner = _api_internal._StageSplitByNParts(self, parent, nparts)
         else:
             if factor is None:
diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py
index 62f8c8897d10..193124b2f946 100644
--- a/python/tvm/tensor_intrin.py
+++ b/python/tvm/tensor_intrin.py
@@ -72,7 +72,7 @@ def decl_tensor_intrin(op,
     binds_list = []
     for t in inputs:
         if not isinstance(t.op, _tensor.PlaceholderOp):
-            raise ValueError("Donot yet support composition op")
+            raise ValueError("Do not yet support composition op")
 
     cfg = current_build_config()
     for t in tensors:
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index c3b0d278c7ac..d902437dd990 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -207,7 +207,7 @@ std::string CodeGenC::GetStructRef(
     } else if (t.is_int()) {
       os << "v_int64";
     } else {
-      LOG(FATAL) << "donot know how to handle type" << t;
+      LOG(FATAL) << "Do not know how to handle type" << t;
     }
     os << ")";
     return os.str();
diff --git a/src/codegen/verilog/codegen_verilog.cc b/src/codegen/verilog/codegen_verilog.cc
index d7e149257fdb..af3d2fcfe467 100644
--- a/src/codegen/verilog/codegen_verilog.cc
+++ b/src/codegen/verilog/codegen_verilog.cc
@@ -213,11 +213,11 @@ VerilogValue CodeGenVerilog::VisitExpr_(const UIntImm *op) {
   return IntConst(op, this);
 }
 VerilogValue CodeGenVerilog::VisitExpr_(const FloatImm *op) {
-  LOG(FATAL) << "Donot support float constant in Verilog";
+  LOG(FATAL) << "Do not support float constant in Verilog";
   return VerilogValue();
 }
 VerilogValue CodeGenVerilog::VisitExpr_(const StringImm *op) {
-  LOG(FATAL) << "Donot support string constant in Verilog";
+  LOG(FATAL) << "Do not support string constant in Verilog";
   return VerilogValue();
 }
 
diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc
index 6423c4e942e4..6daaedd16de1 100644
--- a/src/op/tensorize.cc
+++ b/src/op/tensorize.cc
@@ -52,10 +52,10 @@ size_t InferTensorizeRegion(
       const IterVarAttr& attr = (*iit).second;
       if (!found_point) {
         CHECK(!attr->bind_thread.defined())
-            << "Donot allow thread in tensorize scope";
+            << "Do not allow thread in tensorize scope";
       }
       if (attr->iter_type == kTensorized) {
-        CHECK(!found_point) << "Donot allow two tensorized point";
+        CHECK(!found_point) << "Do not allow two tensorized point";
         found_point = true;
         loc_scope = i - 1;
       }
diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h
index 0a00e79f07df..5170e5fd9e9a 100644
--- a/src/runtime/pack_args.h
+++ b/src/runtime/pack_args.h
@@ -168,7 +168,7 @@ inline PackedFunc PackFuncNonBufferArg_(
       switch (codes[i]) {
         case INT64_TO_INT64:
         case FLOAT64_TO_FLOAT64: {
-          LOG(FATAL) << "Donot support 64bit argument to device function"; break;
+          LOG(FATAL) << "Do not support 64bit argument to device function"; break;
         }
         case INT64_TO_INT32: {
           holder[i].v_int32 = static_cast<int32_t>(args.values[base + i].v_int64);
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 0e2d637ab475..208944a69dce 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -250,9 +250,9 @@ class RPCSession::EventHandler : public dmlc::Stream {
           this->Write(arr->dtype);
           this->WriteArray(arr->shape, arr->ndim);
           CHECK(arr->strides == nullptr)
-              << "Donot support strided remote array";
+              << "Do not support strided remote array";
           CHECK_EQ(arr->byte_offset, 0)
-              << "Donot support send byte offset";
+              << "Do not support send byte offset";
           break;
         }
         case kNull: break;
diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py
index 90df67c53278..3efef7135edb 100644
--- a/vta/python/vta/ir_pass.py
+++ b/vta/python/vta/ir_pass.py
@@ -556,7 +556,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value):
             return irb.get()
 
         else:
-            raise RuntimeError("Donot support copy %s->%s" % (src.scope, dst.scope))
+            raise RuntimeError("Do not support copy %s->%s" % (src.scope, dst.scope))
 
     return tvm.ir_pass.InjectCopyIntrin(stmt_in, "dma_copy", _inject_copy)
 

From b69bb1020ee581dc0d924f99910f8a9cb610c46d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 25 Sep 2018 12:10:11 +0900
Subject: [PATCH 127/529] [NNVM] Bugfix operator fusion for residual block with
 layout transform (#1760)

* Bugfix operator fusion for residual block with layout transform

* add a test case

* update error message
---
 nnvm/src/compiler/graph_compile.cc           |  3 +-
 nnvm/src/compiler/graph_fuse.cc              | 17 +++++++--
 nnvm/tests/python/compiler/test_op_fusion.py | 39 ++++++++++++++++++++
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/nnvm/src/compiler/graph_compile.cc b/nnvm/src/compiler/graph_compile.cc
index e51730c09d66..3316f3932e27 100644
--- a/nnvm/src/compiler/graph_compile.cc
+++ b/nnvm/src/compiler/graph_compile.cc
@@ -109,13 +109,14 @@ nnvm::Graph GraphCompile(const nnvm::Graph& g) {
       inputs.push_back(it->second);
     }
     // Find master idx in the subgraph.
-    int sub_master_idx = 0;
+    int sub_master_idx = -1;
     for (uint32_t i = 0; i < subidx.num_nodes(); i++) {
       if (subidx[i].source->op() == idx[master].source->op()) {
         sub_master_idx = i;
         break;
       }
     }
+    CHECK_NE(sub_master_idx, -1) << "A master node not found in the subgraph.";
     fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx);
     for (LoweredFunc f : fe.compiled_func->funcs) {
       if (!func_set.count(f.get())) {
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index c9ea58affb2c..4d724ae66c35 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -136,11 +136,15 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
 
   // Point to the group root id of each node.
   GroupVec group_vec(idx.num_nodes(), -1);
+  std::vector<std::vector<uint32_t> > node_ids_per_group(idx.num_nodes());
   for (uint32_t i = idx.num_nodes(); i != 0; --i) {
     uint32_t nid = i - 1;
     const auto& inode = idx[nid];
+    bool is_root = false;
     if (group_vec[nid] == -1) {
       group_vec[nid] = nid;
+      node_ids_per_group[nid].push_back(nid);
+      is_root = true;
     }
 
     // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group.
@@ -156,7 +160,15 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       }
     }
     // Change the master node from out_ewise_fusable op to itself
-    if (parent_injective && parent_out_ewise) master_vec[nid] = nid;
+    if (parent_injective && parent_out_ewise) {
+      master_vec[nid] = nid;
+      if (!is_root) {
+        // Children nodes in the same group might be pointing to a master node in a different group.
+        for (uint32_t j : node_ids_per_group[group_vec[nid]]) {
+          master_vec[j] = nid;
+        }
+      }
+    }
 
     // Propagate the group id.
     for (const auto& e : inode.inputs) {
@@ -172,6 +184,7 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
         CHECK(group_vec[e.node_id] == -1||
               group_vec[e.node_id] == group_vec[nid]);
         group_vec[e.node_id] = group_vec[nid];
+        node_ids_per_group[group_vec[nid]].push_back(e.node_id);
       }
     }
   }
@@ -223,12 +236,10 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
   */
   if (opt_level >= 1) {
     std::vector<std::vector<uint32_t> > children_group_ids(idx.num_nodes());
-    std::vector<std::vector<uint32_t> > node_ids_per_group(idx.num_nodes());
     for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) {
       const auto& inode = idx[nid];
       if (inode.source->is_variable()) continue;
       CHECK_NE(group_vec[nid], -1);
-      node_ids_per_group[group_vec[nid]].push_back(nid);
       if (inode.inputs.size() != 1) continue;
       const uint32_t parent_nid = inode.inputs[0].node_id;
       // if parent node has more than one child, record each child's group id.
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 0c81ac890d55..288f112f1063 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -143,6 +143,44 @@ def test_concatenate_conv2d():
         np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
 
 
+def test_residual_block_layout_transform():
+    ch = 16
+    size = 32
+    data = sym.Variable(name="data")
+    conv1 = sym.conv2d(data=data, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv1")
+    layout_transform1 = sym.__layout_transform__(data=conv1, src_layout="NCHW", dst_layout="NCHW8c")
+    layout_transform2 = sym.__layout_transform__(data=layout_transform1, src_layout="NCHW8c", dst_layout="NCHW")
+    conv2 = sym.conv2d(data=conv1, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv2")
+    elemwise_sum = sym.elemwise_add(layout_transform2, conv2)
+    out = sym.relu(elemwise_sum)
+
+    dtype="float32"
+    dshape = (1, ch, size, size)
+    kshape = (ch, ch, 3, 3)
+    oshape = (1, ch, size, size)
+    shape_dict = {"data": dshape}
+
+    target = "llvm" # only test on llvm since it involves NCHW8c layout
+    ctx = tvm.context(target, 0)
+    graph, lib, _ = nnvm.compiler.build(out, target, shape_dict)
+    # data, conv1 weight, conv1, layout transform + elemwise add + relu, conv2 weight, conv2 op
+    assert graph.index.num_nodes == 6
+
+    data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+    kernel1 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+    kernel2 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+    m = graph_runtime.create(graph, lib, ctx)
+    m.run(data=data, conv1_weight=kernel1, conv2_weight=kernel2)
+    out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+
+    conv1 = topi.testing.conv2d_nchw_python(
+        data.asnumpy(), kernel1.asnumpy(), (1,1), 'SAME')
+    conv2 = topi.testing.conv2d_nchw_python(
+        conv1, kernel2.asnumpy(), (1,1), 'SAME')
+    ref = np.maximum(conv1 + conv2, 0)
+    np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+
+
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
     with nnvm.compiler.build_config(opt_level=opt_level):
         graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
@@ -191,3 +229,4 @@ def get_sym(out_channel):
     test_fuse_conv2d_elu()
     test_injective_conv2d()
     test_concatenate_conv2d()
+    test_residual_block_layout_transform()

From bd4ce3b21a84bd8453abccb26b93824e0d4aeb59 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 25 Sep 2018 08:42:29 +0530
Subject: [PATCH 128/529] [DOCKER] CUDA upgrade to 9.0 to acommodate
 tensorflow-gpu (1.10.0). (#1761)

---
 docker/Dockerfile.ci_gpu                    | 5 ++++-
 docker/Dockerfile.demo_gpu                  | 2 +-
 docker/install/ubuntu_install_tensorflow.sh | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 docker/install/ubuntu_install_tensorflow.sh

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index c177ef9d420a..bca16b59366b 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -1,5 +1,5 @@
 # CI docker GPU env
-FROM nvidia/cuda:8.0-cudnn7-devel
+FROM nvidia/cuda:9.0-cudnn7-devel
 
 # Base scripts
 RUN apt-get update --fix-missing
@@ -62,6 +62,9 @@ RUN pip3 install Pillow
 COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
 RUN bash /install/ubuntu_install_vulkan.sh
 
+COPY install/ubuntu_install_tensorflow.sh /install/ubuntu_install_tensorflow.sh
+RUN bash /install/ubuntu_install_tensorflow.sh
+
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.demo_gpu b/docker/Dockerfile.demo_gpu
index 6f249986e22c..d20293c4ed3d 100644
--- a/docker/Dockerfile.demo_gpu
+++ b/docker/Dockerfile.demo_gpu
@@ -1,6 +1,6 @@
 # Minimum docker image for demo purposes
 # prebuilt-image: tvmai/demo-gpu
-FROM nvidia/cuda:8.0-cudnn7-devel
+FROM nvidia/cuda:9.0-cudnn7-devel
 
 RUN apt-get update --fix-missing
 
diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh
new file mode 100644
index 000000000000..407954f8fd46
--- /dev/null
+++ b/docker/install/ubuntu_install_tensorflow.sh
@@ -0,0 +1 @@
+pip3 install tensorflow-gpu

From 916a03cefebb6b01c481660f9f4450f59326fc3b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Mon, 24 Sep 2018 20:13:34 -0700
Subject: [PATCH 129/529] [RUNTIME] Add fp16/fp32 conversion functions (#1766)

---
 .gitmodules                                   |   6 +-
 HalideIR => 3rdparty/HalideIR                 |   0
 3rdparty/compiler-rt/builtin_fp16.h           | 210 ++++++++++++++++++
 dlpack => 3rdparty/dlpack                     |   0
 dmlc-core => 3rdparty/dmlc-core               |   0
 CMakeLists.txt                                |  17 +-
 Makefile                                      |  14 +-
 .../app/src/main/jni/Android.mk               |   6 +-
 apps/android_rpc/app/src/main/jni/Android.mk  |   6 +-
 apps/extension/Makefile                       |   6 +-
 apps/howto_deploy/Makefile                    |   4 +-
 apps/howto_deploy/tvm_runtime_pack.cc         |   4 +-
 apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj |  16 +-
 apps/rocm_rpc/Makefile                        |   4 +-
 apps/sgx/Makefile                             |   2 +-
 nnvm/Makefile                                 |   2 +-
 nnvm/amalgamation/Makefile                    |   2 +-
 python/setup.py                               |   4 +-
 src/runtime/builtin_fp16.cc                   |  21 ++
 tests/python/unittest/test_runtime_ndarray.py |  21 ++
 vta/python/vta/pkg_config.py                  |   4 +-
 21 files changed, 301 insertions(+), 48 deletions(-)
 rename HalideIR => 3rdparty/HalideIR (100%)
 create mode 100644 3rdparty/compiler-rt/builtin_fp16.h
 rename dlpack => 3rdparty/dlpack (100%)
 rename dmlc-core => 3rdparty/dmlc-core (100%)
 create mode 100644 src/runtime/builtin_fp16.cc

diff --git a/.gitmodules b/.gitmodules
index 3f0b222a86c6..8011ec12d24b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "dmlc-core"]
-	path = dmlc-core
+	path = 3rdparty/dmlc-core
 	url = https://github.com/dmlc/dmlc-core
 [submodule "HalideIR"]
-	path = HalideIR
+	path = 3rdparty/HalideIR
 	url = https://github.com/dmlc/HalideIR
 [submodule "dlpack"]
-	path = dlpack
+	path = 3rdparty/dlpack
 	url = https://github.com/dmlc/dlpack
diff --git a/HalideIR b/3rdparty/HalideIR
similarity index 100%
rename from HalideIR
rename to 3rdparty/HalideIR
diff --git a/3rdparty/compiler-rt/builtin_fp16.h b/3rdparty/compiler-rt/builtin_fp16.h
new file mode 100644
index 000000000000..1657d2830119
--- /dev/null
+++ b/3rdparty/compiler-rt/builtin_fp16.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.cc
+ * \brief Functions for conversion between fp32 and fp16, adopted from compiler-rt.
+ */
+
+#include <cstdint>
+
+static inline uint32_t __clz(uint32_t x) {
+  // count leading zeros
+  int n = 32;
+  uint32_t y;
+
+  y = x >>16; if (y) { n = n -16; x = y; }
+  y = x >> 8; if (y) { n = n - 8; x = y; }
+  y = x >> 4; if (y) { n = n - 4; x = y; }
+  y = x >> 2; if (y) { n = n - 2; x = y; }
+  y = x >> 1; if (y) return n - 2;
+  return n - x;
+}
+
+template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+          typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __truncXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcSignificandMask = srcMinNormal - 1;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T roundMask = (SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS)) - 1;
+  const SRC_REP_T halfway = SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS - 1);
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T) * 8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const int underflowExponent = srcExpBias + 1 - dstExpBias;
+  const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
+  const SRC_REP_T underflow = (SRC_REP_T)underflowExponent << SRC_SIG_BITS;
+  const SRC_REP_T overflow = (SRC_REP_T)overflowExponent << SRC_SIG_BITS;
+
+  const DST_REP_T dstQNaN = DST_REP_T(1) << (DST_SIG_BITS - 1);
+  const DST_REP_T dstNaNCode = dstQNaN - 1;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  if (aAbs - underflow < aAbs - overflow) {
+    // The exponent of a is within the range of normal numbers in the
+    // destination format.  We can convert by simply right-shifting with
+    // rounding and adjusting the exponent.
+    absResult = aAbs >> (SRC_SIG_BITS - DST_SIG_BITS);
+    absResult -= (DST_REP_T)(srcExpBias - dstExpBias) << DST_SIG_BITS;
+
+    const SRC_REP_T roundBits = aAbs & roundMask;
+    // Round to nearest
+    if (roundBits > halfway)
+      absResult++;
+      // Ties to even
+    else if (roundBits == halfway)
+      absResult += absResult & 1;
+  }
+  else if (aAbs > srcInfinity) {
+    // a is NaN.
+    // Conjure the result by beginning with infinity, setting the qNaN
+    // bit and inserting the (truncated) trailing NaN field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= dstQNaN;
+    absResult |= ((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode;
+  }
+  else if (aAbs >= overflow) {
+    // a overflows to infinity.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+  }
+  else {
+    // a underflows on conversion to the destination type or is an exact
+    // zero.  The result may be a denormal or zero.  Extract the exponent
+    // to get the shift amount for the denormalization.
+    const int aExp = aAbs >> SRC_SIG_BITS;
+    const int shift = srcExpBias - dstExpBias - aExp + 1;
+
+    const SRC_REP_T significand = (aRep & srcSignificandMask) | srcMinNormal;
+
+    // Right shift by the denormalization amount with sticky.
+    if (shift > SRC_SIG_BITS) {
+      absResult = 0;
+    } else {
+      const bool sticky = significand << (srcBits - shift);
+      SRC_REP_T denormalizedSignificand = significand >> shift | sticky;
+      absResult = denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS);
+      const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
+      // Round to nearest
+      if (roundBits > halfway)
+        absResult++;
+        // Ties to even
+      else if (roundBits == halfway)
+        absResult += absResult & 1;
+    }
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | sign >> (srcBits - dstBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
+
+template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+         typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __extendXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T)*8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  // If sizeof(SRC_REP_T) < sizeof(int), the subtraction result is promoted
+  // to (signed) int.  To avoid that, explicitly cast to SRC_REP_T.
+  if ((SRC_REP_T)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {
+    // a is a normal number.
+    // Extend to the destination type by shifting the significand and
+    // exponent into the proper position and rebiasing the exponent.
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult += (DST_REP_T)(dstExpBias - srcExpBias) << DST_SIG_BITS;
+  }
+
+  else if (aAbs >= srcInfinity) {
+    // a is NaN or infinity.
+    // Conjure the result by beginning with infinity, then setting the qNaN
+    // bit (if needed) and right-aligning the rest of the trailing NaN
+    // payload field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
+  }
+  else if (aAbs) {
+    // a is denormal.
+    // renormalize the significand and clear the leading bit, then insert
+    // the correct adjusted exponent in the destination type.
+    const int scale = __clz(aAbs) - __clz(srcMinNormal);
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS + scale);
+    absResult ^= dstMinNormal;
+    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
+    absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
+  }
+  else {
+    // a is zero.
+    absResult = 0;
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
diff --git a/dlpack b/3rdparty/dlpack
similarity index 100%
rename from dlpack
rename to 3rdparty/dlpack
diff --git a/dmlc-core b/3rdparty/dmlc-core
similarity index 100%
rename from dmlc-core
rename to 3rdparty/dmlc-core
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f03b9f64ab9..5e1d8bcc38fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,8 +50,9 @@ tvm_option(USE_RANDOM "Build with random support" OFF)
 
 # include directories
 include_directories("include")
-include_directories("dlpack/include")
-include_directories("dmlc-core/include")
+include_directories("3rdparty/dlpack/include")
+include_directories("3rdparty/dmlc-core/include")
+include_directories("3rdparty/compiler-rt")
 
 # initial variables
 set(TVM_LINKER_LIBS "")
@@ -87,8 +88,8 @@ else(MSVC)
 endif(MSVC)
 
 # add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
-FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "3rdparty/HalideIR/src/*.h"
                                 "nnvm/src/*.h" "nnvm/include/*.h")
 assign_source_group("Source" ${GROUP_SOURCE})
 assign_source_group("Include" ${GROUP_INCLUDE})
@@ -127,7 +128,7 @@ file(GLOB_RECURSE NNVM_COMPILER_SRCS
 file(GLOB TOPI_SRCS
     topi/src/*.cc
 )
-file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
+file(GLOB_RECURSE HALIDEIR_SRCS 3rdparty/HalideIR/src/*.cpp)
 list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
 file(GLOB RUNTIME_SRCS src/runtime/*.cc)
 
@@ -194,7 +195,7 @@ target_link_libraries(nnvm_compiler tvm)
 # Related headers
 target_include_directories(
   tvm
-  PUBLIC "HalideIR/src"
+  PUBLIC "3rdparty/HalideIR/src"
   PUBLIC "topi/include")
 target_include_directories(
   tvm_topi
@@ -244,12 +245,12 @@ if (INSTALL_DEV)
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "HalideIR/src/." DESTINATION "include/HalideIR"
+    DIRECTORY "3rdparty/HalideIR/src/." DESTINATION "include/HalideIR"
     FILES_MATCHING
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "dlpack/include/." DESTINATION "include"
+    DIRECTORY "3rdparty/dlpack/include/." DESTINATION "include"
     FILES_MATCHING
     PATTERN "*.h"
     )
diff --git a/Makefile b/Makefile
index 2d3d4843c4c0..6a9e3063de39 100644
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,11 @@ ROOTDIR = $(CURDIR)
 	 cython cython2 cython3 web runtime vta
 
 ifndef DMLC_CORE_PATH
-  DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
+  DMLC_CORE_PATH = $(ROOTDIR)/3rdparty/dmlc-core
 endif
 
 ifndef DLPACK_PATH
-  DLPACK_PATH = $(ROOTDIR)/dlpack
+  DLPACK_PATH = $(ROOTDIR)/3rdparty/dlpack
 endif
 
 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
@@ -50,10 +50,10 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
 
 # Lint scripts
 cpplint:
-	python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
-	python3 dmlc-core/scripts/lint.py topi cpp topi/include;
-	python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
-	python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
+	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
+	python3 3rdparty/dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src verilog\
 	 examples/extension/src examples/graph_executor/src
 
 pylint:
@@ -63,7 +63,7 @@ pylint:
 	python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
 
 jnilint:
-	python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
 
 lint: cpplint pylint jnilint
 
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
index a99517f90332..da5f499ea706 100644
--- a/apps/android_deploy/app/src/main/jni/Android.mk
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed
diff --git a/apps/android_rpc/app/src/main/jni/Android.mk b/apps/android_rpc/app/src/main/jni/Android.mk
index a99517f90332..da5f499ea706 100644
--- a/apps/android_rpc/app/src/main/jni/Android.mk
+++ b/apps/android_rpc/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed
diff --git a/apps/extension/Makefile b/apps/extension/Makefile
index 29b9a1163f16..3a1f8a2160ee 100644
--- a/apps/extension/Makefile
+++ b/apps/extension/Makefile
@@ -2,9 +2,9 @@
 TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
-	-I${TVM_ROOT}/dmlc-core/include\
-	-I${TVM_ROOT}/dlpack/include\
-	-I${TVM_ROOT}/HalideIR/src
+	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/HalideIR/src
 
 PKG_LDFLAGS =-L${TVM_ROOT}/lib
 UNAME_S := $(shell uname -s)
diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
index ad4e56680d21..7accb7dd64ae 100644
--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
@@ -1,12 +1,12 @@
 # Makefile Example to deploy TVM modules.
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
 
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 
 PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread
 
diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
index 27f95e9e6065..c4b6e2a2d44e 100644
--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -8,8 +8,8 @@
  *  - Compile with -std=c++11
  *  - Add the following include path
  *     - /path/to/tvm/include/
- *     - /path/to/tvm/dmlc-core/include/
- *     - /path/to/tvm/dlpack/include/
+ *     - /path/to/tvm/3rdparty/dmlc-core/include/
+ *     - /path/to/tvm/3rdparty/dlpack/include/
  *   - Add -lpthread -ldl to the linked library.
  *   - You are good to go.
  *   - See the Makefile in the same folder for example.
diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
index d53ed6ba4cb9..60b6e99e7a92 100644
--- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
+++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
@@ -386,8 +386,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -406,8 +406,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -422,9 +422,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
@@ -440,9 +440,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
diff --git a/apps/rocm_rpc/Makefile b/apps/rocm_rpc/Makefile
index b4e527980941..d4e3ec06ca99 100644
--- a/apps/rocm_rpc/Makefile
+++ b/apps/rocm_rpc/Makefile
@@ -3,12 +3,12 @@ ROCM_PATH=/opt/rocm
 
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
 
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 	-I${ROCM_PATH}/include
 
 PKG_LDFLAGS = -L${ROCM_PATH}/lib -L${TVM_ROOT}/lib -ldl -lpthread -lhip_hcc -lMIOpen
diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
index cd7034d4c41b..1038f57c3ba1 100644
--- a/apps/sgx/Makefile
+++ b/apps/sgx/Makefile
@@ -23,7 +23,7 @@ uservice_library_name := sgx_uae_service$(sgx_sim)
 pkg_cflags := -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 	-I.\
 	-DDMLC_LOG_STACK_TRACE=0\
 	-fmax-errors=4
diff --git a/nnvm/Makefile b/nnvm/Makefile
index adbae329e144..4ebd9ac95b70 100644
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -13,7 +13,7 @@ TVMPATH = ..
 
 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
-CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/dlpack/include -I$(TVMPATH)/HalideIR/src -I$(TVMPATH)/topi/include
+CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/3rdparty/dlpack/include -I$(TVMPATH)/3rdparty/HalideIR/src -I$(TVMPATH)/topi/include
 
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
diff --git a/nnvm/amalgamation/Makefile b/nnvm/amalgamation/Makefile
index 1f286f055237..4305339e0075 100644
--- a/nnvm/amalgamation/Makefile
+++ b/nnvm/amalgamation/Makefile
@@ -4,7 +4,7 @@ export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
 else
-  CFLAGS += -I$(CURDIR)/../dmlc-core/include
+  CFLAGS += -I$(CURDIR)/../3rdparty/dmlc-core/include
 endif
 
 .PHONY: all clean
diff --git a/python/setup.py b/python/setup.py
index cbf8c5591703..71d61a52e349 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -74,8 +74,8 @@ def config_cython():
                 "tvm._ffi.%s.%s" % (subdir, fn[:-4]),
                 ["tvm/_ffi/_cython/%s" % fn],
                 include_dirs=["../include/",
-                              "../dmlc-core/include",
-                              "../dlpack/include",
+                              "../3rdparty/dmlc-core/include",
+                              "../3rdparty/dlpack/include",
                 ],
                 library_dirs=library_dirs,
                 libraries=libraries,
diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
new file mode 100644
index 000000000000..c259399e05e9
--- /dev/null
+++ b/src/runtime/builtin_fp16.cc
@@ -0,0 +1,21 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.cc
+ * \brief Functions for conversion between fp32 and fp16
+*/
+
+#include <builtin_fp16.h>
+
+namespace tvm {
+namespace runtime {
+
+extern "C"  uint16_t __gnu_f2h_ieee(float a) {
+  return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
+}
+
+extern "C" float __gnu_h2f_ieee(uint16_t a) {
+  return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py
index 9f33e2aabfd8..7be538199a58 100644
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -35,5 +35,26 @@ def test_nd_create():
         ctx.sync()
 
 
+def test_fp16_conversion():
+    n = 100
+
+    for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]:
+        A = tvm.placeholder((n,), dtype=src)
+        B = tvm.compute((n,), lambda i: A[i].astype(dst))
+
+        s = tvm.create_schedule([B.op])
+        func = tvm.build(s, [A, B], 'llvm')
+
+        x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50)
+        y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50)
+
+        func(x_tvm, y_tvm)
+
+        expected = x_tvm.asnumpy().astype(dst)
+        real = y_tvm.asnumpy()
+
+        np.testing.assert_allclose(expected, real)
+
 if __name__ == "__main__":
     test_nd_create()
+    test_fp16_conversion()
diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py
index c3fe09effb76..30b4808f5e2d 100644
--- a/vta/python/vta/pkg_config.py
+++ b/vta/python/vta/pkg_config.py
@@ -42,8 +42,8 @@ def __init__(self, cfg, proj_root):
         self.include_path = [
             "-I%s/include" % proj_root,
             "-I%s/vta/include" % proj_root,
-            "-I%s/dlpack/include" % proj_root,
-            "-I%s/dmlc-core/include" % proj_root
+            "-I%s/3rdparty/dlpack/include" % proj_root,
+            "-I%s/3rdparty/dmlc-core/include" % proj_root
         ]
         # List of source files that can be used to build standalone library.
         self.lib_source = []

From 61329d816c0243f39bef4b5c80181c0b11f5e0af Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 25 Sep 2018 13:30:20 -0400
Subject: [PATCH 130/529] INT8 conv operator implementation with NCHWc data
 layout for Intel machines (#1680)

* Int8 implementation for convolution operator on Intel Skylake

* Int8 implementation for convolution operator on Intel Skylake

* PR changes

* PR changes

* PR changes

* Fixing an error

* Fixing an error

* Minor typos fix

* Minor typos fix

* Removing the broadcast16 CPP code. Using astype feature instead

* Replacing constant by variable name num_elements_intel

* Name fixes and tensorize update rule updated

* Fixing the bug about checking skylake

* Replacing bitcast with reinterpret

* Isolating INT8 and FP32 schedules to ease out future AutoTVM PR merge

* Putting check_skylake function in the x86 directory

* Added documentation and organizing files to better locations

* Tensor intrin renaming. Avoid code duplication for intrin by kernel reshaping
---
 topi/python/topi/nn/conv2d.py             |  28 +++-
 topi/python/topi/x86/check_targets.py     |  12 ++
 topi/python/topi/x86/conv2d.py            | 145 +++++++++++++++++++--
 topi/python/topi/x86/conv2d_avx_1x1.py    | 117 +++++++++++++++++
 topi/python/topi/x86/conv2d_avx_common.py | 123 ++++++++++++++++++
 topi/python/topi/x86/tensor_intrin.py     |  84 ++++++++++++
 topi/recipe/conv/test_conv_int8_intel.py  | 149 ++++++++++++++++++++++
 7 files changed, 643 insertions(+), 15 deletions(-)
 create mode 100644 topi/python/topi/x86/check_targets.py
 create mode 100644 topi/python/topi/x86/tensor_intrin.py
 create mode 100644 topi/recipe/conv/test_conv_int8_intel.py

diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index e0d2c403d4b4..3e06f6f6fed5 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -79,12 +79,27 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
         HSTR, WSTR = stride
     else:
         HSTR, WSTR = stride, stride
-    assert data.dtype == kernel.dtype, \
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
+        "Do not support inputs with different data types now. ' \
+        '{} vs. {}".format(data.dtype, kernel.dtype)
+    return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
+
+def _get_workload_int8(data, kernel, stride, padding, out_dtype):
+    """ Get the workload structure. """
+    _, CI, IH, IW = [x.value for x in data.shape]
+    CO, _, KH, KW = [x.value for x in kernel.shape]
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    if isinstance(stride, (tuple, list)):
+        HSTR, WSTR = stride
+    else:
+        HSTR, WSTR = stride, stride
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
         "Do not support inputs with different data types now. ' \
         '{} vs. {}".format(data.dtype, kernel.dtype)
     return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
 
 
+
 @tvm.target.generic_func
 def _get_alter_layout_schedule(wkl):
     # pylint: disable=unreachable
@@ -118,6 +133,17 @@ def _get_schedule_NCHWc(wkl, layout, out_layout):
     return wkl
 
 
+@tvm.target.generic_func
+def _get_schedule_NCHWc_int8(wkl, layout, out_layout):
+    # pylint: disable=unreachable
+    """ Get the platform specific schedule. """
+    target = tvm.target.current_target()
+    raise RuntimeError(
+        "No schedule for current target:{}".format(target))
+    # This return has no use, merely to supress pylint warning
+    return wkl
+
+
 def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """Convolution operator in NCHW layout.
 
diff --git a/topi/python/topi/x86/check_targets.py b/topi/python/topi/x86/check_targets.py
new file mode 100644
index 000000000000..fad74eaf582a
--- /dev/null
+++ b/topi/python/topi/x86/check_targets.py
@@ -0,0 +1,12 @@
+# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument
+"""Checks different x86 targets for target specific schedules"""
+
+def check_skylake(target):
+    """
+    Checks if the target is skylake
+    """
+
+    for opt in target.options:
+        if opt == '-mcpu=skylake-avx512':
+            return True
+    return False
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 721c7c169d99..6fe59a909510 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -5,12 +5,13 @@
 from .. import nn
 from ..nn.util import infer_pad, infer_stride
 from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \
-    _get_workload, _get_schedule, _get_schedule_NCHWc, \
-    _get_alter_layout_schedule, Workload
+    _get_workload, _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \
+    _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload
 
 from . import conv2d_avx_1x1, conv2d_avx_common
 from .conv2d_avx_common import AVXConvCommonFwd
 from .conv2d_avx_1x1 import AVXConv1x1Fwd
+from .check_targets import check_skylake
 
 @_get_schedule.register("cpu")
 def _get_schedule_conv(wkl):
@@ -100,10 +101,95 @@ def _get_schedule_conv(wkl):
     sch = _SCHEDULES_AVX[idx]
     return sch
 
+def _get_schedule_conv_int8(wkl):
+    _WORKLOADS_AVX = [
+        ## Following are for INT8 kernels
+        Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
+        Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
+        Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
+        Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
+        Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
+        Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
+        Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
+        # workloads of resnet34_v1 on imagenet, no extra workload required
+        # workloads of resnet50_v1 on imagenet
+        Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
+        Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
+        Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
+    ]
+
+    fp32_vec_len = 8
+    target = tvm.target.current_target(allow_none=False)
+    if check_skylake(target):
+        fp32_vec_len = 16
+
+    _SCHEDULES_AVX = [
+        # Following are for INT8 operations
+        # workloads of resnet18_v1 on imagenet
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7),
+        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
+        # workloads of resnet34_v1 on imagenet, no extra workload required
+        # workloads of resnet50_v1 on imagenet
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
+        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
+        # workloads of resnet101_v1 on imagenet, no extra workload required
+        # workloads of resnet152_v1 on imagenet, no extra workload required
+        # workloads of resnet18_v2 on imagenet, no extra workload required
+        # workloads of resnet34_v2 on imagenet, no extra workload required
+    ]
+
+    if wkl not in _WORKLOADS_AVX:
+        if wkl.hkernel == 1 and wkl.wkernel == 1:
+            return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len)
+        return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len)
+    idx = _WORKLOADS_AVX.index(wkl)
+    sch = _SCHEDULES_AVX[idx]
+    return sch
+
 @_get_schedule_NCHWc.register("cpu")
 def _get_schedule_NCHWc_x86(wkl, layout, out_layout):
     return _get_schedule_conv(wkl)
 
+@_get_schedule_NCHWc_int8.register("cpu")
+def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout):
+    return _get_schedule_conv_int8(wkl)
+
 @_get_alter_layout_schedule.register("cpu")
 def _get_alter_layout_schedule_x86(wkl):
     return _get_schedule_conv(wkl)
@@ -162,6 +248,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 
+
 @conv2d_NCHWc.register("cpu")
 def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride,
                             padding, layout, out_layout, out_dtype):
@@ -169,13 +256,29 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride,
         AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc,
         AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc
     }
+
+    # Use int8 schedules if the input data is of int8 dtype
+    if data.dtype == 'uint8':
+        _AVX_SCH_TO_DECL_FUNC = {
+            AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc_int8,
+            AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc_int8
+        }
+
     n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
     ic = ic_chunk * ic_block
     kh, kw = kernel_size
-    wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype),
-                        tvm.placeholder((num_filter, ic, kh, kw), dtype=out_dtype),
-                        stride, padding, out_dtype)
-    sch = _get_schedule_NCHWc(wkl, layout, out_layout)
+    if data.dtype == 'uint8':
+        wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
+                                 tvm.placeholder((num_filter, ic, kh, kw),
+                                                 dtype=kernel.dtype),
+                                 stride, padding, out_dtype)
+        sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
+    else:
+        wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
+                            tvm.placeholder((num_filter, ic, kh, kw),
+                                            dtype=kernel.dtype),
+                            stride, padding, out_dtype)
+        sch = _get_schedule_NCHWc(wkl, layout, out_layout)
     return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel)
 
 
@@ -289,10 +392,6 @@ def traverse(op):
 def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding,
                           layout, out_layout, outs):
     """Create schedule for tensors"""
-    _AVX_SCH_TO_SCH_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc,
-        AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc
-    }
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
 
@@ -317,15 +416,33 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
+            _AVX_SCH_TO_SCH_FUNC = {
+                AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc,
+                AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc
+            }
+
+            # Use int8 schedules if the input data is of int8 dtype
+            if data.dtype == 'uint8':
+                _AVX_SCH_TO_SCH_FUNC = {
+                    AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8,
+                    AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8
+                }
+
             n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
             ic = ic_chunk * ic_block
-            original_data = tvm.placeholder((n, ic, h, w), dtype=conv_out.dtype)
+            original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype)
 
             kh, kw = kernel_size
-            original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=conv_out.dtype)
+            original_kernel = tvm.placeholder((num_filter, ic, kh, kw),
+                                              dtype=kernel.dtype)
 
-            wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype)
-            sch = _get_schedule_NCHWc(wkl, layout, out_layout)
+            if data.dtype == 'uint8':
+                wkl = _get_workload_int8(original_data, original_kernel,
+                                         stride, padding, conv_out.dtype)
+                sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
+            else:
+                wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype)
+                sch = _get_schedule_NCHWc(wkl, layout, out_layout)
             _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec,
                                             kernel, conv_out, outs[0])
 
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index 7d820701e1f4..bace7451d665 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -3,11 +3,14 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
+import topi
 
 from ..util import get_const_tuple
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .check_targets import check_skylake
 
 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])
 
@@ -229,3 +232,117 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
         s[O].parallel(parallel_axis)
 
     return s
+
+
+def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
+    """ Declaration for int8 conv"""
+    out_dtype = wkl.out_dtype
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+
+    batch_size = data.shape[0]
+    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
+    else:
+        data_pad = data
+
+    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
+
+    # Intel performs dot product of 2 "4" Int8 values
+    n_elems = 4
+    assert sch.ic_bn%n_elems == 0
+    ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
+    ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+
+    # Reshaping kernel as the last 2 dimensions are 1x1 (k_h x k_w)
+    k_shape = kernel.shape
+    kernel = topi.reshape(kernel, (k_shape[0], k_shape[1], k_shape[2], k_shape[3],
+                                   k_shape[4] * k_shape[5] * k_shape[6]))
+
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR,
+                                        ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
+                               kernel[oc_chunk, ic_outer, ic_f_inner,
+                                      oc_block, ic_s_inner].astype(out_dtype),
+                               axis=[ic_outer, ic_f_inner, ic_s_inner]),
+                       name='conv2d_NCHWc_int8',
+                       tag="conv2d_NCHWc_int8")
+
+
+    return conv
+
+
+def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
+    """
+    Defines the schedule for INT8 for intel machines
+    Uses the Intel intrinsics to use INT8 operations
+    More details - https://software.intel.com/en-us/articles/
+    lower-numerical-precision-deep-learning-inference-and-training
+    """
+
+    target = tvm.target.current_target(allow_none=False)
+    int32_lanes = -1
+    if check_skylake(target):
+        int32_lanes = 16
+    else:
+        return s
+    assert int32_lanes != -1
+
+    # schedule data
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        parallel_axis = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(parallel_axis)
+
+    C, O = conv_out, last
+    CC = s.cache_write(C, 'global')
+
+    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
+    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
+    ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor)
+    s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
+    s[C].vectorize(oc_block)
+
+    parallel_axis = s[C].fuse(oc_chunk, oh_outer)
+    s[CC].compute_at(s[C], parallel_axis)
+    if C == O:
+        s[C].parallel(parallel_axis)
+
+    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+    ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
+
+    # Skylake and future processors have 16 vector lanes
+    assert sch.oc_bn % int32_lanes == 0
+
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
+
+    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+
+    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner,
+                  ow_inner, oc_f_inner, oc_s_inner, ic_s_inner)
+    s[CC].fuse(oc_chunk, oh_outer)
+
+    pc = dot_16x1x16_int8_int8_int32()
+    s[CC].tensorize(oc_s_inner, pc)
+    s[CC].unroll(ow_inner)
+    s[CC].unroll(oh_inner)
+
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
+        ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+        s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
+
+        parallel_axis = s[O].fuse(oc_chunk, oh_outer)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
+
+    return s
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index 8f8086fdebb4..0d7aba23d236 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,6 +8,8 @@
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .check_targets import check_skylake
 
 AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
 
@@ -252,3 +254,124 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
         s[O].parallel(parallel_axis)
 
     return s
+
+
+def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
+    """
+    This function sets up the compute for INT8 conv 2d
+    Inputs are in INT8 datatype
+    Output is in INT32 datatype
+    """
+
+    out_dtype = wkl.out_dtype
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+
+    batch_size = data.shape[0]
+    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+
+    # pack data
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
+    else:
+        data_pad = data
+
+    # convolution
+    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
+    kh = tvm.reduce_axis((0, wkl.hkernel), name='kh')
+    kw = tvm.reduce_axis((0, wkl.wkernel), name='kw')
+
+    # Intel performs dot product of 2 "4" Int8 values
+    # Current implementation requires ic_bn to be a multiple of 4
+    n_elems = 4
+    assert sch.ic_bn%n_elems == 0
+
+    ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
+    ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
+                                        ic_f_inner * n_elems +  ic_s_inner].astype(out_dtype) *
+                               kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner,
+                                      oc_block, ic_s_inner].astype(out_dtype),
+                               axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
+                       name='conv2d_NCHWc_int8',
+                       tag="conv2d_NCHWc_int8")
+    return conv
+
+def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
+    """
+    Defines the schedule for INT8 for intel machines
+    Uses the Intel intrinsics to use INT8 operations
+    More details - https://software.intel.com/en-us/articles/
+    lower-numerical-precision-deep-learning-inference-and-training
+    """
+
+    # Currently INT8 operations are supported for only Skylake
+    # In future the _intrin_reduce4int8 will be updated for VNNI instructions
+    # In case of unsupported target, the schedule will go to the original
+    # compute
+
+    target = tvm.target.current_target(allow_none=False)
+    int32_lanes = -1
+    if check_skylake(target):
+        int32_lanes = 16
+    else:
+        return s
+    assert int32_lanes != -1
+
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, _ = s[A].op.axis
+        parallel_axis = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(parallel_axis)
+
+    # schedule 5-D NCHW[x]c conv
+    C, O = conv_out, last
+    CC = s.cache_write(C, 'global')
+
+    _, oc_chunk, oh, ow, oc_block = s[C].op.axis
+    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    parallel_axis = s[C].fuse(oc_chunk, oh)
+    s[C].vectorize(oc_block)
+    if C == O:
+        s[C].parallel(parallel_axis)
+
+    s[CC].compute_at(s[C], ow_chunk)
+    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
+
+    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
+
+    # Skylake and future processors have 16 vector lanes
+    assert sch.oc_bn % int32_lanes == 0
+
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
+
+    if sch.unroll_kw:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw,
+                      ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
+        s[CC].unroll(kw)
+    else:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner,
+                      ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
+
+
+    pc = dot_16x1x16_int8_int8_int32()
+    s[CC].tensorize(oc_s_inner, pc)
+    s[CC].unroll(ow_block)
+    s[CC].unroll(oc_f_inner)
+
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
+        s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+        parallel_axis = s[O].fuse(oc_chunk, oh)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
+
+    return s
diff --git a/topi/python/topi/x86/tensor_intrin.py b/topi/python/topi/x86/tensor_intrin.py
new file mode 100644
index 000000000000..28e57f1c10f8
--- /dev/null
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -0,0 +1,84 @@
+"""Core kernel of dot product of 4 Int8 operations"""
+#pylint: disable=invalid-name
+import tvm
+
+
+def dot_16x1x16_int8_int8_int32():
+    """
+    Int8 dot product by every 4 elements using AVX2 Skylake instructions.
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,),
+                    lambda i: tvm.sum(data[k].astype('int32') *
+                                      kernel[i, k].astype('int32'),
+                                      axis=k),
+                    name="C")
+
+    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
+                               offset_factor=1,
+                               strides=[1])
+    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
+                               offset_factor=1,
+                               strides=[tvm.var('ldw'), 1])
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.ir_builder.create()
+            if index == 1:
+                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
+                return ib.get()
+
+            a_int8 = ins[0].vload([0], "uint8x4")
+            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
+            vec_ai32 = re_int32.astype('int32x16')
+            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
+            vec_b = ins[1].vload([0, 0], "int8x64")
+            vec_one = tvm.const(1, "int16x32")
+            pair_reduction = tvm.call_llvm_intrin('int16x32',
+                                                  'llvm.x86.avx512.pmaddubs.w.512',
+                                                  tvm.const(0, 'uint32'),
+                                                  vec_a, vec_b)
+            quad_reduction = tvm.call_llvm_intrin('int32x16',
+                                                  'llvm.x86.avx512.pmaddw.d.512',
+                                                  tvm.const(0, 'uint32'),
+                                                  pair_reduction, vec_one)
+            if index == 0:
+                ib.emit(outs[0].vstore(0, quad_reduction))
+            else:
+                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    with tvm.build_config(offset_factor=1, partition_const_loop=True):
+        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
diff --git a/topi/recipe/conv/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py
new file mode 100644
index 000000000000..863b3a6a41ab
--- /dev/null
+++ b/topi/recipe/conv/test_conv_int8_intel.py
@@ -0,0 +1,149 @@
+#pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return
+""" Conv Int8 functional and performance testing"""
+import sys
+import logging
+import numpy as np
+import tvm
+import topi
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+LOGGER = logging.getLogger('test_conv_int8_intel')
+LOGGER.disabled = False
+
+# All the WORKLOADS from Resnet except first layer
+# Workload is ['height', 'width', 'in_filter', 'out_filter',
+#              'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+WORKLOADS = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
+             (56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
+             (56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
+             (56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
+             (28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
+             (28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
+             (28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
+             (14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
+             (14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
+             (14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
+             (7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
+             (56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
+             (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
+             (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
+             (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
+             (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
+             (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
+             (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
+             (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
+             (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
+             (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
+             (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1)
+            ]
+
+
+TARGET_NAME = 'llvm -mcpu=skylake-avx512'
+NUM_VEC_LANES = 16
+CTX = tvm.context(TARGET_NAME, 0)
+
+def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
+              hstride, wstride, out_dtype):
+    """
+    Finds out the shape of all data structures
+    """
+    ## Find shapes
+    data_shape = (1, in_filter//NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
+
+    if out_dtype == 'int32':
+        if k_h != 1:
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
+        else:
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES//4,
+                            NUM_VEC_LANES, 4, k_h, k_w)
+    elif out_dtype == 'float32':
+        if k_h != 1:
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES, NUM_VEC_LANES)
+        else:
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES,
+                            NUM_VEC_LANES, k_h, k_w)
+    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
+    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
+    o_shape = (1, out_filter//NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
+    return (data_shape, kernel_shape, o_shape)
+
+
+
+def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
+                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
+    """
+    Runs the inference and checks the functional correctness between
+    compute and schedule outputs
+    """
+    (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
+                                                    out_filter, k_h, k_w, hpad, wpad,
+                                                    hstride, wstride, out_dtype)
+
+    # Create TVM placeholders
+    data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
+    kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)
+
+    # Create the numpy arrays to be used for executing conv models
+    if data_dtype == 'float32':
+        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
+        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
+    else:
+        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
+        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
+
+    # c_orig will be used for declaration ouptut
+    # c_sch will be used for scheduled computation output
+    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+
+
+    with tvm.target.create(TARGET_NAME):
+        conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter,
+                                    kernel_size=(k_h, k_w), stride=hstride,
+                                    padding=hpad, layout='NCHWc',
+                                    out_layout='NCHWc', out_dtype=out_dtype)
+        out = topi.nn.relu(conv)
+        sch = tvm.create_schedule(out.op)
+        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
+        func(data_array, kernel_array, c_orig)
+        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))
+
+        # Generate and run the optimized schedule
+        sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter,
+                                                      kernel_size=(k_h, k_w),
+                                                      strides=hstride,
+                                                      padding=hpad,
+                                                      layout='NCHWc',
+                                                      out_layout='NCHWc',
+                                                      outs=[out])
+        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
+        func(data_array, kernel_array, c_sch)
+
+        # Functional check
+        if data_dtype == 'uint8':
+            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
+        else:
+            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())
+
+        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
+        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
+        return evaluator(data_array, kernel_array, c_sch).mean
+
+if __name__ == "__main__":
+    LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup")
+    SPEEDUP_ARRAY = []
+    for i, wkl in enumerate(WORKLOADS):
+        fp32_time = run_inference('float32', 'float32', 'float32', *wkl)
+        int8_time = run_inference('uint8', 'int8', 'int32', *wkl)
+        kernel_h = wkl[4]
+        kernel_w = wkl[5]
+        LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", "
+                    + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time))
+
+        SPEEDUP_ARRAY.append(fp32_time/int8_time)
+    LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))))

From e2617bd4222cf7e3e0b156a797326269f60c5eb5 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 25 Sep 2018 12:15:15 -0700
Subject: [PATCH 131/529] [RUNTIME] Use weak link for fp16 functions  (#1769)

---
 src/runtime/builtin_fp16.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
index c259399e05e9..79c3cc474269 100644
--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
@@ -5,17 +5,16 @@
 */
 
 #include <builtin_fp16.h>
+#include <tvm/runtime/c_runtime_api.h>
 
-namespace tvm {
-namespace runtime {
+extern "C" {
 
-extern "C"  uint16_t __gnu_f2h_ieee(float a) {
+TVM_WEAK uint16_t __gnu_f2h_ieee(float a) {
   return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
 }
 
-extern "C" float __gnu_h2f_ieee(uint16_t a) {
+TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
   return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
 }
 
-}  // namespace runtime
-}  // namespace tvm
+}

From a22ba5183ac4cf57f54c3a1ae6d73b62dca7dcba Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Tue, 25 Sep 2018 21:26:40 -0700
Subject: [PATCH 132/529] [Relay] Restore kind checking (#1758)

---
 python/tvm/relay/__init__.py          |  1 +
 python/tvm/relay/ir_pass.py           |  2 +
 python/tvm/relay/ty.py                | 22 +++++++-
 python/tvm/relay/ty.pyi               | 21 +++++++
 src/relay/pass/kind_check.cc          | 69 +++++++++++++++++++++++
 tests/python/relay/test_check_kind.py | 79 +++++++++++++++++++++++++++
 tests/python/relay/test_ir_nodes.py   | 14 ++++-
 7 files changed, 205 insertions(+), 3 deletions(-)
 create mode 100644 tests/python/relay/test_check_kind.py

diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 18a53be92815..7007028af6c7 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -15,6 +15,7 @@
 
 # Type
 Type = ty.Type
+TupleType = ty.TupleType
 TensorType = ty.TensorType
 Kind = ty.Kind
 TypeParam = ty.TypeParam
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 84189c840d71..8a9612420327 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -12,3 +12,5 @@
 check_expr = _ir_pass.check_expr
 
 well_formed = _ir_pass.well_formed
+
+check_kind = _ir_pass.check_kind
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 10e267a53977..d2a256e77f5b 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -21,7 +21,6 @@ def same_as(self, other):
         """Compares two Relay types by referential equality."""
         return super().__eq__(other)
 
-
 @register_relay_node
 class TensorType(Type):
     """A concrete TensorType in Relay, see tvm/relay/type.h for more details.
@@ -94,6 +93,27 @@ class TypeConstraint(Type):
     pass
 
 
+@register_relay_node
+class TupleType(Type):
+    """A tuple type in Relay, see tvm/relay/type.h for more details.
+
+    Lists the type of each field in the tuple.
+    """
+
+    def __init__(self, fields):
+        """Constructs a tuple type
+
+        Parameters
+        ----------
+        fields: list of tvm.Type
+
+        Returns
+        -------
+        tuple_type: the tuple type
+        """
+        self.__init_handle_by_constructor__(_make.TupleType, fields)
+
+
 @register_relay_node
 class FuncType(Type):
     """A function type in Relay, see tvm/relay/type.h for more details.
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
index 0581847598d4..1aba99e42a27 100644
--- a/python/tvm/relay/ty.pyi
+++ b/python/tvm/relay/ty.pyi
@@ -94,6 +94,27 @@ class TypeConstraint(Type):
     pass
 
 
+@register_relay_node
+class TupleType(Type):
+    """A tuple type in Relay, see tvm/relay/type.h for more details.
+
+    Lists the type of each field in the tuple.
+    """
+
+    def __init__(self, fields):
+        """Constructs a tuple type
+
+        Parameters
+        ----------
+        fields: list of tvm.Type
+
+        Returns
+        -------
+        tuple_type: the tuple type
+        """
+        self.__init_handle_by_constructor__(_make.TupleType, fields)
+
+
 @register_relay_node
 class FuncType(Type):
     """A function type in Relay, see tvm/relay/type.h for more details.
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 91d2d5822110..83f52d8873e3 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -20,12 +20,72 @@ namespace tvm {
 namespace relay {
 
 using namespace tvm::runtime;
+using Kind = TypeParamNode::Kind;
 
 struct KindChecker : TypeVisitor<> {
   bool valid;
 
   KindChecker() : valid(true) {}
 
+  // checks if t is an incomplete node of kind k or a type param of kind k
+  bool MatchKind(const Type& t, Kind k) {
+    if (const IncompleteTypeNode *tv = t.as<IncompleteTypeNode>()) {
+      return tv->kind == k;
+    }
+
+    if (const TypeParamNode *tp = t.as<TypeParamNode>()) {
+      return tp->kind == k;
+    }
+
+    return false;
+  }
+
+  bool IsTypeKind(const Type& t) {
+    if (MatchKind(t, Kind::kType)) {
+      return true;
+    }
+
+    return t.as<TensorTypeNode>() || t.as<BaseTensorTypeNode>()
+      || t.as<TupleTypeNode>() || t.as<FuncTypeNode>();
+  }
+
+  void VisitType_(const TupleTypeNode* op) override {
+    // tuples should only contain normal types
+    for (const Type& t : op->fields) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* op) override {
+    // func types should only take normal types for arguments
+    // and only return a normal type
+    for (const Type& t : op->arg_types) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+
+    this->VisitType(op->ret_type);
+    valid = valid && IsTypeKind(op->ret_type);
+  }
+
+  void VisitType_(const TypeRelationNode* op) override {
+    // arguments to type relation should be normal types
+    for (const Type& t : op->args) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+  }
+
   bool Check(const Type &t) {
     this->VisitType(t);
     return valid;
@@ -37,5 +97,14 @@ bool KindCheck(const Environment& env, const Type &t) {
   return kc.Check(t);
 }
 
+TVM_REGISTER_API("relay._ir_pass.check_kind")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      if (args.size() == 1) {
+        *ret = KindCheck(EnvironmentNode::make({}), args[0]);
+      } else {
+        *ret = KindCheck(args[0], args[1]);
+      }
+    });
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_check_kind.py b/tests/python/relay/test_check_kind.py
new file mode 100644
index 000000000000..413e6d7051d6
--- /dev/null
+++ b/tests/python/relay/test_check_kind.py
@@ -0,0 +1,79 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import check_kind
+
+def test_tuple_kinds():
+    # only contain type kinds
+    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
+    fields = tvm.convert([tp, tf, tt])
+
+    tup_ty = relay.TupleType(fields)
+    assert check_kind(tup_ty)
+
+def test_func_kind():
+    # only contain type kinds
+    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
+    tp2 = relay.TypeParam('tp2', relay.Kind.Type)
+
+    shape = tvm.convert([1, 2, 3])
+    dtype = 'float32'
+    tensor_type = relay.TensorType(shape, dtype)
+
+    type_params = tvm.convert([tp1, tp2])
+    type_constraints = tvm.convert([])
+    arg_types = tvm.convert([tp1, tensor_type])
+    ret_type = relay.TupleType(tvm.convert([tp2, tensor_type]))
+
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert check_kind(tf)
+
+def test_invalid_tuple_kinds():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    fields = tvm.convert([tp1, tp2, tp3])
+
+    tup_ty = relay.TupleType(fields)
+    assert not check_kind(tup_ty)
+
+def test_invalid_func_kind():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+
+    type_params = tvm.convert([tp1, tp2, tp3])
+    type_constraints = tvm.convert([])
+    arg_types = tvm.convert([tp1, tp2])
+    ret_type = tp3
+
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert not check_kind(tf)
+
+def test_func_with_invalid_ret_type():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
+    tp2 = relay.TypeParam('tp2', relay.Kind.Shape)
+    tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
+
+def test_func_with_invalid_arg_types():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeParam('tp2', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
+
+def test_func_with_invalid_tuple():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+
+    ret_type = relay.TupleType(tvm.convert([tp1, tp1, tp1]))
+
+    tf = relay.FuncType(tvm.convert([]), ret_type, tvm.convert([tp1]), tvm.convert([]))
+    assert not check_kind(tf)
+
+def test_tuple_with_invalid_func():
+    tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tf = relay.FuncType(tvm.convert([]), tp1, tvm.convert([tp1]), tvm.convert([]))
+
+    tup_ty = relay.TupleType(tvm.convert([tensor_type, tf]))
+    assert not check_kind(tup_ty)
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index 803b3d0faa0c..fc5d8ee0777d 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -28,8 +28,8 @@ def test_tensor_type():
 
 def test_type_param():
     tp = relay.TypeParam('name', relay.Kind.Shape)
-    tp.kind == relay.Kind.Shape
-    tp.span  # TODO allow us to set span
+    assert tp.kind == relay.Kind.Shape
+    # assert tp.span  # TODO allow us to set span
     str(tp)
 
 
@@ -48,6 +48,16 @@ def test_func_type():
     str(tf)
 
 
+def test_tuple_type():
+    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    fields = tvm.convert([tp, tf, tt])
+
+    tup_ty = relay.TupleType(fields)
+    assert tup_ty.fields == fields
+
+
 def test_constant():
     arr = tvm.nd.array(10)
     const = relay.Constant(arr)

From ccda83014cc86df56235e074c8772b5edcd5b31a Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Thu, 27 Sep 2018 05:02:27 +0300
Subject: [PATCH 133/529] [TOPI] Access topi::matmul from Python (#1744)

---
 nnvm/src/top/tensor/matrix_op.cc      |  2 +-
 python/tvm/api.py                     | 12 ++++----
 python/tvm/tag.py                     | 21 ++++++++++---
 topi/include/topi/nn.h                | 31 -------------------
 topi/include/topi/transform.h         | 31 +++++++++++++++++++
 topi/python/topi/reduction.py         |  8 -----
 topi/python/topi/tensor.py            |  5 ---
 topi/python/topi/transform.py         | 22 ++++++++++++--
 topi/src/topi.cc                      |  9 ++++++
 topi/tests/python/test_topi_matmul.py | 44 +++++++++++++++++++++++++++
 10 files changed, 126 insertions(+), 59 deletions(-)
 create mode 100644 topi/tests/python/test_topi_matmul.py

diff --git a/nnvm/src/top/tensor/matrix_op.cc b/nnvm/src/top/tensor/matrix_op.cc
index c881e683a6c5..de95eddee1f6 100644
--- a/nnvm/src/top/tensor/matrix_op.cc
+++ b/nnvm/src/top/tensor/matrix_op.cc
@@ -3,7 +3,7 @@
  * \file matrix_op.cc
  * \brief Matrix operators
  */
-#include <topi/nn.h>
+#include <topi/transform.h>
 #include <nnvm/op.h>
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 223e73eeb596..34fe2ba49dc8 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -238,10 +238,10 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
     tensor: Tensor
         The created tensor
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
     ndim = len(shape)
     code = fcompute.__code__
@@ -311,10 +311,10 @@ def scan(init, update, state_placeholder, inputs=None, name="scan", tag="", attr
       s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
       res = tvm.scan(s_init, s_update, s_state, X)
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     if isinstance(init, _tensor.Tensor):
         init = [init]
     if isinstance(update, _tensor.Tensor):
@@ -407,10 +407,10 @@ def extern(shape,
                           "tvm.contrib.cblas.matmul",
                             ins[0], ins[1], outs[0], 0, 0), name="C")
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     shape = (shape,) if isinstance(shape, (_expr.Expr, _Integral)) else shape
     shape = [shape] if isinstance(shape[0], (_expr.Expr, _Integral)) else shape
     if in_buffers is not None:
diff --git a/python/tvm/tag.py b/python/tvm/tag.py
index 5f6091a80a17..de9f8403de2a 100644
--- a/python/tvm/tag.py
+++ b/python/tvm/tag.py
@@ -1,25 +1,36 @@
 """Tag class for TVM operators."""
+import warnings
 from ._ffi.base import decorate
 
 class TagScope(object):
     """Tag scope object to set tag for operators, working as context
     manager and decorator both. See also tag_scope.
     """
-    current = None
+    _current = None
+
+    @classmethod
+    def get_current(cls):
+        if cls._current:
+            cls._current.accessed = True
+        return cls._current
+
     def __init__(self, tag):
         self._old_scope = None
         self.tag = tag
+        self.accessed = False
 
     def __enter__(self):
-        if TagScope.current is not None:
+        if TagScope._current is not None:
             raise ValueError("nested op_tag is not allowed for now")
-        self._old_scope = TagScope.current
-        TagScope.current = self
+        self._old_scope = TagScope._current
+        TagScope._current = self
         return self
 
     def __exit__(self, ptype, value, trace):
         assert self._old_scope is None
-        TagScope.current = self._old_scope
+        if not self.accessed:
+            warnings.warn("Tag '%s' declared via TagScope was not used." % (self.tag,))
+        TagScope._current = self._old_scope
 
     def __call__(self, fdecl):
         def tagged_fdecl(func, *args, **kwargs):
diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index 4a537a646425..dbbfecbcc28d 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -200,37 +200,6 @@ inline tvm::Tensor pad(const tvm::Tensor& t,
   return tvm::compute(output_shape, l, name, tag);
 }
 
-/*!
- * \brief Creates an operation that calculates a matrix multiplication
- *  (row-major notation):
- *      A(i, k) * B(k, j), if trans_a == trans_b
- *          the usual transposed combinations, otherwise
- *
- * \param A The matrix A
- * \param B The matrix B
- * \param trans_a Is A's layout transposed?
- * \param trans_b Is B's layout transposed?
- * \param name The name of the operation
- * \param tag The tag to mark the operation
- *
- * \return A Tensor whose op member is the matmul operation
- */
-inline tvm::Tensor matmul(const tvm::Tensor& A,
-                           const tvm::Tensor& B,
-                           bool trans_a = false,
-                           bool trans_b = false,
-                           std::string name = "tensor",
-                           std::string tag = kMatMul) {
-  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
-                                     B->shape[trans_b ? 0 : 1]};
-  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
-  auto l = [&](tvm::Var i, tvm::Var j) {
-    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]),
-                    {k});
-  };
-  return tvm::compute(output_shape, l, name, tag);
-}
-
 /*!
  * \brief Creates an operation that performs a 2-D convolution with an
  * NCHW-layout
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 245b38cfb63d..6dbdbe5574f4 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -627,6 +627,37 @@ inline Tensor where(const Tensor& condition,
   return out;
 }
 
+/*!
+ * \brief Creates an operation that calculates a matrix multiplication
+ *  (row-major notation):
+ *      A(i, k) * B(k, j), if trans_a == trans_b
+ *          the usual transposed combinations, otherwise
+ *
+ * \param A The matrix A
+ * \param B The matrix B
+ * \param trans_a Is A's layout transposed?
+ * \param trans_b Is B's layout transposed?
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the matmul operation
+ */
+inline tvm::Tensor matmul(const tvm::Tensor& A,
+                           const tvm::Tensor& B,
+                           bool trans_a = false,
+                           bool trans_b = false,
+                           std::string name = "tensor",
+                           std::string tag = kMatMul) {
+  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
+                                     B->shape[trans_b ? 0 : 1]};
+  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
+  auto l = [&](tvm::Var i, tvm::Var j) {
+    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]),
+                    {k});
+  };
+  return tvm::compute(output_shape, l, name, tag);
+}
+
 
 }  // namespace topi
 #endif  // TOPI_TRANSFORM_H_
diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py
index 52121a506f43..7a4f161a8fee 100644
--- a/topi/python/topi/reduction.py
+++ b/topi/python/topi/reduction.py
@@ -1,9 +1,7 @@
 # pylint: disable=redefined-builtin,consider-using-enumerate,no-member
 """Reduce operators"""
 from __future__ import absolute_import as _abs
-import tvm
 from . import cpp
-from . import tag
 
 def _get_real_axis(ndim, axis):
     if axis is None:
@@ -26,7 +24,6 @@ def _get_real_axis(ndim, axis):
     return real_axis
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def sum(data, axis=None, keepdims=False):
     """Sum of array elements over a given axis or a list of axes
 
@@ -52,7 +49,6 @@ def sum(data, axis=None, keepdims=False):
     return cpp.sum(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def max(data, axis=None, keepdims=False):
     """Maximum of array elements over a given axis or a list of axes
 
@@ -78,7 +74,6 @@ def max(data, axis=None, keepdims=False):
     return cpp.max(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def min(data, axis=None, keepdims=False):
     """Minimum of array elements over a given axis or a list of axes
 
@@ -104,7 +99,6 @@ def min(data, axis=None, keepdims=False):
     return cpp.min(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
 def argmax(data, axis=None, keepdims=False):
     """Returns the indices of the maximum values along an axis.
 
@@ -130,7 +124,6 @@ def argmax(data, axis=None, keepdims=False):
     return cpp.argmax(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
 def argmin(data, axis=None, keepdims=False):
     """Returns the indices of the minimum values along an axis.
 
@@ -156,7 +149,6 @@ def argmin(data, axis=None, keepdims=False):
     return cpp.argmin(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def prod(data, axis=None, keepdims=False):
     """Product of array elements over a given axis or a list of axes
 
diff --git a/topi/python/topi/tensor.py b/topi/python/topi/tensor.py
index 6fcddedbe445..06f23bbe7703 100644
--- a/topi/python/topi/tensor.py
+++ b/topi/python/topi/tensor.py
@@ -1,11 +1,8 @@
 # pylint: disable=invalid-name,consider-using-enumerate,unused-argument,len-as-condition
 """Elementwise operators"""
 from __future__ import absolute_import as _abs
-import tvm
 from . import cpp
-from . import tag
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def elemwise_sum(xs):
     """Perform element-wise sum on inputs
 
@@ -22,7 +19,6 @@ def elemwise_sum(xs):
     return cpp.elemwise_sum(xs)
 
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def full(shape, dtype, fill_value):
     """Fill tensor with fill_value
 
@@ -43,7 +39,6 @@ def full(shape, dtype, fill_value):
     return cpp.full(shape, dtype, fill_value)
 
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def full_like(x, fill_value):
     """Construct a tensor with same shape as input tensor,
        then fill tensor with fill_value.
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index 2ad01852c8b9..eb3f9bad1095 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -111,7 +111,6 @@ def _compute(*indices):
         return a(*idx)
     return tvm.compute(new_shape, _compute)
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def flip(a, axis=0):
     """Flip/reverse elements of an array in a particular axis.
 
@@ -129,7 +128,6 @@ def flip(a, axis=0):
     """
     return cpp.flip(a, axis)
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def strided_slice(a, begin, end, strides=None):
     """Slice of an array.
 
@@ -315,7 +313,6 @@ def _compute(begin, *indices):
     # pylint: enable=cell-var-from-loop
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def take(a, indices, axis=None):
     """Take elements from an array along an axis.
 
@@ -338,3 +335,22 @@ def take(a, indices, axis=None):
     if axis is None:
         return cpp.take(a, indices)
     return cpp.take(a, indices, int(axis))
+
+def matmul(a, b, transp_a=False, transp_b=False):
+    """
+    Creates an operation that calculates a matrix multiplication (row-major notation):
+        A(i, k) * B(k, j)
+    if trans_a == trans_b, the usual transposed combinations, otherwise
+
+    Parameters
+    ----------
+    a : The matrix A
+    b : The matrix B
+    trans_a : Is A's layout transposed?
+    trans_b : Is B's layout transposed?
+
+    Returns
+    -------
+    A Tensor whose op member is the matmul operation
+    """
+    return cpp.matmul(a, b, transp_a, transp_b)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index fef2487e6770..ae1ad57551cb 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -292,6 +292,15 @@ TVM_REGISTER_GLOBAL("topi.where")
   *rv = where(args[0], args[1], args[2]);
 });
 
+TVM_REGISTER_GLOBAL("topi.matmul")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  switch ( args.size() ) {
+    case 2: *rv = matmul(args[0], args[1]); break;
+    case 3: *rv = matmul(args[0], args[1], args[2]); break;
+    case 4: *rv = matmul(args[0], args[1], args[2], args[3]); break;
+    default: CHECK(0) << "topi.matmul expects 2, 3 or 4 arguments";
+  }});
+
 TVM_REGISTER_GLOBAL("topi.strided_slice")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = strided_slice(args[0], args[1], args[2], args[3]);
diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py
new file mode 100644
index 000000000000..a2902e17d40b
--- /dev/null
+++ b/topi/tests/python/test_topi_matmul.py
@@ -0,0 +1,44 @@
+import numpy as np
+import tvm
+import topi
+from topi.util import get_const_tuple
+
+def with_tvm(lam, *args):
+    """ Take numpy arrays as args, convert them to TVM tensors and call `lam`.
+    Result of lambda is converted back to numpy array and returned.
+    """
+    ctx = tvm.cpu(0)
+    pls = []     # placeholders
+    vals_nd = [] # initial values
+    for i,arg in enumerate(args):
+        pls.append(tvm.placeholder(arg.shape, name='pl'+str(i)))
+        vals_nd.append(tvm.nd.array(arg, ctx))
+
+    out = lam(*pls)
+    out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx)
+    s = tvm.create_schedule([out.op])
+    m = tvm.build(s, pls + [out], "llvm")
+    m(*(vals_nd+[out_nd]))
+    return out_nd.asnumpy()
+
+def verify_matmul(sa, sb, transp_a, transp_b):
+    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
+    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
+    c1 = np.matmul(np.transpose(a) if transp_a else a,
+                   np.transpose(b) if transp_b else b)
+    c2 = with_tvm(lambda A,B: topi.matmul(A,B,transp_a,transp_b), a,b)
+    np.testing.assert_allclose(c1, c2, rtol=1e-5)
+
+def test_matmul():
+    verify_matmul((1,1),(1,1),False,False)
+    verify_matmul((1,1),(1,1),True,True)
+    verify_matmul((2,2),(2,2),False,False)
+    verify_matmul((2,2),(2,2),True,True)
+    verify_matmul((2,3),(3,5),False,False)
+    verify_matmul((5,3),(3,2),False,False)
+    verify_matmul((3,5),(3,2),True,False)
+    verify_matmul((3,5),(2,3),True,True)
+
+if __name__ == "__main__":
+    test_matmul()
+

From 894518b6e2cf437600423cacb0fd1ed56bbcb849 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Fri, 28 Sep 2018 01:58:58 +0530
Subject: [PATCH 134/529] [ONNX][FRONTEND] Constantfill - #1539 (#1764)

---
 nnvm/python/nnvm/frontend/onnx.py             | 42 ++++++++++++++++++-
 .../python/frontend/onnx/test_forward.py      | 33 +++++++++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 1584c960aeb4..f89a94d1b3c8 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -611,6 +611,46 @@ def _impl_v1(cls, inputs, attr, params):
                 'axis': ('axis', 1),
             })(inputs, attr, params)
 
+class ConstantFill(OnnxOpConverter):
+    """ Operator converter for ConstantFill.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        is_full = True
+        num_inputs = len(inputs)
+        if 'shape' in attr:
+            if num_inputs > 0:
+                raise ImportError(
+                    "Can't set shape and input tensor at a time")
+            shape = attr.pop('shape')
+        else:
+            if num_inputs == 0:
+                raise ImportError(
+                    "Either shape attribute or input should be set")
+            if 'input_as_shape' in attr and attr['input_as_shape']:
+                shape = params[inputs[0].list_output_names()[0]].asnumpy()
+            else:
+                is_full = False
+
+        if not is_full:
+            if 'extra_shape' in attr:
+                raise ImportError(
+                    "Extra Shape not supported with fill_like")
+
+            out = AttrCvt(
+                op_name='full_like',
+                transforms={'value': 'fill_value'},
+                ignores=['dtype'])(inputs, attr)
+            return _sym.cast(out, dtype=attr['dtype'].decode("utf-8"))
+        else:
+            if 'extra_shape' in attr:
+                shape = shape + attr.pop('extra_shape')
+
+            return AttrCvt(
+                op_name='full',
+                transforms={'value': 'fill_value'},
+                extras={'shape':shape})(inputs, attr)
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -628,7 +668,7 @@ def _get_convert_map(opset):
         'ThresholdedRelu': ThresholdedRelu.get_converter(opset),
         'ScaledTanh': ScaledTanh.get_converter(opset),
         'ParametricSoftplus': ParametricSoftPlus.get_converter(opset),
-        # 'ConstantFill'
+        'ConstantFill': ConstantFill.get_converter(opset),
         # 'GivenTensorFill'
         'FC': AttrCvt('dense', ignores=['axis', 'axis_w']),
         'Scale': Scale.get_converter(opset),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 645174d04fe2..187e6c175cd4 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -680,6 +680,38 @@ def test_forward_arg_min_max():
             verify_argmin([3,4,4], axis, keepdims)
             verify_argmax([3,4,4], axis, keepdims)
 
+def verify_constantfill(is_shape, input_dim, out_dim, value, dtype, **kwargs):
+    input_a = np.random.uniform(size=input_dim).astype(dtype)
+    out = np.empty(shape=out_dim, dtype=dtype)
+    out.fill(value)
+
+    if is_shape == True:
+        fill_node = helper.make_node("ConstantFill", [], ["out"], shape=input_dim, value=value, **kwargs)
+    else:
+        fill_node = helper.make_node("ConstantFill", ["input_a"], ["out"], value=value, dtype=dtype, **kwargs)
+
+    graph = helper.make_graph([fill_node],
+                              "fill_test",
+                              inputs = [helper.make_tensor_value_info("input_a",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(out.shape))])
+
+    model = helper.make_model(graph, producer_name='fill_test')
+
+    for target, ctx in ctx_list():
+        if is_shape == True:
+            tvm_out = get_tvm_output(model, [], target, ctx, out.shape)
+        else:
+            tvm_out = get_tvm_output(model, [input_a], target, ctx, out.shape)
+
+        np.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_constantfill():
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(False, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5, 4, 5, 6), 10, 'float32', extra_shape=(4, 5, 6))
+
 if __name__ == '__main__':
     # verify_super_resolution_example()
     # verify_squeezenet1_1()
@@ -704,3 +736,4 @@ def test_forward_arg_min_max():
     test_forward_hardsigmoid()
     test_forward_arg_min_max()
     test_softmax()
+    test_constantfill()

From fc3bab7991066efde806d0079f12a879c81217e9 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 27 Sep 2018 17:13:24 -0400
Subject: [PATCH 135/529] Adding source types to C++ reduce functions (#1771)

---
 src/lang/ir_operator.cc                   |  8 +++---
 tests/python/unittest/test_lang_tensor.py | 30 +++++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index 50e598d13dc2..5cad23e8ce57 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -9,7 +9,7 @@
 namespace tvm {
 
 Expr sum(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Add::make(x, y);
   Expr identity_element = make_zero(source.type());
   ir::CommReducer combiner =
@@ -18,7 +18,7 @@ Expr sum(Expr source, Array<IterVar> rdom) {
 }
 
 Expr max(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Max::make(x, y);
   Expr identity_element = source.type().min();
   ir::CommReducer combiner =
@@ -27,7 +27,7 @@ Expr max(Expr source, Array<IterVar> rdom) {
 }
 
 Expr min(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Min::make(x, y);
   Expr identity_element = source.type().max();
   ir::CommReducer combiner =
@@ -36,7 +36,7 @@ Expr min(Expr source, Array<IterVar> rdom) {
 }
 
 Expr prod(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Mul::make(x, y);
   Expr identity_element = make_one(source.type());
   ir::CommReducer combiner =
diff --git a/tests/python/unittest/test_lang_tensor.py b/tests/python/unittest/test_lang_tensor.py
index 1d8603dfc98b..f562a48e44ae 100644
--- a/tests/python/unittest/test_lang_tensor.py
+++ b/tests/python/unittest/test_lang_tensor.py
@@ -1,4 +1,5 @@
 import tvm
+from topi.nn.pooling import pool
 
 def test_tensor():
     m = tvm.var('m')
@@ -185,6 +186,34 @@ def test_tensor_inputs():
     assert tuple(y.op.input_tensors) == (x,)
 
 
+def test_tensor_pool():
+    def intrin_pool():
+        A = tvm.placeholder((64, 16, 16), name='A')
+        kh = tvm.reduce_axis((0, 3), name='kh')
+        kw = tvm.reduce_axis((0, 3), name='kw')
+        P = tvm.compute((64, 14, 14),
+                        lambda c, oh, ow: tvm.max(A[c, oh + kh, ow + kw],
+                                                  axis=[kh, kw]),
+                        name='p')
+
+        def intrin_func(ins, outs):
+            dinp = ins[0]
+            dout = outs[0]
+            return tvm.call_packed("op", dinp, dout)
+
+        with tvm.build_config(offset_factor=1):
+            return tvm.decl_tensor_intrin(P.op, intrin_func)
+
+    A = tvm.placeholder((1, 64, 16, 16), name='A')
+    P = pool(data=A, kernel=(3, 3), stride=(1, 1), padding=(0, 0, 0, 0),
+             pool_type='max')
+    s = tvm.create_schedule(P.op)
+    _, oh, _, _ = P.op.axis
+    intrin = intrin_pool()
+    s[P].tensorize(oh, intrin)
+    tvm.lower(s, [A, P])
+
+
 if __name__ == "__main__":
     test_rank_zero()
     test_tensor_inputs()
@@ -199,3 +228,4 @@ def test_tensor_inputs():
     test_extern_multi_out()
     test_tuple_inputs()
     test_tuple_with_different_deps()
+    test_tensor_pool()

From 5c4adbadc3697b1bb63dbeb08665f3923b695fdb Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Thu, 27 Sep 2018 21:25:11 -0700
Subject: [PATCH 136/529] [SGX] Improve edgeroutines (#1775)

---
 .gitignore                              |  1 +
 CMakeLists.txt                          |  4 +++-
 cmake/modules/SGX.cmake                 | 14 +++++++++++++-
 src/runtime/sgx/tvm.edl                 |  7 ++-----
 src/runtime/sgx/untrusted/sgx_module.cc | 23 ++++++++---------------
 5 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index 01f91d69874f..9d9faaf082ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,6 +183,7 @@ docs.tgz
 cat.png
 *.mlmodel
 tvm_u.*
+tvm_t.*
 # Mac OS X
 .DS_Store
 build*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e1d8bcc38fb..b6669dc3ce42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,7 +183,9 @@ add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
 add_library(tvm_topi SHARED ${TOPI_SRCS})
 add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
 if(NOT USE_SGX STREQUAL "OFF")
-  add_dependencies(tvm_runtime sgx_edl)
+  add_dependencies(tvm sgx_edl)
+  add_dependencies(tvm_runtime sgx_edl tvm_t)
+  install(TARGETS tvm_t ARCHIVE DESTINATION lib${LIB_SUFFIX})
 endif()
 add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
 
diff --git a/cmake/modules/SGX.cmake b/cmake/modules/SGX.cmake
index d7b8546d5d41..c9894de11f8b 100644
--- a/cmake/modules/SGX.cmake
+++ b/cmake/modules/SGX.cmake
@@ -3,6 +3,8 @@ if(NOT USE_SGX STREQUAL "OFF")
 
   set(_sgx_src ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/sgx)
   set(_tvm_u_h ${_sgx_src}/untrusted/tvm_u.h)
+  set(_tvm_t_h ${_sgx_src}/trusted/tvm_t.h)
+  set(_tvm_t_c ${_sgx_src}/trusted/tvm_t.c)
   set(_tvm_edl ${_sgx_src}/tvm.edl)
   set(_sgx_ustdc ${RUST_SGX_SDK}/sgx_ustdc)
 
@@ -11,13 +13,16 @@ if(NOT USE_SGX STREQUAL "OFF")
     set(_urts_lib "${_urts_lib}_sim")
   endif()
 
+  # build edge routines
   add_custom_command(
     OUTPUT ${_tvm_u_h}
     COMMAND ${USE_SGX}/bin/x64/sgx_edger8r --untrusted
-      --untrusted-dir ${_sgx_src}/untrusted
+      --untrusted --untrusted-dir ${_sgx_src}/untrusted
+      --trusted --trusted-dir ${_sgx_src}/trusted
       --search-path ${USE_SGX}/include --search-path ${RUST_SGX_SDK}/edl
       ${_tvm_edl}
     COMMAND sed -i "4i '#include <tvm/runtime/c_runtime_api.h>'" ${_tvm_u_h}
+    COMMAND sed -i "4i '#include <tvm/runtime/c_runtime_api.h>'" ${_tvm_t_h}
     DEPENDS ${_tvm_edl}
   )
   add_custom_command(
@@ -27,6 +32,13 @@ if(NOT USE_SGX STREQUAL "OFF")
   )
   add_custom_target(sgx_edl DEPENDS ${_tvm_u_h} ${_sgx_ustdc}/libsgx_ustdc.a)
 
+  # build trusted library
+  set_source_files_properties(${_tvm_t_c} PROPERTIES GENERATED TRUE)
+  add_library(tvm_t STATIC ${_tvm_t_c})
+  add_dependencies(tvm_t sgx_edl)
+  target_include_directories(tvm_t PUBLIC ${USE_SGX}/include ${USE_SGX}/include/tlibc)
+
+  # add untrusted runtime files
   include_directories(${USE_SGX}/include)
   file(GLOB RUNTIME_SGX_SRCS ${_sgx_src}/untrusted/*.c*)
   list(APPEND TVM_RUNTIME_LINKER_LIBS
diff --git a/src/runtime/sgx/tvm.edl b/src/runtime/sgx/tvm.edl
index 55c8a878d766..d46940ecefef 100644
--- a/src/runtime/sgx/tvm.edl
+++ b/src/runtime/sgx/tvm.edl
@@ -9,7 +9,8 @@ enclave {
                                           [in, count=num_args] const TVMValue* arg_values,
                                           [in, count=num_args] const int* type_codes,
                                           int num_args,
-                                          [isptr, user_check] TVMRetValueHandle ret);
+                                          [out] TVMValue* ret_val,
+                                          [out] int* ret_type_code);
     };
 
     untrusted {
@@ -19,10 +20,6 @@ enclave {
                                    int num_args,
                                    [out] TVMValue* ret_val,
                                    [out] int* ret_type_code);
-        void tvm_ocall_set_return([isptr, user_check] TVMRetValueHandle ret,
-                                   [in, count=num_ret] const TVMValue* value,
-                                   [in, count=num_ret] const int* type_code,
-                                   int num_ret);
         void tvm_ocall_register_export([in, string] const char* name, int func_id);
         void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment);
     };
diff --git a/src/runtime/sgx/untrusted/sgx_module.cc b/src/runtime/sgx/untrusted/sgx_module.cc
index b1c1692de398..2fef99df889f 100644
--- a/src/runtime/sgx/untrusted/sgx_module.cc
+++ b/src/runtime/sgx/untrusted/sgx_module.cc
@@ -110,15 +110,18 @@ class SGXModuleNode : public ModuleNode {
     int func_id = exported->second;
     return PackedFunc([this, func_id](TVMArgs args, TVMRetValue* rv) {
         sgx::EnclaveContext ctx(this);
+        TVMValue ret_value;
+        int ret_type_code;
         TVM_SGX_CHECKED_CALL(tvm_ecall_packed_func(eid_, func_id,
-              args.values, args.type_codes, args.num_args, rv));
+              args.values, args.type_codes, args.num_args, &ret_value, &ret_type_code));
+        *rv = TVMArgValue(ret_value, ret_type_code);
       });
   }
 
-  void RunWorkers(int num_tasks, void* tg) {
-    std::function<void(int)> runner = [this, tg](int _worker_id) {
+  void RunWorkers(int num_tasks) {
+    std::function<void(int)> runner = [this](int _worker_id) {
       this->GetFunction("__tvm_run_worker__",
-                        std::shared_ptr<SGXModuleNode>(nullptr))(tg);
+                        std::shared_ptr<SGXModuleNode>(nullptr))();
     };
     thread_group_.reset(new tvm::runtime::threading::ThreadGroup(
           num_tasks, runner, false /* include_main_thread */));
@@ -144,7 +147,7 @@ namespace sgx {
 
 TVM_REGISTER_GLOBAL("__sgx_thread_group_launch__")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-  EnclaveContext::GetModule()->RunWorkers(args[0], args[1]);
+  EnclaveContext::GetModule()->RunWorkers(args[0]);
 });
 
 TVM_REGISTER_GLOBAL("__sgx_thread_group_join__")
@@ -215,16 +218,6 @@ void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment) {
   return buf;
 }
 
-void tvm_ocall_set_return(TVMRetValueHandle ret,
-                           const TVMValue* value,
-                           const int* type_code,
-                           int num_ret) {
-  CHECK_EQ(num_ret, 1) << "Only one return value is currently supported.";
-  CHECK(type_code[0] != kStr) << "Return kBytes, not kStr.";
-  TVMRetValue* rv = static_cast<TVMRetValue*>(ret);
-  *rv = TVMArgValue(value[0], type_code[0]);
-}
-
 }  // extern "C"
 }  // namespace sgx
 

From 29269d6bf6fd578536a19bc4f752863938e55242 Mon Sep 17 00:00:00 2001
From: Inon S <InonS@users.noreply.github.com>
Date: Sat, 29 Sep 2018 02:50:48 +0300
Subject: [PATCH 137/529] Added Dockerfile demonstrating OpenCL & OpenGL
 installation (#1770)

---
 docker/Dockerfile.demo_opencl | 65 +++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 docker/Dockerfile.demo_opencl

diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl
new file mode 100644
index 000000000000..6d54325050ae
--- /dev/null
+++ b/docker/Dockerfile.demo_opencl
@@ -0,0 +1,65 @@
+# USAGE: sudo docker build libs/tvm -f libs/tvm/docker/Dockerfile.ocl -t l4b/tvm:ocl
+
+# REFERENCE: https://docs.docker.com/engine/reference/builder
+
+FROM ubuntu:18.04
+
+RUN echo "Labelling this image"
+LABEL Description="Docker image for TVM built with OpenCL & OpenGL support"
+
+RUN echo "Preparing to install dependencies"
+RUN apt-get update
+# ENV DEBIAN_FRONTEND noninteractive
+RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
+
+RUN echo "Installing utility libraries"
+RUN apt-get install -y apt-utils
+RUN apt-get install -y cmake g++ llvm
+RUN apt-get install -y git
+# make wget unzip libtinfo-dev libz-dev libcurl4-openssl-dev
+RUN apt-get install -y libopenblas-dev
+
+# RUN echo "Installing gtest"
+# RUN apt-get install -y libgtest-dev 
+# RUN cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib
+
+RUN echo "Installing Python"
+RUN apt-get install -y python3-dev python3-pip
+RUN pip3 install setuptools numpy nose-timer cython decorator scipy tornado psutil xgboost
+
+RUN echo "Installing Jupyter notebook"
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+RUN echo "Installing OpenCL libraries"
+RUN apt-get install -y libviennacl-dev mesa-opencl-icd ocl-icd-opencl-dev clinfo
+RUN apt-get install -y libclblas-dev libclfft-dev libclsparse-dev
+
+RUN echo "Installing OpenGL libraries"
+RUN apt-get install -y libcogl-dev libegl1 libgles1 libglfw3-dev 
+# libglew-dev
+
+RUN echo "Upgrading dependencies"
+RUN apt-get upgrade -y
+
+RUN echo "Cloning TVM source & submodules"
+ENV TVM_PAR_DIR="/usr"
+RUN mkdir -p TVM_PAR_DIR && \
+	cd ${TVM_PAR_DIR} && \
+	git clone https://github.com/dmlc/tvm --recursive
+#RUN git submodule update --init --recursive
+
+
+RUN echo "Building TVM"
+#USE_BLAS: "openblas" | "mkl" | "atlas" | "apple" | "none"
+ENV TVM_HOME="/usr/tvm"
+ENV TVM_BUILD_DIR="${TVM_HOME}/build"
+RUN mkdir -p ${TVM_BUILD_DIR} && \
+	cd ${TVM_BUILD_DIR} && \
+	cmake .. -DUSE_BLAS=openblas -DUSE_LLVM=ON -DUSE_OPENCL=ON -DUSE_OPENGL=ON && \
+	make -j6
+
+RUN echo "Building Python package"
+ENV PYTHONPATH=${TVM_HOME}/python:${TVM_HOME}/topi/python:${TVM_HOME}/nnvm/python:${PYTHONPATH}
+RUN cd ${TVM_HOME}/python && python3 setup.py install --user
+RUN cd ${TVM_HOME}/topi/python && python3 setup.py install --user
+RUN cd ${TVM_HOME}/nnvm/python && python3 setup.py install --user

From 45837c821051c182948d57f2e283129de504cc4b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 29 Sep 2018 10:03:55 -0700
Subject: [PATCH 138/529] [RELAY] Refactor type inference to use type solver
 (#1779)

---
 .gitignore                                    |   7 +
 include/tvm/relay/op.h                        |  47 +-
 include/tvm/relay/pass.h                      |   4 +-
 include/tvm/relay/type.h                      | 178 +++-
 include/tvm/runtime/packed_func.h             |  22 +-
 python/tvm/relay/expr.py                      |   8 +-
 python/tvm/relay/ty.py                        |  41 +-
 src/common/arena.h                            |  92 ++
 src/relay/ir/environment.cc                   |   5 +-
 src/relay/ir/type.cc                          |  42 +-
 src/relay/op/type_relations.cc                |  87 +-
 src/relay/op/type_relations.h                 |  70 +-
 src/relay/pass/incomplete_type.h              |  38 -
 src/relay/pass/resolve.cc                     | 100 ---
 src/relay/pass/resolve.h                      |  47 -
 src/relay/pass/type_functor.h                 |   1 -
 src/relay/pass/type_infer.cc                  | 807 ++++++------------
 src/relay/pass/type_solver.cc                 | 166 ++++
 src/relay/pass/type_solver.h                  | 231 +++++
 src/relay/pass/type_visitor.h                 |   7 +-
 src/relay/pass/unifier.cc                     | 324 -------
 src/relay/pass/unifier.h                      | 141 ---
 tests/python/relay/test_ir_nodes.py           |   4 +-
 ...eval_integration.py => test_type_infer.py} |  29 +-
 tests/python/relay/test_type_solver.py        |  56 ++
 tutorials/nnvm/.gitignore                     |  11 +
 26 files changed, 1225 insertions(+), 1340 deletions(-)
 create mode 100644 src/common/arena.h
 delete mode 100644 src/relay/pass/incomplete_type.h
 delete mode 100644 src/relay/pass/resolve.cc
 delete mode 100644 src/relay/pass/resolve.h
 create mode 100644 src/relay/pass/type_solver.cc
 create mode 100644 src/relay/pass/type_solver.h
 delete mode 100644 src/relay/pass/unifier.cc
 delete mode 100644 src/relay/pass/unifier.h
 rename tests/python/relay/{test_tyck_eval_integration.py => test_type_infer.py} (91%)
 create mode 100644 tests/python/relay/test_type_solver.py
 create mode 100644 tutorials/nnvm/.gitignore

diff --git a/.gitignore b/.gitignore
index 9d9faaf082ea..1a04e291302e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -190,6 +190,13 @@ build*
 
 # Jetbrain
 .idea
+.ipython
+.jupyter
+.nv
+.pylint.d
+.python_history
+.pytest_cache
+.local
 
 # tmp file
 .nfs*
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index 9f4e7be08a8c..5735a935f6c2 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -13,10 +13,9 @@
 #include <utility>
 #include <vector>
 
-#include "../attrs.h"
-#include "./base.h"
-#include "./expr.h"
-#include "./type.h"
+#include "base.h"
+#include "expr.h"
+#include "type.h"
 
 namespace tvm {
 namespace relay {
@@ -45,7 +44,7 @@ class OpNode : public relay::ExprNode {
   Array<AttrFieldInfo> arguments;
   /*!
    * \brief The type key of the attribute field
-   *  This can be empty, in which case it defaults to
+   *  This can be empty, in which case it defaults to anything.
    */
   std::string attrs_type_key;
   /*!
@@ -156,11 +155,13 @@ class OpRegistry {
    */
   inline OpRegistry& add_type_rel(
       const std::string& rel_name,
-      std::function<Array<Type>(const Array<Type>&, int)> type_rel_func);
-
+      runtime::TypedPackedFunc<bool(const Array<Type>&,
+                                    int,
+                                    const Attrs&,
+                                    const TypeReporter&)> type_rel_func);
   /*!
    * \brief Set the type key of attributes.
-   * \param type_key The type of of the attrs field.x
+   * \param type_key The type of of the attrs field.
    * \return reference to self.
    */
   inline OpRegistry& set_attrs_type_key(const std::string& type_key);
@@ -348,23 +349,25 @@ inline OpRegistry& OpRegistry::add_argument(const std::string& name,
 
 inline OpRegistry& OpRegistry::add_type_rel(
     const std::string& rel_name,
-    std::function<Array<Type>(const Array<Type>&, int)> type_rel_func) {
+    runtime::TypedPackedFunc<bool(const Array<Type>&,
+                                  int,
+                                  const Attrs&,
+                                  const TypeReporter&)> type_rel_func) {
   auto func_name = std::string("tvm.relay.type_relation.") + rel_name;
-
-  TypedEnvFunc<Array<Type>(const Array<Type>&, int)> env_type_rel_func;
+  TypeRelationFn env_type_rel_func;
 
   if (runtime::Registry::Get(func_name)) {
     auto env_func = EnvFunc::Get(func_name);
     env_type_rel_func = env_func;
   } else {
     runtime::Registry::Register(func_name)
-        .set_body_typed<Array<Type>(const Array<Type>&, int)>(type_rel_func);
+        .set_body(type_rel_func.packed());
     auto env_func = EnvFunc::Get(func_name);
     env_type_rel_func = env_func;
   }
 
-  std::vector<TypeParam> type_params;
-  std::vector<Type> arg_types;
+  Array<TypeParam> type_params;
+  Array<Type> arg_types;
 
   // Add inputs.
   std::string input_name_prefix = "in";
@@ -375,15 +378,27 @@ inline OpRegistry& OpRegistry::add_type_rel(
     arg_types.push_back(param);
   }
 
-  auto ty_call_args = Array<Type>(arg_types);
+  Array<Type> ty_call_args = arg_types;
 
   // Add output type.
   auto out_param = TypeParamNode::make("out", TypeParamNode::Kind::kType);
   type_params.push_back(out_param);
+  // this will trigger copy on write.
   ty_call_args.push_back(out_param);
 
+  // The attributes of primitive op is nullptr
+  //
+  // The attributes of primitive operator can vary at the call site.
+  // The type of sum is also dependent on Attrs being passed.
+  // So puting nullptr in the Attrs means that the operator is polymorphic on Attrs.
+  //
+  // A common example is sum(x, axis), where the choice of axis
+  // can affect the type of the function.
   TypeConstraint type_rel =
-      TypeRelationNode::make(rel_name, env_type_rel_func, ty_call_args);
+      TypeRelationNode::make(env_type_rel_func,
+                             ty_call_args,
+                             arg_types.size(),
+                             Attrs());
 
   auto func_type =
       FuncTypeNode::make(arg_types, out_param, type_params, {type_rel});
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index b49d53815f62..d3747c214859 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -26,7 +26,7 @@ namespace relay {
  * \return A type checked expression with its checked_type field populated.
  */
 Expr InferType(const Environment& env, const Expr& e);
-Expr InferType(const Environment& env, const GlobalVar& v, const Function& e);
+Expr InferType(const Environment& env, const GlobalVar& var, const Function& f);
 
 /*!
  * \brief Check that types are well formed by applying "kinding rules".
@@ -69,7 +69,7 @@ bool AlphaEqual(const Expr& e1, const Expr& e2);
  *
  * For example: `forall s, Tensor[f32, s]` is equal to
  * `forall w, Tensor[f32, w]`.
- * 
+ *
  * See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence
  * for more details.
  *
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index f972eb85b041..53f484522518 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -11,7 +11,8 @@
 #include <tvm/node/node.h>
 #include <string>
 
-#include "./base.h"
+#include "base.h"
+#include "../attrs.h"
 
 namespace tvm {
 namespace relay {
@@ -116,10 +117,10 @@ class TypeParamNode : public TypeNode {
   /*! \brief possible kinds of TypeParam */
   enum Kind : int {
     /*! \brief template variable in shape expression */
-    kShapeVar = 0,
-    kShape = 1,
+    kType = 0,
+    kShapeVar = 1,
     kBaseType = 2,
-    kType = 3
+    kShape = 3
   };
   /*!
    * \brief The variable itself is only meaningful when
@@ -143,6 +144,33 @@ class TypeParamNode : public TypeNode {
 
 RELAY_DEFINE_NODE_REF(TypeParam, TypeParamNode, Type);
 
+/*!
+ * \brief IncompleteType.
+ * This is intermediate values that is used during type inference.
+ *
+ * If we view the type relations as "computational graph of types",
+ * then IncompleteType represents intermediate values of the graph,
+ * TypeParam represents the input to the graph.
+ */
+class IncompleteType;
+
+/*! \brief IncompleteType container node */
+class IncompleteTypeNode : public TypeNode {
+ public:
+  TypeParamNode::Kind kind;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("kind", &kind);
+  }
+
+  TVM_DLL static IncompleteType make(TypeParamNode::Kind kind);
+
+  static constexpr const char* _type_key = "relay.IncompleteType";
+  TVM_DECLARE_NODE_TYPE_INFO(IncompleteTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(IncompleteType, IncompleteTypeNode, Type);
+
 /*!
  * \brief Potential Constraints in the type.
  * \note This is reserved for future use.
@@ -190,7 +218,8 @@ class FuncTypeNode : public TypeNode {
     v->Visit("span", &span);
   }
 
-  TVM_DLL static FuncType make(tvm::Array<Type> arg_types, Type ret_type,
+  TVM_DLL static FuncType make(tvm::Array<Type> arg_types,
+                               Type ret_type,
                                tvm::Array<TypeParam> type_params,
                                tvm::Array<TypeConstraint> type_constraints);
 
@@ -200,11 +229,102 @@ class FuncTypeNode : public TypeNode {
 
 RELAY_DEFINE_NODE_REF(FuncType, FuncTypeNode, Type);
 
+/*!
+ * \brief The type of tuple values.
+ */
+class TupleType;
+/*!
+ * \brief TupleType container.
+ */
+class TupleTypeNode : public TypeNode {
+ public:
+  /*! \brief The type of each field in the tuple. */
+  tvm::Array<Type> fields;
+
+  TupleTypeNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
+
+  TVM_DLL static TupleType make(tvm::Array<Type> fields);
+
+  static constexpr const char* _type_key = "relay.TypeTuple";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleType, TupleTypeNode, Type);
+
+class TypeReporter;
+
+/*!
+ * \brief reporter that reports back to the
+ *  type resolution information.
+ */
+class TypeReporterNode : public Node {
+ public:
+  /*!
+   * \brief Create a type equality constraint.
+   *
+   *  The "assign direction" acts as a hint to the solver
+   *  showing that it is more likely to resolve dst by src.
+   *  But it is possible for the solver to resolve src by dst as well.
+   */
+  TVM_DLL virtual void Assign(const Type& dst, const Type& src) = 0;
+  /*!
+   * \brief assert shape expression equals each other.
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   */
+  TVM_DLL virtual void AssertEQ(const ShapeExpr& lhs, const ShapeExpr& rhs) = 0;
+
+  // solver is not serializable.
+  void VisitAttrs(tvm::AttrVisitor* v) final {}
+
+  static constexpr const char* _type_key = "relay.TypeReporter";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeReporterNode, Node);
+};
+
+/*!
+ * \brief Container class of TypeReporter.
+ * \sa TypeReporterNode
+ */
+class TypeReporter : public NodeRef {
+ public:
+  TypeReporter() {}
+  explicit TypeReporter(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {
+  }
+  TypeReporterNode* operator->() const {
+    return static_cast<TypeReporterNode*>(node_.get());
+  }
+  using ContainerType = TypeReporterNode;
+};
+
+/*!
+ * \brief User defined type constraint function.
+ *
+ * If the input type information can be used to fully decide
+ * the IncompleteTypes, then the function should call
+ * reporter.Assign to report the new types, and return true.
+ * Otherwise, the function should return false.
+ *
+ * \param args The arguments to the relation.
+ *   The types are stored in the form of
+ *   [input_type_0, input_type_1, ... input_type_n,
+ *    output_type_0, output_type_1, ... output_type_m]
+ *
+ * \param num_inputs Number of input types in the args.
+ * \param attrs The additional attributes of the operator.
+ * \param reporter The reporter to report solution to.
+ * \return false if This relation cannot be resolved.
+ *   true if this relation has been resolved.
+ */
 using TypeRelationFn =
-    TypedEnvFunc<Array<Type>(const Array<Type>&, int)>;
+    TypedEnvFunc<bool(const Array<Type>& args,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter)>;
 
 /*!
- * \brief Opaque type relation, is an input-output relation on types.
+ * \brief User defined type relation, is an input-output relation on types.
  */
 class TypeRelation;
 /*!
@@ -214,24 +334,30 @@ class TypeRelation;
  */
 class TypeRelationNode : public TypeConstraintNode {
  public:
-  /*! \brief The name of the function */
-  std::string name;
-
   /*!
    * \brief The function on input and output variables which
    *  this is not directly serializable,
    *  need to be looked-up in the environment.
    */
-  TypeRelationFn func_;
-
+  TypeRelationFn func;
   /*! \brief The type arguments to the type function. */
   tvm::Array<Type> args;
+  /*! \brief Number of inputs arguments */
+  int num_inputs;
+  /*! \brief Attributes to the relation function */
+  Attrs attrs;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
-    v->Visit("name", &name);
+    v->Visit("func", &func);
+    v->Visit("args", &args);
+    v->Visit("num_inputs", &num_inputs);
+    v->Visit("attrs", &attrs);
   }
 
-  TVM_DLL static TypeRelation make(std::string name, TypeRelationFn func_, Array<Type> args);
+  TVM_DLL static TypeRelation make(TypeRelationFn func,
+                                   Array<Type> args,
+                                   int num_args,
+                                   Attrs attrs);
 
   static constexpr const char* _type_key = "relay.TypeRelation";
   TVM_DECLARE_NODE_TYPE_INFO(TypeRelationNode, TypeConstraintNode);
@@ -239,30 +365,6 @@ class TypeRelationNode : public TypeConstraintNode {
 
 RELAY_DEFINE_NODE_REF(TypeRelation, TypeRelationNode, TypeConstraint);
 
-/*!
- * \brief The type of tuple values.
- */
-class TupleType;
-/*!
- * \brief TupleType container.
- */
-class TupleTypeNode : public TypeNode {
- public:
-  /*! \brief The type of each field in the tuple. */
-  tvm::Array<Type> fields;
-
-  TupleTypeNode() {}
-
-  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
-
-  TVM_DLL static TupleType make(tvm::Array<Type> fields);
-
-  static constexpr const char* _type_key = "relay.TypeTuple";
-  TVM_DECLARE_NODE_TYPE_INFO(TupleTypeNode, TypeNode);
-};
-
-RELAY_DEFINE_NODE_REF(TupleType, TupleTypeNode, Type);
-
 // The following fields contains advanced typing
 // Only keep the class name and reserved for future usage.
 class GenericTensorType;
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 401b0bbb97ed..2ef4b0a64d3f 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -175,7 +175,17 @@ class TypedPackedFunc<R(Args...)> {
    *
    * \param packed The packed function
    */
-  inline explicit TypedPackedFunc(PackedFunc packed);
+  inline TypedPackedFunc(PackedFunc packed);  // NOLINT(*)
+  /*!
+   * \brief constructor from TVMRetValue
+   * \param value The TVMRetValue
+   */
+  inline TypedPackedFunc(const TVMRetValue& value);  // NOLINT(*)
+  /*!
+   * \brief constructor from TVMArgValue
+   * \param value The TVMArgValue
+   */
+  inline TypedPackedFunc(const TVMArgValue& value);  // NOLINT(*)
   /*!
    * \brief construct from a lambda function with the same signature.
    *
@@ -196,7 +206,7 @@ class TypedPackedFunc<R(Args...)> {
              std::is_convertible<FLambda,
                                  std::function<R(Args...)>
                                  >::value>::type>
-  explicit TypedPackedFunc(const FLambda& typed_lambda) {
+  TypedPackedFunc(const FLambda& typed_lambda) {  // NOLINT(*)
     this->AssignTypedLambda(typed_lambda);
   }
   /*!
@@ -1143,6 +1153,14 @@ template<typename R, typename ...Args>
 TypedPackedFunc<R(Args...)>::TypedPackedFunc(PackedFunc packed)
   : packed_(packed) {}
 
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(const TVMRetValue& value)
+    : packed_(value.operator PackedFunc()) {}
+
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(const TVMArgValue& value)
+    : packed_(value.operator PackedFunc()) {}
+
 template<typename R, typename ...Args>
 template<typename FType>
 inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 3bddbc89b56e..e3b6c9d7e9ff 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -2,16 +2,18 @@
 """The expression nodes of Relay."""
 from __future__ import absolute_import
 from .base import NodeBase, register_relay_node
-from ._ir_pass import _get_checked_type
 from . import _make
 from .. import convert
 
 
 class Expr(NodeBase):
     """The base type for all Relay expressions."""
-
     def checked_type(self):
-        return _get_checked_type(self)
+        ret = self._checked_type_
+        if ret is None:
+            raise ValueError("The type checker has not populated"
+                             " the checked_type for this node")
+        return ret
 
     def __call__(self, *args):
         converted_args = []
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index d2a256e77f5b..c7cf9a346b68 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -52,11 +52,10 @@ class Kind(IntEnum):
        with. For example one's of kind BaseType can only be `float32`, `int32`,
        and so on.
     """
-    ShapeVar = 0
-    Shape = 1
+    Type = 0
+    ShapeVar = 1
     BaseType = 2
-    Type = 3
-
+    Shape = 3
 
 @register_relay_node
 class TypeParam(Type):
@@ -68,7 +67,7 @@ class TypeParam(Type):
     functions which are generic over types.
     """
 
-    def __init__(self, var, kind):
+    def __init__(self, var, kind=Kind.Type):
         """Construct a TypeParam.
 
         Parameters
@@ -76,7 +75,7 @@ def __init__(self, var, kind):
         var: tvm.expr.Var
             The tvm.Var which backs the type parameter.
 
-        kind: Kind
+        kind: Kind, optional
             The kind of the type parameter.
 
         Returns
@@ -130,8 +129,7 @@ def __init__(self,
                  arg_types,
                  ret_type,
                  type_params,
-                 type_constraints
-                ):
+                 type_constraints):
         """Construct a function type.
 
         Parameters
@@ -153,6 +151,29 @@ def __init__(self,
 @register_relay_node
 class IncompleteType(Type):
     """An incomplete type."""
-
-    def __init__(self, kind):
+    def __init__(self, kind=Kind.Type):
         self.__init_handle_by_constructor__(_make.IncompleteType, kind)
+
+
+@register_relay_node
+class TypeRelation(TypeConstraint):
+    """Type relation in relay.
+
+    Parameters
+    ----------
+    func : EnvFunc
+        User defined relation function.
+
+    args : list of types
+        List of types to the func.
+
+    num_inputs: int
+        Number of input arguments in args,
+        this act as a hint for type inference.
+
+    attrs : Attrs
+        The attribute attached to the relation information
+    """
+    def __init__(self, func, args, num_inputs, attrs):
+        self.__init_handle_by_constructor__(_make.TypeRelation,
+                                            func, args, num_inputs, attrs)
diff --git a/src/common/arena.h b/src/common/arena.h
new file mode 100644
index 000000000000..e8d4b2e23e37
--- /dev/null
+++ b/src/common/arena.h
@@ -0,0 +1,92 @@
+/*!
+ * Copyright 2018 by Contributors
+ *
+ * \file arena.h
+ * \brief Arena allocator that allocates
+ *  memory chunks and frees them all during destruction time.
+ */
+#ifndef TVM_COMMON_ARENA_H_
+#define TVM_COMMON_ARENA_H_
+
+#include <type_traits>
+
+namespace tvm {
+namespace common {
+
+const constexpr int kArenaPageSize = 16 << 10;
+
+/*!
+ * \brief Arena allocator that allocates memory from continuous
+ *  chunk and frees them all only during destruction.
+ */
+class Arena {
+ public:
+  Arena() {
+    // eagerly allocate the first page.
+    head_ = reinterpret_cast<PageHeader*>(new Page());
+    head_->next = nullptr;
+    head_->ptr = sizeof(PageHeader);
+  }
+  ~Arena() {
+    // delete all the allocated pages.
+    while (head_ != nullptr) {
+      Page* page = reinterpret_cast<Page*>(head_);
+      head_ = head_->next;
+      delete page;
+    }
+  }
+  /*!
+   * \brief Allocate a space from Arena for type T
+   * \param T the data type to be allocated
+   */
+  template<typename T>
+  T* Alloc() {
+    return static_cast<T*>(Alloc(sizeof(T), alignof(T)));
+  }
+
+ private:
+  // page size 16 KB
+  // The page data type;
+  using Page = std::aligned_storage<kArenaPageSize, 1024>::type;
+  /*! \brief Page header */
+  struct PageHeader {
+    /*! \brief points to the next page */
+    PageHeader* next;
+    /*! \brief memory allocator ptr inside page */
+    size_t ptr;
+  };
+  /* \brief The page header */
+  PageHeader* head_{nullptr};
+  /*!
+   * \brief Align ptr by upper bound.
+   * \param ptr The pointer value.
+   * \param align The alignment requirement.
+   */
+  size_t UpperAlign(size_t ptr, size_t align) {
+    return ptr + (align - (ptr % align)) % align;
+  }
+  /*!
+   * \brief Internal aligned alloc function.
+   * \param size The size of the memory.
+   * \param align The alignment requirement.
+   */
+  void* Alloc(size_t size, size_t align) {
+    size_t ptr = UpperAlign(head_->ptr, align);
+    if (ptr + size <= kArenaPageSize) {
+      head_->ptr = ptr + size;
+      return reinterpret_cast<char*>(head_) + ptr;
+    } else {
+      PageHeader* new_head = reinterpret_cast<PageHeader*>(new Page());
+      new_head->next = head_;
+      ptr = UpperAlign(sizeof(PageHeader), align);
+      CHECK_LE(ptr + size, kArenaPageSize);
+      new_head->ptr = ptr + size;
+      head_ = new_head;
+      return reinterpret_cast<char*>(head_) + ptr;
+    }
+  }
+};
+
+}  // namespace common
+}  // namespace tvm
+#endif  // TVM_COMMON_ARENA_H_
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index d7a28231ceac..eeebbb32a9fe 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -6,7 +6,6 @@
 #include <tvm/relay/environment.h>
 #include <tvm/relay/pass.h>
 #include <sstream>
-#include "./../pass/resolve.h"
 
 namespace tvm {
 namespace relay {
@@ -49,7 +48,7 @@ void EnvironmentNode::Add(const GlobalVar &var,
     auto checked_func = GetRef<Function>(func_node);
     auto type = checked_func->checked_type();
 
-    CHECK(IsFullyResolved(type));
+    CHECK(type.as<IncompleteTypeNode>() == nullptr);
 
     if (functions.find(var) != functions.end()) {
       if (!update) {
@@ -68,7 +67,7 @@ void EnvironmentNode::Add(const GlobalVar &var,
       this->functions.Set(var, checked_func);
     }
   } else {
-    throw Error("internal error: unknown item type, unreachable code");
+    LOG(FATAL) << "internal error: unknown item type, unreachable code";
   }
 }
 
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index fce01390fa94..7d96d04cb514 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -55,7 +55,27 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     << node->kind << ")";
 });
 
-FuncType FuncTypeNode::make(tvm::Array<Type> arg_types, Type ret_type,
+IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
+  auto n = make_node<IncompleteTypeNode>();
+  n->kind = std::move(kind);
+  return IncompleteType(n);
+}
+
+TVM_REGISTER_API("relay._make.IncompleteType")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    int kind = args[0];
+    *ret = IncompleteTypeNode::make(static_cast<TypeParamNode::Kind>(kind));
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<IncompleteTypeNode>(
+    [](const IncompleteTypeNode* node,
+       tvm::IRPrinter* p) {
+      p->stream << "IncompleteTypeNode(" << node->kind << ", " << node << ")";
+    });
+
+FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
+                            Type ret_type,
                             tvm::Array<TypeParam> type_params,
                             tvm::Array<TypeConstraint> type_constraints) {
   NodePtr<FuncTypeNode> n = make_node<FuncTypeNode>();
@@ -79,24 +99,28 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
             << node->type_constraints << ")";
 });
 
-TypeRelation TypeRelationNode::make(std::string name, TypeRelationFn func, Array<Type> args) {
+TypeRelation TypeRelationNode::make(TypeRelationFn func,
+                                    Array<Type> args,
+                                    int num_inputs,
+                                    Attrs attrs) {
   NodePtr<TypeRelationNode> n = make_node<TypeRelationNode>();
-  n->name = std::move(name);
-  n->func_ = std::move(func);
+  n->func = std::move(func);
   n->args = std::move(args);
+  n->num_inputs = num_inputs;
+  n->attrs = std::move(attrs);
   return TypeRelation(n);
 }
 
 TVM_REGISTER_API("relay._make.TypeRelation")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-  *ret = TypeRelationNode::make(args[0], args[1], args[2]);
+    *ret = TypeRelationNode::make(args[0], args[1], args[2], args[3]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TypeRelationNode>([](const TypeRelationNode *node,
-                                       tvm::IRPrinter *p) {
-  p->stream << "TypeRelationNode(" << node->name << ", " << node->args
-    << ")";
+.set_dispatch<TypeRelationNode>([](const TypeRelationNode *node, tvm::IRPrinter *p) {
+    p->stream << "TypeRelationNode("
+              << node->func->name
+              << ", " << node->args << ")";
 });
 
 TupleType TupleTypeNode::make(Array<Type> fields) {
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 94550dbd5075..58fcc18ad43e 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -8,7 +8,6 @@
 #include <tvm/relay/logging.h>
 #include <tvm/relay/op.h>
 #include <numeric>
-#include "../pass/incomplete_type.h"
 #include "./type_relations.h"
 
 namespace tvm {
@@ -30,18 +29,19 @@ int ToInt(const tvm::Expr& e) {
   return imm->value;
 }
 
-Array<Type> IdentityRel(const Array<Type>& types, int num_args) {
-  CHECK_EQ(types.size(), 2);
-  auto t1 = ToTensorType(types[0]);
-  if (t1 && types[1].as<IncompleteTypeNode>()) {
-    return {t1, t1};
-  } else {
-    return types;
+bool IdentityRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter) {
+  for (size_t i = 1; i < types.size(); ++i) {
+    reporter->Assign(types[i], types[0]);
   }
+  return true;
 }
 
-static Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2,
-                              DataType output_dtype) {
+Type ConcreteBroadcast(const TensorType& t1,
+                       const TensorType& t2,
+                       DataType output_dtype) {
   RELAY_LOG(INFO) << "ConcreteBroadcast: t1=" << t1 << " t2=" << t2
                   << std::endl;
   auto sh1 = t1->shape;
@@ -73,7 +73,7 @@ static Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2,
     Array<ShapeExpr> smaller;
 
     for (int i = 0; i < (full_len - suffix_len); i++) {
-      smaller.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), 1));
+      smaller.push_back(make_const(tvm::Int(64), 1));
     }
 
     if (sh1.size() < sh2.size()) {
@@ -93,46 +93,52 @@ static Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2,
 
     CHECK_EQ(larger.size(), smaller.size());
 
-    Array<HalideIR::Expr> out_shape;
+    Array<ShapeExpr> out_shape;
     for (size_t i = 0; i < smaller.size(); i++) {
       auto left = smaller[i].as<tvm::ir::IntImm>();
       auto right = larger[i].as<tvm::ir::IntImm>();
       CHECK(left);
       CHECK(right);
       int64_t dim = std::max(left->value, right->value);
-      out_shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), dim));
+      out_shape.push_back(make_const(tvm::Int(64), dim));
     }
 
     return TensorTypeNode::make(out_shape, output_dtype);
   }
 }
 
-Array<Type> BroadcastRel(const Array<Type>& types, int num_args) {
+bool BroadcastRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 3);
   RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1]
                   << "Out: " << types[2] << std::endl;
-  if (auto t1 = ToTensorType(types[0])) {
-    if (auto t2 = ToTensorType(types[1])) {
-      CHECK_EQ(t1->dtype, t2->dtype);
-      return {t1, t2, ConcreteBroadcast(t1, t2, t1->dtype)};
+  if (auto t0 = ToTensorType(types[0])) {
+    if (auto t1 = ToTensorType(types[1])) {
+      CHECK_EQ(t0->dtype, t1->dtype);
+      reporter->Assign(types[2], ConcreteBroadcast(t0, t1, t0->dtype));
+      return true;
     }
   }
-
-  return types;
+  return false;
 }
 
-/* A relation which specifies broadcasting rules for operations which
-   compute boolean results.
-*/
-Array<Type> BroadcastCompRel(const Array<Type>& types, int num_args) {
+bool BroadcastCompRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 3);
-  if (auto t1 = ToTensorType(types[0])) {
-    if (auto t2 = ToTensorType(types[1])) {
-      return {t1, t2, ConcreteBroadcast(t1, t2, HalideIR::Bool())};
+  RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1]
+                  << "Out: " << types[2] << std::endl;
+  if (auto t0 = ToTensorType(types[0])) {
+    if (auto t1 = ToTensorType(types[1])) {
+      CHECK_EQ(t0->dtype, t1->dtype);
+      reporter->Assign(types[2], ConcreteBroadcast(t0, t1, ::tvm::Bool()));
+      return true;
     }
   }
-
-  return types;
+  return false;
 }
 
 /*! \brief Handle concrete concat case from known input to output. */
@@ -175,10 +181,10 @@ inline Type ConcreteConcatRel(const Type& input_type) {
 
     auto out_axis_dim = std::accumulate(axis_dims.begin(), axis_dims.end(), 0);
 
-    Array<tvm::Expr> out_shape = { tvm::ir::IntImm::make(HalideIR::Int(64), out_axis_dim) };
+    Array<tvm::Expr> out_shape = { make_const(Int(64), out_axis_dim) };
 
     for (auto dim : dims) {
-      out_shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), dim));
+      out_shape.push_back(make_const(Int(64), dim));
     }
 
     return TensorTypeNode::make(out_shape, dtype);
@@ -188,19 +194,18 @@ inline Type ConcreteConcatRel(const Type& input_type) {
   }
 }
 
-Array<Type> ConcatRel(const Array<Type>& types, int num_args) {
+bool ConcatRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 2);
-
-  if (types[0].as<IncompleteTypeNode>() && types[1].as<IncompleteTypeNode>()) {
-    return types;
-  } else if (types[1].as<IncompleteTypeNode>()) {
-    return { types[0], ConcreteConcatRel(types[0]) };
-  } else {
-    throw TypeRelationError(
-      "can not deduce relationship between the " \
-      "type of concat's input and output");
+  if (types[0].as<TupleTypeNode>()) {
+    reporter->Assign(types[1], ConcreteConcatRel(types[0]));
+    return true;
   }
+  return false;
 }
 
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h
index 9dfc29022ee3..2c34c8bc8949 100644
--- a/src/relay/op/type_relations.h
+++ b/src/relay/op/type_relations.h
@@ -24,42 +24,72 @@ struct TypeRelationError : Error {
       : Error(msg) {}
 };
 
-/*! \brief The identity type relation maps a single input variable
- * to the output variable.
+/*!
+ * \brief The identity type relation, all the types are equal.
  *
  * \param types The input and output types to the relation.
- * \param num_args The number of input arguments.
- * \return The (potentially partial) solution to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
  */
-Array<Type> IdentityRel(const Array<Type>& types, int num_args);
-/*! \brief The broadcast type relation, implements the broadcasting
+bool IdentityRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter);
+
+/*!
+ * \brief The broadcast type relation, implements the broadcasting
  * rule over the two input types producing the broadcasted type.
  *
  * \param types The input and output types to the relation.
- * \param num_args The number of input arguments.
- * \return The (potentially partial) solution to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
  */
-Array<Type> BroadcastRel(const Array<Type>& types, int num_args);
-/*! \brief The broadcast type relation, implements the broadcasting
- * rule over the two input types producing the broadcasted type.
+bool BroadcastRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter);
+
+/*!
+ * \brief The broadcast type relation, implements the broadcasting
+ *  rule over the two input types producing the broadcasted type.
  *
  * This differs from BroadcastRel in the return dtype,
- * it instead returns bool, for use in comparsion operators
+ * it instead returns bool(uint8), for use in comparsion operators
  * such as equal, not_equal, lt, and so on.
  *
  * \param types The input and output types to the relation.
- * \param num_args The number of input arguments.
- * \return The (potentially partial) solution to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
  */
-Array<Type> BroadcastCompRel(const Array<Type>& types, int num_args);
+bool BroadcastCompRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter);
 
-/*! \brief The concat relation.
+/*!
+ * \brief The The concat relation, implements the broadcasting
+ *  rule over the two input types producing the broadcasted type.
  *
- * This relation takes a single input which must be a single tensor
- * or an arbitrary sized tuple. It combines these input dimensions
- * together to produce the output example.
+ * This differs from BroadcastRel in the return dtype,
+ * it instead returns bool(uint8), for use in comparsion operators
+ * such as equal, not_equal, lt, and so on.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
  */
-Array<Type> ConcatRel(const Array<Type>& types, int num_args);
+bool ConcatRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/incomplete_type.h b/src/relay/pass/incomplete_type.h
deleted file mode 100644
index 78771dc6e9b7..000000000000
--- a/src/relay/pass/incomplete_type.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file incomplete_type.h
- * \brief A way to defined arbitrary function signature with dispatch on types.
- */
-
-#ifndef TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
-#define TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
-
-#include <tvm/relay/expr.h>
-
-namespace tvm {
-namespace relay {
-
-/*!
- * \brief Represents a portion of an incomplete type.
- */
-class IncompleteType;
-
-/*! \brief IncompleteType container node */
-class IncompleteTypeNode : public TypeNode {
- public:
-  TypeParamNode::Kind kind;
-
-  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("kind", &kind); }
-
-  TVM_DLL static IncompleteType make(TypeParamNode::Kind kind);
-
-  static constexpr const char* _type_key = "relay.IncompleteType";
-  TVM_DECLARE_NODE_TYPE_INFO(IncompleteTypeNode, TypeNode);
-};
-
-RELAY_DEFINE_NODE_REF(IncompleteType, IncompleteTypeNode, Type);
-
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RELAY_PASS_INCOMPLETE_TYPE_H_
diff --git a/src/relay/pass/resolve.cc b/src/relay/pass/resolve.cc
deleted file mode 100644
index b073613bafc2..000000000000
--- a/src/relay/pass/resolve.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file resolve.cc
- * \brief Resolve incomplete types to complete types.
- */
-
-#include <tvm/relay/expr.h>
-#include <tvm/relay/expr_functor.h>
-#include "./resolve.h"
-#include "./type_visitor.h"
-
-namespace tvm {
-namespace relay {
-
-struct ResolveTypeType : TypeMutator {
-  const TypeUnifier &unifier;
-
-  explicit ResolveTypeType(const TypeUnifier &unifier) : unifier(unifier) {}
-
-  Type VisitType(const Type &t) override {
-    if (!t.defined()) {
-      auto inc_ty = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
-      unifier->Insert(inc_ty);
-      return inc_ty;
-    } else {
-      return TypeMutator::VisitType(t);
-    }
-  }
-
-  Type VisitType_(const IncompleteTypeNode *op) override {
-    return unifier->Subst(GetRef<IncompleteType>(op));
-  }
-};
-
-struct ResolveTypeExpr : ExprMutator {
-  const TypeUnifier &unifier;
-
-  explicit ResolveTypeExpr(const TypeUnifier &unifier) : unifier(unifier) {}
-
-  Expr Mutate(const Expr &e) {
-    // NB: a bit tricky here.
-    //
-    // We want to store resolved type without having
-    // to re-typecheck the entire term.
-    //
-    // Since we know that e : T[...] under some holes
-    // then it is the case that if we resolve types
-    // present in e, then we can type it under T
-    // with the wholes filled in.
-    //
-    // We will visit e like normal building a new
-    // term, then resolve e's old type and write
-    // it back into the new node.
-    auto new_e = ExprMutator::Mutate(e);
-    CHECK(e->checked_type_.defined());
-    auto resolved_cty = VisitType(e->checked_type_);
-    new_e->checked_type_ = resolved_cty;
-    return new_e;
-  }
-
-  Type VisitType(const Type &t) {
-    return ResolveTypeType(unifier).VisitType(t);
-  }
-};
-
-Type Resolve(const TypeUnifier &unifier, const Type &ty) {
-  CHECK(ty.defined());
-  return ResolveTypeType(unifier).VisitType(ty);
-}
-
-Expr Resolve(const TypeUnifier &unifier, const Expr &expr) {
-  return ResolveTypeExpr(unifier).Mutate(expr);
-}
-
-struct FullyResolved : TypeVisitor<> {
-  bool incomplete;
-
-  FullyResolved() : incomplete(true) {}
-
-  void VisitType(const Type &t) override {
-    if (!t.defined()) {
-      incomplete = true;
-    } else {
-      return TypeVisitor<>::VisitType(t);
-    }
-  }
-
-  void VisitType_(const IncompleteTypeNode *ty_var) override {
-    incomplete = false;
-  }
-};
-
-bool IsFullyResolved(const Type &t) {
-  auto fr = FullyResolved();
-  fr.VisitType(t);
-  return fr.incomplete;
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/pass/resolve.h b/src/relay/pass/resolve.h
deleted file mode 100644
index 0cd7dce2d88d..000000000000
--- a/src/relay/pass/resolve.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/relay/resolve.h
- * \brief Resolve incomplete types to complete types.
- */
-#ifndef TVM_RELAY_PASS_RESOLVE_H_
-#define TVM_RELAY_PASS_RESOLVE_H_
-
-#include <tvm/relay/expr.h>
-#include <string>
-#include "./unifier.h"
-
-namespace tvm {
-namespace relay {
-
-/*! \brief Resolve a type containing incomplete types.
- *
- * This pass replaces incomplete types with their representative, and
- * converts types which are not defined into fresh variables.
- *
- * \param unifier The unifier containing the unification data.
- * \param ty The type to resolve.
- * \returns The resolved type.
- */
-Type Resolve(const TypeUnifier& unifier, const Type& ty);
-
-/*! \brief Resolve an expression containing incomplete types.
- *
- * This pass replaces incomplete types with their representative, and
- * converts types which are not defined into fresh variables.
- *
- * \param unifier The unifier containing the unification data.
- * \param ty The expression to resolve.
- * \returns The resolved expression.
- */
-Expr Resolve(const TypeUnifier& unifier, const Expr& expr);
-
-/*! \brief Check if all types have been filled in.
- *   \param t The type.
- *   \returns True if the type is resolved, false otherwise.
- */
-bool IsFullyResolved(const Type& t);
-
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RELAY_PASS_RESOLVE_H_
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index cccde62625ea..5152690c17e0 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -8,7 +8,6 @@
 
 #include <tvm/node/ir_functor.h>
 #include <tvm/relay/expr.h>
-#include "./incomplete_type.h"
 
 namespace tvm {
 namespace relay {
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index deed982acbc6..c1ea090e9db9 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -22,607 +22,360 @@
 
 #include <tvm/relay/error.h>
 #include <tvm/relay/expr_functor.h>
-#include <tvm/relay/logging.h>
 #include <tvm/relay/pass.h>
-#include "./incomplete_type.h"
-#include "./resolve.h"
-#include "./type_subst.h"
-#include "./type_visitor.h"
-#include "./unifier.h"
+#include "type_solver.h"
+#include "type_subst.h"
 
 namespace tvm {
 namespace relay {
-
-using namespace tvm::runtime;
-
-// // We declare this for forward compatibility.
-struct ConstraintData {};
-
-/*! \brief A more efficient representation of the type relation
- * data needed for type checking.
- */
-struct TypeRelationData : ConstraintData {
-  std::string name;
-  std::vector<Type> args;
-  TypeRelationFn func;
-  Span span;
-
-  explicit TypeRelationData(const TypeRelation& ty_rel)
-      : TypeRelationData(ty_rel->args, ty_rel->func_, ty_rel->span) {}
-
-  TypeRelationData(const Array<Type>& args, const TypeRelationFn& func, const Span& sp)
-      : func(func), span(sp) {
-    for (auto arg : args) {
-      this->args.push_back(arg);
-    }
+//
+// The inference algorithm can roughly be devided into three stages:
+// - Populate the constraints by visiting the expression (TypeInferencer.GetType)
+//   - solver.AddConstraint and solver.Unify are called to populate the necessary constraints
+// - Solve the constraints (solver_.Solve)
+// - Recreate expression with the resolved checked_type (Resolver.VisitExpr)
+//
+class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
+ public:
+  // constructors
+  TypeInferencer()
+      : env_(EnvironmentNode::make({})) {
   }
-
-  TypeRelation ToTypeRel() const {
-    Array<Type> args = Array<Type>(this->args.begin(), this->args.end());
-    return TypeRelationNode::make(
-        this->name, this->func, args);
+  explicit TypeInferencer(Environment env)
+      : env_(env) {
   }
-};
-
-struct TypeContext {
-  std::unordered_map<Var, Type, NodeHash> var_map;
-  std::vector<std::vector<TypeRelationData> > constraints;
 
-  TypeContext() { constraints.push_back({}); }
+  // inference the type of expr.
+  Expr Infer(Expr expr);
 
-  void Insert(const Var& id, const Type& t) { var_map[id] = t; }
-
-  void AddConstraint(const TypeConstraint& constraint) {
-      constraints.back().push_back(TypeRelationData(Downcast<TypeRelation>(constraint)));
+ private:
+  // type resolver that maps back to type
+  class Resolver;
+  // internal environment
+  Environment env_;
+  // map from expression to checked type
+  // type inferencer will populate it up
+  std::unordered_map<Expr, Type, NodeHash, NodeEqual> type_map_;
+  // The solver used by the inferencer.
+  TypeSolver solver_;
+  // Unify two types
+  Type Unify(const Type& t1, const Type& t2, const Span& span) {
+    // TODO(tqchen, jroesch): propagate span to solver
+    try {
+      return solver_.Unify(t1, t2);
+    } catch (const dmlc::Error &e) {
+      LOG(FATAL)
+          << "Error unifying `"
+          << t1
+          << "` and `"
+          << t2
+          << "`: " << e.what();
+      return Type();
+    }
   }
-
-  Type Lookup(const Var& var) {
-    auto type = var_map.find(var);
-    if (type != var_map.end()) {
-      return (*type).second;
-    } else {
-      throw FatalTypeError(std::string("undeclared local variable: ") + var->name_hint);
+  // Lazily get type for expr
+  // will call visit to deduce it if it is not in the type_map_
+  Type GetType(const Expr &expr) {
+    auto it = type_map_.find(expr);
+    if (it != type_map_.end()) {
+      return it->second;
     }
+    Type ret = this->VisitExpr(expr);
+    type_map_[expr] = ret;
+    return ret;
   }
 
-  struct Scope {
-    TypeContext& tc;
-    explicit Scope(TypeContext& tc) : tc(tc) { tc.constraints.push_back({}); }
-    ~Scope() { tc.constraints.pop_back(); }
-  };
-};
-
-struct CheckedExpr {
-  Expr expr;
-  Type type;
-  CheckedExpr(Expr e, Type t) : expr(e), type(t) {}
-  CheckedExpr() {}
-};
-
-enum SolverResult : int;
-
-class TypeInferencer : private ExprFunctor<CheckedExpr(const Expr&)> {
- private:
-  TypeContext context;
-
- public:
-  Environment env;
-  TypeUnifier unifier;
-
-  template <typename T>
-  T WithScope(const std::function<T()>& f) {
-    TypeContext::Scope fr(context);
-    return f();
+  // Visitor logics
+  Type VisitExpr_(const VarNode* op) final {
+    // The type of Var can already been lookedup in type_map_;
+    LOG(FATAL) << "Cannot find binding for var " << GetRef<Var>(op);
+    return Type();
   }
 
-  TypeInferencer();
-  TypeInferencer(Environment env, TypeUnifier unifier)
-      : env(env), unifier(unifier) {}
-  explicit TypeInferencer(Environment env);
-
-  CheckedExpr Infer(const Expr &expr);
-
-  FuncType Instantiate(FuncType fn_ty, tvm::Array<Type> &ty_args);
-
-  Type Normalize(const Type& t);
-
-  void ReportError(const std::string& msg, Span sp);
-  [[noreturn]] void FatalError(const std::string& msg, Span sp);
-
-  Type Unify(const Type &t1, const Type& t2, Span sp);
-  Type Resolve(const Type &t);
-  Expr Resolve(const Expr &e);
-
-  /*! \brief Attempt to solve a single relation. */
-  void Solve(TypeRelationData& ty_rel);
-
-  /*! \brief Attempt to solve all pending relations.
-   *
-   * If the solver
-   */
-  SolverResult Solve(std::vector<TypeRelationData>& rels);
-
-  /*! \brief Check that all relations hold. */
-  bool RelationsHold(bool scope_only = false);
-
-  /*! \brief Visit a function node, extra flag controls behavior. */
-  CheckedExpr VisitFunction(const Function& f, bool generalize);
-
- private:
-  CheckedExpr VisitExpr_(const VarNode* op) override;
-  CheckedExpr VisitExpr_(const GlobalVarNode* op) override;
-  CheckedExpr VisitExpr_(const ConstantNode* op) override;
-  CheckedExpr VisitExpr_(const TupleNode* op) override;
-  CheckedExpr VisitExpr_(const ParamNode* op) override;
-  CheckedExpr VisitExpr_(const FunctionNode* op) override;
-  CheckedExpr VisitExpr_(const CallNode* op) override;
-  CheckedExpr VisitExpr_(const LetNode* op) override;
-  CheckedExpr VisitExpr_(const IfNode* op) override;
-  CheckedExpr VisitExpr_(const OpNode* op) override;
-};
-
-TypeInferencer::TypeInferencer() {
-  this->env = EnvironmentNode::make({});
-  this->unifier = TypeUnifierNode::make(UnionFindNode::make({}));
-}
-
-TypeInferencer::TypeInferencer(Environment env) : env(env) {
-  this->unifier = TypeUnifierNode::make(UnionFindNode::make({}));
-}
-
-CheckedExpr TypeInferencer::Infer(const Expr& expr) {
-  RELAY_LOG(INFO) << "TypeInferencer::Check expr=" << expr << std::endl;
-  CheckedExpr checked_expr = this->VisitExpr(expr);
-  RELAY_LOG(INFO) << "TypeInferencer::Check type=" << checked_expr.type
-                  << std::endl;
-  Type final_type = checked_expr.type;
-  RELAY_LOG(INFO) << "TypeInferencer::Check type_after_subst=" << final_type
-                  << std::endl;
-  checked_expr.expr->checked_type_ = final_type;
-  return checked_expr;
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const VarNode* op) {
-  auto var = GetRef<Var>(op);
-  return {var, this->context.Lookup(var)};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const GlobalVarNode* op) {
-  GlobalVar var = GetRef<GlobalVar>(op);
-  Expr e = this->env->Lookup(var);
-  return {var, e->checked_type()};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const ConstantNode* const_node) {
-  return {GetRef<Constant>(const_node), const_node->tensor_type()};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const TupleNode* op) {
-  Tuple pl = GetRef<Tuple>(op);
-
-  std::vector<Expr> field_exprs;
-  std::vector<Type> field_types;
-  for (auto field = pl->fields.begin(); field != pl->fields.end(); field++) {
-    auto checked_field = Infer(*field);
-    field_exprs.push_back(checked_field.expr);
-    field_types.push_back(checked_field.type);
+  Type VisitExpr_(const ParamNode* op) final {
+    // directly handled by Funtion
+    LOG(FATAL) << "not reached";
+    return Type();
   }
 
-  return {TupleNode::make(field_exprs), TupleTypeNode::make(field_types)};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const ParamNode* param) {
-  // We should trigger error here and move param code direclty into function
-  // checking.
-  auto rtype = this->Resolve(param->type);
-  // This is a special case ... not sure if there is a better way
-  // to handle this.
-  param->var->checked_type_ = rtype;
-  return {ParamNode::make(param->var, rtype), rtype};
-}
-
-CheckedExpr TypeInferencer::VisitFunction(const Function& f, bool generalize) {
-  // First we add the parameters to the context allowing us to check their
-  // types.
-
-  // TODO(@jroesch): support polymorphism
+  Type VisitExpr_(const GlobalVarNode* op) final {
+    GlobalVar var = GetRef<GlobalVar>(op);
+    Expr e = env_->Lookup(var);
+    return e->checked_type();
+  }
 
-  std::vector<Type> param_types;
-  std::vector<Param> params;
+  Type VisitExpr_(const ConstantNode* op) final {
+    return op->tensor_type();
+  }
 
-  return this->WithScope<CheckedExpr>([&]() -> CheckedExpr {
-    for (auto param : f->params) {
-      CheckedExpr checked_param = this->Infer(param);
-      Type arg_type;
-      param_types.push_back(checked_param.type);
-      params.push_back(GetRef<Param>(checked_param.expr.as<ParamNode>()));
-      this->context.Insert(param->var, checked_param.type);
+  Type VisitExpr_(const TupleNode* op) final {
+    // TODO(tqchen, jroesch)
+    // tuple should be a constraint in the type solver
+    // to handle cases where the field type is not known.
+    Array<Type> fields;
+    for (Expr field : op->fields) {
+      fields.push_back(GetType(field));
     }
+    return TupleTypeNode::make(fields);
+  }
 
-    auto checked_body = this->Infer(f->body);
-    auto inferred_rtype = checked_body.type;
-    auto annotated_rtype = Resolve(f->ret_type);
-
-    auto unified_rtype = this->Unify(inferred_rtype, annotated_rtype, f->span);
-
-    CHECK(RelationsHold(true));
-
-    Array<TypeConstraint> cs;
+  Type VisitExpr_(const OpNode* op) final {
+    return op->op_type;
+  }
 
-    for (auto cons : this->context.constraints.back()) {
-      cs.push_back(cons.ToTypeRel());
+  Type VisitExpr_(const LetNode* op) final {
+    Type vtype = GetType(op->value);
+    if (op->value_type.defined()) {
+      vtype = Unify(vtype, op->value_type, op->span);
     }
-
-    return {FunctionNode::make(params, unified_rtype, checked_body.expr, {}),
-            FuncTypeNode::make(param_types, unified_rtype, {}, cs)};
-  });
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const FunctionNode* op) {
-  return this->VisitFunction(GetRef<Function>(op), false);
-}
-
-FuncType TypeInferencer::Instantiate(FuncType fn_ty,
-                                     tvm::Array<Type>& ty_args) {
-  tvm::Map<TypeParam, Type> subst_map;
-
-  // Build a subsitituion map up from the function type and type arguments.
-  // Eventually allow the type vars to be passed in.
-  for (auto ty_param : fn_ty->type_params) {
-    IncompleteType fresh = IncompleteTypeNode::make(ty_param->kind);
-    this->unifier->Insert(fresh);
-    ty_args.push_back(fresh);
-    subst_map.Set(ty_param, fresh);
+    CHECK(!type_map_.count(op->var));
+    // NOTE: no scoping is necessary becase var are unique in program
+    type_map_[op->var] = vtype;
+    return GetType(op->body);
   }
 
-  Type inst_ty = FuncTypeNode::make(fn_ty->arg_types, fn_ty->ret_type, {},
-                                    fn_ty->type_constraints);
-  inst_ty = TypeSubst(inst_ty, subst_map);
-
-  CHECK(KindCheck(this->env, inst_ty));
-
-  return GetRef<FuncType>(inst_ty.as<FuncTypeNode>());
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const CallNode* op) {
-  Call c = GetRef<Call>(op);
-
-  auto checked_op = this->Infer(c->op);
-
-  RELAY_LOG(INFO) << "TypeInferencer::VisitExpr_ op=" << c << std::endl
-                  << "fn_ty=" << checked_op.type << std::endl;
-
-  auto fn_ty_node = checked_op.type.as<FuncTypeNode>();
-
-  if (!fn_ty_node) {
-    this->FatalError("only expressions with function types can be called",
-                     c->op->span);
+  Type VisitExpr_(const IfNode* op) final {
+    // Ensure the type of the guard is of Tensor[Bool, ()],
+    // that is a rank-0 boolean tensor.
+    Type cond_type = this->GetType(op->cond);
+    this->Unify(cond_type,
+                TensorTypeNode::Scalar(tvm::Bool()),
+                op->cond->span);
+    Type checked_true = this->GetType(op->true_branch);
+    Type checked_false = this->GetType(op->false_branch);
+    return this->Unify(checked_true, checked_false, op->span);
   }
 
-  // We now have a function type.
-  FuncType fn_ty = GetRef<FuncType>(fn_ty_node);
-
-  tvm::Array<Type> ty_args;
-  if (ty_args.size() != 0) {
-    throw Error("found manually suplied type args, not supported");
+  // Handle special case basic primitive operator,
+  // if successful return the return type
+  Type PrimitiveCall(const FuncTypeNode* op,
+                     Array<Type> arg_types,
+                     const Attrs& attrs) {
+    if (op->type_params.size() != arg_types.size() + 1) return Type();
+    if (op->type_constraints.size() != 1) return Type();
+    const TypeRelationNode* rel = op->type_constraints[0].as<TypeRelationNode>();
+    if (rel == nullptr) return Type();
+    // validate if the type parameter matches up
+    for (size_t i = 0; i < op->type_params.size(); ++i) {
+      if (!op->type_params[i].same_as(rel->args[i])) return Type();
+    }
+    Type rtype = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+    arg_types.push_back(rtype);
+    // we can do simple replacement here
+    solver_.AddConstraint(TypeRelationNode::make(
+        rel->func, arg_types, arg_types.size() - 1, attrs));
+    return rtype;
   }
 
-  fn_ty = Instantiate(fn_ty, ty_args);
+  // instantiate the function type with fresh
+  FuncType Instantiate(const FuncTypeNode* fn_ty, Array<Type>* ty_args) {
+    tvm::Map<TypeParam, Type> subst_map;
 
-  std::vector<Type> arg_types;
-  std::vector<Expr> checked_args;
-
-  for (auto arg : c->args) {
-    auto checked_arg = this->Infer(arg);
-    arg_types.push_back(checked_arg.type);
-    checked_args.push_back(checked_arg.expr);
+    // Build a subsitituion map up from the function type and type arguments.
+    // Eventually allow the type vars to be passed in.
+    for (auto ty_param : fn_ty->type_params) {
+      IncompleteType fresh = IncompleteTypeNode::make(ty_param->kind);
+      subst_map.Set(ty_param, fresh);
+      ty_args->push_back(fresh);
+    }
+    Type ret_type = fn_ty->ret_type;
+
+    // If the function type is incomplete, place a new IncompleteType
+    // This relax the fn_ty to inputs -> Any
+    // The type checking can still pass when there are additional constraints on the type
+    // This is a temporary work around to check recursive functions whose
+    // return type is not yet known.
+    if (!ret_type.defined()) {
+      ret_type = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+    }
+    Type inst_ty = FuncTypeNode::make(fn_ty->arg_types,
+                                      ret_type, {},
+                                      fn_ty->type_constraints);
+    inst_ty = TypeSubst(inst_ty, subst_map);
+    return Downcast<FuncType>(inst_ty);
   }
 
-  auto type_arity = fn_ty->arg_types.size();
-  auto number_of_args = arg_types.size();
+  // Handle general call node.
+  Type GeneralCall(const CallNode* op, Array<Type> arg_types) {
+    Type ftype = GetType(op->op);
+    auto* fn_ty_node = ftype.as<FuncTypeNode>();
+    CHECK(fn_ty_node != nullptr)
+        << "only expressions with function types can be called, at "
+        << op->span;
+
+    Array<Type> type_args;
+    FuncType fn_ty = Instantiate(fn_ty_node, &type_args);
+    size_t type_arity = fn_ty->arg_types.size();
+    size_t number_of_args = arg_types.size();
+
+    if (type_arity != number_of_args) {
+      if (type_arity < number_of_args) {
+        LOG(FATAL) << "the function is provided too many arguments " << op->span;
+      } else {
+        LOG(FATAL) << "the function is provided too few arguments" << op->span;
+      }
+    }
+    for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
+      this->Unify(fn_ty->arg_types[i], arg_types[i], op->args[i]->span);
+    }
 
-  if (type_arity != number_of_args) {
-    if (type_arity < number_of_args) {
-      this->FatalError("the function is provided too many arguments", c->span);
-    } else {
-      this->FatalError("the function is provided too few arguments", c->span);
+    for (auto cs : fn_ty->type_constraints) {
+      solver_.AddConstraint(cs);
     }
+    return fn_ty->ret_type;
   }
 
-  for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
-    this->Unify(fn_ty->arg_types[i], arg_types[i], c->args[i]->span);
+  Type VisitExpr_(const CallNode* op) final {
+    // Fast path: well-formed primitive op
+    Array<Type> arg_types;
+    for (Expr arg : op->args) {
+      arg_types.push_back(GetType(arg));
+    }
+    if (const OpNode* opnode = op->op.as<OpNode>()) {
+      Type rtype = PrimitiveCall(opnode->op_type.as<FuncTypeNode>(),
+                                 arg_types,
+                                 op->attrs);
+      if (rtype.defined()) return rtype;
+    }
+    return GeneralCall(op, arg_types);
   }
 
-  // After we unify the arguments we should know more about the type
-  // arguments, let's run a quick pass over them to find new
-  // representatives.
-
-  for (size_t i = 0; i < ty_args.size(); i++) {
-    ty_args.Set(i, this->unifier->Subst(ty_args[i]));
+  Type VisitExpr_(const FunctionNode* f) final {
+    for (auto param : f->params) {
+      type_map_[param->var] = param->type;
+      type_map_[param] = param->type;
+    }
+    Type rtype = GetType(f->body);
+    // Run solver using the currently known information
+    solver_.Solve();
+    // Trying to resolve
+    Array<Type> arg_types;
+    for (size_t i = 0; i < f->params.size(); ++i) {
+      Param param = f->params[i];
+      Type atype = solver_.Resolve(param->type);
+      CHECK(atype.as<IncompleteTypeNode>() == nullptr)
+          << "Cannot resolve type of " << i
+          << "-th parameter of function at" << f->span;
+      arg_types.push_back(atype);
+    }
+    rtype = solver_.Resolve(rtype);
+    CHECK(rtype.as<IncompleteTypeNode>() == nullptr)
+        << "Cannot resolve return type of function at" << f->span;
+    // do not support constraint lifting for now.
+    return FuncTypeNode::make(arg_types, rtype, f->type_params, {});
   }
+};
 
-  // Add type constraints from the function types.
-  for (auto cs : fn_ty->type_constraints) {
-    context.AddConstraint(cs);
+class TypeInferencer::Resolver : public ExprMutator {
+ public:
+  Resolver(const std::unordered_map<Expr, Type, NodeHash, NodeEqual>& tmap,
+           TypeSolver* solver)
+      : tmap_(tmap), solver_(solver) {
   }
 
-  auto new_call =
-      CallNode::make(checked_op.expr, checked_args, c->attrs, ty_args);
-
-  return {new_call, fn_ty->ret_type};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const LetNode* op) {
-  Let let = GetRef<Let>(op);
-
-  CheckedExpr checked_value;
-  Type annotated_ty = Resolve(let->value_type);
-
-  // If we are let-defining a function, we want to be able to
-  // recursively name the function in order to support recursive
-  // local definitions.
-  if (let->value.as<FunctionNode>()) {
-    context.Insert(let->var, annotated_ty);
-    checked_value = Infer(let->value);
-  } else {
-    checked_value = Infer(let->value);
+  Expr VisitExpr_(const VarNode* op) final {
+    return AttachCheckedType(op);
   }
 
-  Type unified_ty = this->Unify(checked_value.type, annotated_ty, let->span);
-
-  // Update type context with unified type now that we have
-  // solved this equation.
-  context.Insert(let->var, unified_ty);
-
-  auto checked_body = Infer(let->body);
-
-  auto checked_let = LetNode::make(let->var, checked_value.expr,
-                                   checked_body.expr, let->value_type);
-
-  return {checked_let, checked_body.type};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const IfNode* op) {
-  If ifn = GetRef<If>(op);
-
-  // Ensure the type of the guard is of Tensor[Bool, ()],
-  // that is a rank-0 boolean tensor.
-  auto checked_cond = this->Infer(ifn->cond);
-  auto cond_type = checked_cond.type;
-
-  this->Unify(cond_type, TensorTypeNode::make({}, HalideIR::Bool()),
-              ifn->cond->span);
-  auto checked_true = this->Infer(ifn->true_branch);
-  auto checked_false = this->Infer(ifn->false_branch);
-  auto unified_type =
-      this->Unify(checked_true.type, checked_false.type, ifn->span);
-  auto checked_if =
-      IfNode::make(checked_cond.expr, checked_true.expr, checked_false.expr);
-  return {checked_if, unified_type};
-}
-
-CheckedExpr TypeInferencer::VisitExpr_(const OpNode* op_node) {
-  auto op = GetRef<Op>(op_node);
-  return {op, op->op_type};
-}
-
-Type TypeInferencer::Resolve(const Type &t) {
-  if (t.defined()) {
-    return ::tvm::relay::Resolve(this->unifier, t);
-  } else {
-    return IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+  Expr VisitExpr_(const ConstantNode* op) final {
+    return AttachCheckedType(op);
   }
-}
 
-Expr TypeInferencer::Resolve(const Expr &e) {
-  CHECK(e.defined());
-  return ::tvm::relay::Resolve(this->unifier, e);
-}
+  Expr VisitExpr_(const GlobalVarNode* op) final {
+    return AttachCheckedType(op);
+  }
 
-void TypeInferencer::Solve(TypeRelationData & ty_rel) {
-  Array<Type> normalized_args;
+  Expr VisitExpr_(const OpNode* op) final {
+    return ExprMutator::VisitExpr_(op);
+  }
 
-  for (auto arg : ty_rel.args) {
-    normalized_args.push_back(Resolve(arg));
+  Expr VisitExpr_(const TupleNode* op) final {
+    return AttachCheckedType(op);
   }
 
-  auto new_args = ty_rel.func(normalized_args, ty_rel.args.size());
+  Expr VisitExpr_(const ParamNode* op) final {
+    return ExprMutator::VisitExpr_(op);
+  }
 
-  CHECK(new_args.size() == normalized_args.size());
-  tvm::Array<Type> final_args;
+  Expr VisitExpr_(const FunctionNode* op) final {
+    return AttachCheckedType(op);
+  }
 
-  for (size_t i = 0; i < new_args.size(); i++) {
-    ty_rel.args[i] = Unify(normalized_args[i], new_args[i], ty_rel.span);
+  Expr VisitExpr_(const CallNode* op) final {
+    return AttachCheckedType(op);
   }
-}
 
-int NumSolvedVars(const Array<Type>& vars) {
-  int num = 0;
-  for (auto var : vars) {
-    if (!var.as<IncompleteTypeNode>()) {
-      num += 1;
-    }
+  Expr VisitExpr_(const LetNode* op) final {
+    return AttachCheckedType(op);
   }
-  return num;
-}
 
-enum SolverResult : int {
-  Failed = -1,
-  Progress = 0,
-  Done = 1,
-};
+  Expr VisitExpr_(const IfNode* op) final {
+    return AttachCheckedType(op);
+  }
 
-SolverResult TypeInferencer::Solve(std::vector<TypeRelationData>& rels) {
-  // We start in the done state with zero progress.
-  SolverResult status = SolverResult::Done;
-  int progress = 0;
-
-  do {
-    // Upon rentering the loop we reset the state.
-    status = SolverResult::Done;
-    progress = 0;
-
-    std::vector<int> complete;
-
-    int i = 0;
-    // We will now process each relation in order.
-    for (TypeRelationData& ty_rel : rels) {
-      int arity = ty_rel.args.size();
-      int pre_solved = NumSolvedVars(ty_rel.args);
-      RELAY_LOG(INFO) << "TypeInferencer::Solve: "
-                      << "TypeRelation= "
-                      << ", Arity=" << arity << ", Solved=" << pre_solved
-                      << std::endl;
-      // If the relation is already solved then we will make no progress but try
-      // to set the status to done.
-      if (pre_solved == arity) {
-        status = static_cast<SolverResult>((status && SolverResult::Done));
-        complete.push_back(i);
-      // If there are unsolved variables we will try to solve some.
-      } else if (pre_solved < arity) {
-        Solve(ty_rel);
-        int post_solved = NumSolvedVars(ty_rel.args);
-
-        // If we solved any variables we will try to downgrade status to
-        // progress update the type relation, and then bump the progress counter
-        // by one.
-        if (post_solved > pre_solved) {
-          status =
-              static_cast<SolverResult>((status && SolverResult::Progress));
-          progress += 1;
-        }
+  // attach checked type to the mutated node.
+  template<typename T>
+  Expr AttachCheckedType(const T* op) {
+    auto it = tmap_.find(GetRef<Expr>(op));
+    CHECK(it != tmap_.end());
+    Type checked_type = solver_->Resolve(it->second);
+    CHECK(checked_type.as<IncompleteTypeNode>() == nullptr)
+        << "Cannot resolve type of " << GetRef<Expr>(op)
+        << " at " << op->span;
+    Expr new_e = ExprMutator::VisitExpr_(op);
+    if (!checked_type.same_as(new_e->checked_type_)) {
+      // Copy on write optimization
+      // If new_e is an old expression,
+      // we make a copy mutating an existing reference.
+      if (!new_e.node_.unique()) {
+        new_e = Expr(make_node<T>(*new_e.as<T>()));
       }
-      i++;
+      new_e->checked_type_ = checked_type;
     }
-
-    // If we made no progress and we aren't finished, then the state should be
-    // downgraded to fail, then we should exit the loop.
-    if (progress == 0 && status != SolverResult::Done) {
-      status = SolverResult::Failed;
-      break;
-    }
-
-    // Remove the satisfied relations.
-    for (auto i : complete) {
-      if (rels.size() > 1) {
-        rels[i] = rels.back();
-        rels.pop_back();
-      } else {
-        rels.pop_back();
-      }
-    }
-
-    std::reverse(rels.begin(), rels.end());
-  } while (status == SolverResult::Progress);
-  return status;
-}
-
-bool TypeInferencer::RelationsHold(bool scope_only) {
-  // If we are only checking the top scope,
-  // slice out the constraints.
-  //
-  // Otherwise we use all of them.
-  std::vector<std::vector<TypeRelationData> > constraints;
-
-  if (scope_only) {
-    constraints = {context.constraints[0]};
-  } else {
-    constraints = context.constraints;
+    return new_e;
   }
 
-  RELAY_LOG(INFO) << "TypeInferencer::RelationsHold: scope_only= " << scope_only
-                  << std::endl;
-  bool all_hold = true;
-  for (auto ty_rels : context.constraints) {
-    auto status = Solve(ty_rels);
-    RELAY_LOG(INFO) << "status= " << status << std::endl;
-    if (status == SolverResult::Failed || status == SolverResult::Progress) {
-      all_hold = false;
-    } else if (status == SolverResult::Done) {
-      continue;
-    } else {
-      throw InternalError("found invalid value for SolverResult");
-    }
+  Type VisitType(const Type &t) final {
+    return solver_->Resolve(t);
   }
 
-  return all_hold;
+ private:
+  const std::unordered_map<Expr, Type, NodeHash, NodeEqual>& tmap_;
+  TypeSolver* solver_;
+};
+
+
+Expr TypeInferencer::Infer(Expr expr) {
+  // step 0: populate the constraints
+  GetType(expr);
+  // step 1: solve the constraints
+  solver_.Solve();
+  // step 2: attach resolved types to checked_type field
+  return Resolver(type_map_, &solver_).VisitExpr(expr);
 }
 
 Expr InferType(const Environment& env, const Expr& e) {
-  TypeInferencer ti(env);
-  auto checked_expr = ti.Infer(e);
-  CHECK(ti.RelationsHold());
-  return ti.Resolve(checked_expr.expr);
+  return TypeInferencer(env).Infer(e);
 }
 
-Expr InferType(const Environment& env, const GlobalVar& var,
+Expr InferType(const Environment& env,
+               const GlobalVar& var,
                const Function& func) {
-  TypeInferencer ti(env);
-  auto func_copy = FunctionNode::make(func->params, func->ret_type, func->body,
-                                      func->type_params);
-  func_copy->checked_type_ = ti.Resolve(func_copy->fn_type());
+  Function func_copy = Function(make_node<FunctionNode>(*func.operator->()));
+  func_copy->checked_type_ = func_copy->fn_type();
   env->functions.Set(var, func_copy);
-  auto checked_expr = ti.Infer(func);
-  CHECK(ti.RelationsHold());
+  Expr func_ret = TypeInferencer(env).Infer(func_copy);
   auto map_node = env->functions.CopyOnWrite();
   map_node->data.erase(var.node_);
-  return ti.Resolve(checked_expr.expr);
-}
-
-void TypeInferencer::FatalError(const std::string& msg, Span sp) {
-  throw FatalTypeError(
-      "internal error: this exception should"
-      "be handled and errors reported with Environment::display_errors\n" +
-      msg);
-}
-
-Type TypeInferencer::Unify(const Type& t1, const Type& t2, Span sp) {
-  try {
-    return this->unifier->Unify(t1, t2);
-  } catch (const dmlc::Error &e) {
-    std::stringstream ss;
-    ss << "Error unifying `";
-    ss << t1;
-    ss << "` and `";
-    ss << t2;
-    ss << "`: " << e.what();
-    this->FatalError(ss.str(), sp);
-  }
+  return func_ret;
 }
 
 TVM_REGISTER_API("relay._ir_pass.check_expr")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Environment env = args[0];
-      Expr e = args[1];
-      *ret = InferType(env, e);
-    });
-
-// TODO(@jroesch): put in a better namespace.
-TVM_REGISTER_API("relay._ir_pass._get_checked_type")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Expr e = args[0];
-      *ret = e->checked_type();
-    });
-
-/* Incomplete Type */
-
-IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
-  auto n = make_node<IncompleteTypeNode>();
-  n->kind = std::move(kind);
-  return IncompleteType(n);
-}
-
-TVM_REGISTER_API("relay._make.IncompleteType")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      int kind = args[0];
-      *ret = IncompleteTypeNode::make(static_cast<TypeParamNode::Kind>(kind));
-    });
-
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<IncompleteTypeNode>([](const IncompleteTypeNode* node,
-                                         tvm::IRPrinter* p) {
-      p->stream << "IncompleteTypeNode(" << node->kind << ", " << node << ")";
-    });
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    Environment env = args[0];
+    Expr e = args[1];
+    *ret = InferType(env, e);
+  });
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
new file mode 100644
index 000000000000..6e382a69a988
--- /dev/null
+++ b/src/relay/pass/type_solver.cc
@@ -0,0 +1,166 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_solver.cc
+ * \brief Type solver implementations.
+ */
+#include <string>
+#include "type_solver.h"
+
+namespace tvm {
+namespace relay {
+
+class TypeSolver::Reporter : public TypeReporterNode {
+ public:
+  explicit Reporter(TypeSolver* solver)
+      : solver_(solver) {}
+
+  void Assign(const Type& dst, const Type& src) final {
+    solver_->Unify(dst, src);
+  }
+
+  void AssertEQ(const ShapeExpr& lhs, const ShapeExpr& rhs) final {
+    // TODO(tqchen)
+  }
+
+ private:
+  TypeSolver* solver_;
+};
+
+// constructor
+TypeSolver::TypeSolver()
+    : reporter_(make_node<Reporter>(this)) {
+}
+
+// destructor
+TypeSolver::~TypeSolver() {
+  // call destructor of all non-POD arena object
+  for (TypeNode* ptr : type_nodes_) {
+    ptr->~TypeNode();
+  }
+  for (RelationNode* ptr : rel_nodes_) {
+    ptr->~RelationNode();
+  }
+}
+
+// Add equality constraint
+Type TypeSolver::Unify(const Type& dst, const Type& src) {
+  // Known limitation
+  // - handle composite types whose component can be unknown.
+  // - handle shape pattern matching
+  TypeNode* lhs = GetTypeNode(dst);
+  TypeNode* rhs = GetTypeNode(src);
+  if (lhs->resolved_type.as<IncompleteTypeNode>()) {
+    MergeFromTo(lhs, rhs);
+    return rhs->resolved_type;
+  } else if (rhs->resolved_type.as<IncompleteTypeNode>()) {
+    MergeFromTo(rhs, lhs);
+    return lhs->resolved_type;
+  } else {
+    lhs->parent = rhs;
+    CHECK(AlphaEqual(lhs->resolved_type, rhs->resolved_type))
+        << "Incompatible parent types in UF:"
+        << lhs->resolved_type << " and " << rhs->resolved_type;
+    return rhs->resolved_type;
+  }
+}
+
+// Add type constraint to the solver.
+void TypeSolver::AddConstraint(const TypeConstraint& constraint) {
+  if (auto *op = constraint.as<TypeRelationNode>()) {
+    // create a new relation node.
+    RelationNode* rnode = make<RelationNode>();
+    rnode->rel = GetRef<TypeRelation>(op);
+    rel_nodes_.push_back(rnode);
+    // populate the type information.
+    for (size_t i = 0; i < op->args.size(); ++i) {
+      // insert link to the type list
+      LinkNode<TypeNode*>* tlink = make<LinkNode<TypeNode*> >();
+      TypeNode* tnode = GetTypeNode(op->args[i]);
+      tlink->value = tnode;
+      rnode->type_list.Push(tlink);
+      // insert type->relation node
+      LinkNode<RelationNode*>* rlink = make<LinkNode<RelationNode*> >();
+      rlink->value = rnode;
+      tnode->rel_list.Push(rlink);
+    }
+    // add the relation to the working queue.
+    this->AddToQueue(rnode);
+  } else {
+    LOG(FATAL) << "Do not know how to handle constraint type"
+               << constraint->type_key();
+  }
+}
+
+// Resolve a type in the solver context.
+Type TypeSolver::Resolve(const Type& type) {
+  auto it = tmap_.find(type);
+  if (it != tmap_.end()) {
+    return it->second->FindRoot()->resolved_type;
+  } else {
+    return type;
+  }
+}
+
+bool TypeSolver::Solve() {
+  // update until queue is empty
+  while (!update_queue_.empty()) {
+    RelationNode* rnode = update_queue_.front();
+    const auto& rel = rnode->rel;
+    update_queue_.pop();
+    CHECK(!rnode->resolved);
+    // update the relation with given evidence.
+    Array<Type> args;
+    for (auto* tlink = rnode->type_list.head; tlink != nullptr; tlink = tlink->next) {
+      args.push_back(tlink->value->FindRoot()->resolved_type);
+      CHECK_LE(args.size(), rel->args.size());
+    }
+    // call the function
+    bool resolved = rel->func(args, rel->num_inputs, rel->attrs, reporter_);
+    // mark inqueue as false after the function call
+    // so that rnode itself won't get enqueued again.
+    rnode->inqueue = false;
+
+    if (resolved) {
+      ++num_resolved_rels_;
+    }
+    rnode->resolved = resolved;
+  }
+  // This criterion is not necessarily right for all the possible cases
+  // TODO(tqchen): We should also count the number of in-complete types.
+  return num_resolved_rels_ == rel_nodes_.size();
+}
+
+
+// Expose type solver only for debugging purposes.
+TVM_REGISTER_API("relay._ir_pass._test_type_solver")
+.set_body([](runtime::TVMArgs args, runtime::TVMRetValue* ret) {
+    using runtime::PackedFunc;
+    using runtime::TypedPackedFunc;
+    auto solver = std::make_shared<TypeSolver>();
+
+    auto mod = [solver](std::string name) -> PackedFunc {
+      if (name == "Solve") {
+        return TypedPackedFunc<bool()>([solver]() {
+            return solver->Solve();
+          });
+      } else if (name == "Unify") {
+        return TypedPackedFunc<void(Type, Type)>([solver](Type lhs, Type rhs) {
+            solver->Unify(lhs, rhs);
+          });
+      } else if (name == "Resolve") {
+        return TypedPackedFunc<Type(Type)>([solver](Type t) {
+            return solver->Resolve(t);
+          });
+      } else if (name == "AddConstraint") {
+        return TypedPackedFunc<void(TypeConstraint)>([solver](TypeConstraint c) {
+            return solver->AddConstraint(c);
+          });
+      } else {
+        return PackedFunc();
+      }
+    };
+    *ret = runtime::TypedPackedFunc<runtime::PackedFunc(std::string)>(mod);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_solver.h b/src/relay/pass/type_solver.h
new file mode 100644
index 000000000000..30f82f980a75
--- /dev/null
+++ b/src/relay/pass/type_solver.h
@@ -0,0 +1,231 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_solver.h
+ * \brief Solver logic for type inference.
+ */
+#ifndef TVM_RELAY_PASS_TYPE_SOLVER_H_
+#define TVM_RELAY_PASS_TYPE_SOLVER_H_
+
+#include <tvm/relay/type.h>
+#include <tvm/relay/pass.h>
+#include <vector>
+#include <queue>
+#include "../../common/arena.h"
+
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Interface of type solver used in type inference.
+ *
+ * TypeSolver works on a list of constraints among incomplete types.
+ * The user will populate the constraints by AddConstraint and Assign.
+ * Then we can call Solve to trying to resolve the unknown.
+ *
+ * This can be viewed as "type program(computational graph)" of types, where
+ * the type constraint are operators of the graph and the incomplete
+ * types are intermediate value of the graph.
+ * If all the input types are concretely known, we should be able to
+ * just run a forward pass on the "type program" to get all the types.
+ *
+ * The list of constraints representation means we are storing it as a bipartite
+ * graph instead of a DAG. This is because some constraints might go both direction.
+ * TypeSolver could take advantage of bidirectional constraints to deduce input
+ * value given output ones. Never-the-less, we should keep in mind that
+ * there is a "forward direction" that the TypeSolver should take advantage of.
+ */
+class TypeSolver {
+ public:
+  TypeSolver();
+  ~TypeSolver();
+  /*!
+   * \brief Add a type constraint to the solver.
+   * \param constraint The constraint to be added.
+   */
+  void AddConstraint(const TypeConstraint& constraint);
+  /*!
+   * \brief Resolve type to the solution type in the solver.
+   * \param type The type to be resolved.
+   * \return The resolved type.
+   */
+  Type Resolve(const Type& type);
+  /*!
+   * \brief Start to solve the types using the current known information.
+   * \return Whether all the incomplete types has been fully resolved.
+   */
+  bool Solve();
+  /*!
+   * \brief Unify lhs and rhs.
+   * \param lhs The left operand.
+   * \param rhs The right operand
+   */
+  Type Unify(const Type& lhs, const Type& rhs);
+
+ private:
+  class Reporter;
+  struct TypeNode;
+  struct RelationNode;
+  // Internally the solver maintains a bipartite graph of Relation and Types.
+  // All the object in the structure is managed by a arena allocator
+  // which releases the memory upon distruction of the type solver.
+  /*!
+   * \brief Link list node
+   * \tparam T the content data type
+   */
+  template<typename T>
+  struct LinkNode {
+    /*! \brief The content value */
+    T value;
+    /*! \brief pointer to the next location */
+    LinkNode<T>* next{nullptr};
+  };
+  /*!
+   * \brief LinkedList structure
+   * \tparam T the content data type
+   */
+  template<typename T>
+  struct LinkedList {
+    /*! \brief Head pointer */
+    LinkNode<T>* head{nullptr};
+    /*! \brief Tail pointer */
+    LinkNode<T>* tail{nullptr};
+    /*!
+     * \brief Push a new node to the end of the linked list.
+     * \param node The node to be pushed.
+     */
+    void Push(LinkNode<T>* node) {
+      node->next = nullptr;
+      if (this->tail != nullptr) {
+        this->tail->next = node;
+        this->tail = node;
+      } else {
+        head = tail = node;
+      }
+    }
+  };
+  /*!
+   * \brief type node struct
+   *  TypeNode implements a union-find data structure(via parent)
+   *  that can unifies the same types to the name resolved_type.
+   *
+   *  It also contains collection of links to related Relations,
+   *  which is stored in rel_list.
+   */
+  struct TypeNode {
+    /*! \brief The final resolved type */
+    Type resolved_type;
+    /*! \brief type node in the union find algorithm */
+    TypeNode* parent{nullptr};
+    /*! \brief list of relations that is related to this type node */
+    LinkedList<RelationNode*> rel_list;
+    /*!
+     * \brief Find the root type node, perform path compression
+     * \return The root type node.
+     */
+    TypeNode* FindRoot() {
+      // fast path
+      if (this->parent == nullptr) return this;
+      // slow path with path compression.
+      TypeNode* root = this;
+      while (root->parent != nullptr) {
+        root = root->parent;
+      }
+      for (TypeNode* p = this; p != root;) {
+        TypeNode* parent = p->parent;
+        p->parent = root;
+        p = parent;
+      }
+      return root;
+    }
+  };
+  /*! \brief relation node */
+  struct RelationNode {
+    /*! \brief Whether the relation is in the queue to be solved */
+    bool inqueue{false};
+    /*! \brief Whether the relation is resolved */
+    bool resolved{false};
+    /*! \brief The corresponding type relation */
+    TypeRelation rel;
+    /*! \brief list types to this relation */
+    LinkedList<TypeNode*> type_list;
+  };
+  /*! \brief List of all allocated type nodes */
+  std::vector<TypeNode*> type_nodes_;
+  /*! \brief List of all allocated relation nodes */
+  std::vector<RelationNode*> rel_nodes_;
+  /*! \brief Number of resolved relations */
+  size_t num_resolved_rels_{0};
+  /*! \brief map from type node to types. */
+  std::unordered_map<Type, TypeNode*, NodeHash, NodeEqual> tmap_;
+  /*! \breif Internal queue to update the relation */
+  std::queue<RelationNode*> update_queue_;
+  /*! \brief allocator of all the internal node obhect*/
+  common::Arena arena_;
+  /*! \brief Reporter that reports back to self */
+  TypeReporter reporter_;
+  /*!
+   * \brief Create function to create a new node ptr via arena
+   * \tparam The type parameter
+   * \return The node pointer.
+   */
+  template<typename T>
+  T* make() {
+    T* ptr = arena_.Alloc<T>();
+    // call constructor
+    new (ptr) T();
+    return ptr;
+  }
+  /*!
+   * \brief GetTypeNode that is corresponds to t.
+   * if it do not exist, create a new one.
+   * \return The type node.
+   */
+  TypeNode* GetTypeNode(const Type& t) {
+    auto it = tmap_.find(t);
+    if (it != tmap_.end()) {
+      return it->second->FindRoot();
+    } else {
+      TypeNode* n = make<TypeNode>();
+      type_nodes_.push_back(n);
+      n->resolved_type = t;
+      tmap_[t] = n;
+      return n;
+    }
+  }
+  /*!
+   * \brief Add relation node rel to the update queue
+   * \param rel The relation node
+   */
+  void AddToQueue(RelationNode* rel) {
+    if (rel->inqueue) return;
+    CHECK(!rel->resolved);
+    rel->inqueue = true;
+    update_queue_.push(rel);
+  }
+  /*!
+   * \brief Merge rhs type node to lhs
+   * \param src The source operand
+   * \param dst The dst operand.
+   */
+  void MergeFromTo(TypeNode* src, TypeNode* dst) {
+    if (src == dst) return;
+    src->parent = dst;
+    // move the link to the to dst
+    for (auto* rlink = src->rel_list.head; rlink != nullptr;) {
+      // store next pointer first before rlink get moved
+      auto* next = rlink->next;
+      // if the relation is not yet resolved
+      // send the relation to the new
+      if (!rlink->value->resolved) {
+        this->AddToQueue(rlink->value);
+        dst->rel_list.Push(rlink);
+      }
+      rlink = next;
+    }
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_TYPE_SOLVER_H_
diff --git a/src/relay/pass/type_visitor.h b/src/relay/pass/type_visitor.h
index c37b536ce0d0..8148942657ba 100644
--- a/src/relay/pass/type_visitor.h
+++ b/src/relay/pass/type_visitor.h
@@ -108,11 +108,14 @@ struct TypeMutator : TypeFunctor<Type(const Type& n)> {
     for (const Type& t : type_rel->args) {
       new_args.push_back(this->VisitType(t));
     }
-    return TypeRelationNode::make(type_rel->name, type_rel->func_, new_args);
+    return TypeRelationNode::make(type_rel->func,
+                                  new_args,
+                                  type_rel->num_inputs,
+                                  type_rel->attrs);
   }
 
   Type VisitType_(const IncompleteTypeNode* op) override {
-    return GetRef<IncompleteType>(op);
+    return GetRef<Type>(op);
   }
 };
 
diff --git a/src/relay/pass/unifier.cc b/src/relay/pass/unifier.cc
deleted file mode 100644
index 67cc58ffc0a3..000000000000
--- a/src/relay/pass/unifier.cc
+++ /dev/null
@@ -1,324 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/src/relay/pass/unifier.cc
- * \brief The type unifier which solves a system of equations between
- * incomplete types.
- */
-
-#include "./unifier.h"
-#include <tvm/relay/error.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/logging.h>
-#include <tvm/relay/pass.h>
-#include <tvm/relay/type.h>
-#include "./type_subst.h"
-#include "./type_visitor.h"
-
-namespace tvm {
-namespace relay {
-
-using tvm::IRPrinter;
-using namespace tvm::runtime;
-
-UnionFind UnionFindNode::make(tvm::Map<IncompleteType, Type> uf_map) {
-  auto n = make_node<UnionFindNode>();
-  n->uf_map = uf_map;
-  return UnionFind(n);
-}
-
-void UnionFindNode::Insert(const IncompleteType& v) { this->uf_map.Set(v, v); }
-
-void UnionFindNode::debug() {
-  for (const auto& entry : this->uf_map) {
-    RELAY_LOG(INFO) << entry.first << " = " << entry.second << std::endl;
-  }
-}
-
-void UnionFindNode::AssertAlphaEqual(const Type& l, const Type& r) {
-  if (!AlphaEqual(l, r)) {
-    std::stringstream ss;
-    ss << "Incompatible parent types in UF:" << l << " and " << r;
-    throw UnionFindError(ss.str());
-  }
-}
-
-void UnionFindNode::Unify(const IncompleteType& v1, const Type& t) {
-  RELAY_LOG(INFO) << "UnionFindNode::Unify v1=" << v1 << ", t=" << t
-                  << std::endl;
-  auto parent1 = this->Find(v1);
-
-  // if t is a type var, then unify parents
-  const IncompleteTypeNode *tvn2 = t.as<IncompleteTypeNode>();
-  if (tvn2) {
-    auto v2 = GetRef<IncompleteType>(tvn2);
-    auto parent2 = this->Find(v2);
-
-    // if parents are exactly equal, then we're done
-    if (parent1 == parent2) {
-      return;
-    }
-
-    // if first parent is a type var, then can just set its union find map to
-    // second parent
-    if (const IncompleteTypeNode *pvn1 = parent1.as<IncompleteTypeNode>()) {
-      auto pv1 = GetRef<IncompleteType>(pvn1);
-      this->uf_map.Set(pv1, parent2);
-      return;
-    }
-
-    // if second parent is a type var but first isn't, can set second type var
-    if (const IncompleteTypeNode *pvn2 = parent2.as<IncompleteTypeNode>()) {
-      auto pv2 = GetRef<IncompleteType>(pvn2);
-      this->uf_map.Set(pv2, parent1);
-      return;
-    }
-
-    // if both parents are not type vars themselves, check alpha-equality
-    AssertAlphaEqual(parent1, parent2);
-    return;
-  }
-
-  // if t is not a type var, then unify with v1's parent if parent is a type
-  // var; else, check alpha-equality for compatibility
-  if (const IncompleteTypeNode *pvn1 = parent1.as<IncompleteTypeNode>()) {
-    auto pv1 = GetRef<IncompleteType>(pvn1);
-    this->uf_map.Set(pv1, t);
-    return;
-  }
-
-  AssertAlphaEqual(parent1, t);
-}
-
-Type UnionFindNode::Find(const IncompleteType& v) {
-  // The node has no mapping, so its representative is just itself.
-  if (this->uf_map.find(v) == this->uf_map.end()) {
-    return v;
-  }
-
-  Type parent = this->uf_map.at(v);
-
-  if (v == parent) {
-    return v;
-  }
-
-  // if parent is not a type var, then it must be the representative type
-  const IncompleteTypeNode *rep = parent.as<IncompleteTypeNode>();
-  if (!rep) {
-    return parent;
-  }
-
-  // otherwise, recurse and perform path compression
-  IncompleteType pv = GetRef<IncompleteType>(rep);
-  Type higher_up = this->Find(pv);
-  this->uf_map.Set(v, higher_up);
-  return higher_up;
-}
-
-TVM_REGISTER_API("relay._make.UnionFind")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
-      if (args.size() == 0) {
-        *ret = UnionFindNode::make({});
-      } else {
-        *ret = UnionFindNode::make(args[0]);
-      }
-    });
-
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<UnionFindNode>([](const UnionFindNode *node,
-                                    tvm::IRPrinter *p) {
-      p->stream << "UnionFindNode(" << node->uf_map << ")";
-    });
-
-TypeUnifier TypeUnifierNode::make(UnionFind union_find) {
-  auto n = make_node<TypeUnifierNode>();
-  n->union_find = union_find;
-  return TypeUnifier(n);
-}
-
-void TypeUnifierNode::Insert(const IncompleteType& v) {
-  this->union_find->Insert(v);
-}
-
-Type TypeUnifierNode::Unify(const Type& t1, const Type& t2) {
-  RELAY_LOG(INFO) << "TypeUnifierNode::unify: t1=" << t1 << " t2=" << t2
-                  << std::endl;
-
-  Type unified = this->VisitType(t1, t2);
-  // TODO(@jroesch): Restore this code when we finish kind checker.
-  // if (!check_kind(unified)) {
-  // throw UnificationError("Invalid kinds in unified type");
-  // }
-  return unified;
-}
-
-struct IncompleteTypeSubst : TypeMutator {
-  const TypeUnifierNode *unifier;
-
-  IncompleteTypeSubst(const TypeUnifierNode *unifier) : unifier(unifier) {}
-
-  // type var: look it up in the type map and recurse
-  Type VisitType_(const IncompleteTypeNode* op) override {
-    auto tv = GetRef<IncompleteType>(op);
-    auto parent = unifier->union_find->Find(tv);
-    if (parent == tv) {
-      return tv;
-    }
-    return this->VisitType(parent);
-  }
-};
-
-Type TypeUnifierNode::Subst(const Type& t) {
-  IncompleteTypeSubst tvsubst(this);
-  // normalize first so substitutions in quantifiers will be correct
-  Type ret = tvsubst.VisitType(t);
-  // TODO(@jroesch): Restore this code when we finish kind checker.
-  // if (!check_kind(ret)) {
-  // std::stringstream ss;
-  // ss << "Invalid Kinds in substituted type!";
-  // ss << t << std::endl;
-  // ss << ret << std::endl;
-  // throw SubstitutionError(ss.str());
-  // }
-  return ret;
-}
-
-Type TypeUnifierNode::VisitType(const Type& t1, const Type t2) {
-  // When the right hand size is a type variable immediately unify.
-  if (const IncompleteTypeNode *tvn2 = t2.as<IncompleteTypeNode>()) {
-    return this->UnifyWithIncompleteType(t1, GetRef<IncompleteType>(tvn2));
-  } else {
-    return TypeFunctor<Type(const Type &t1, const Type t2)>::VisitType(t1, t2);
-  }
-}
-
-Type TypeUnifierNode::UnifyWithIncompleteType(const Type& t1,
-                                              const IncompleteType tv2) {
-  RELAY_LOG(INFO) << "unifyWithIncompleteType: t1=" << t1 << " t2=" << tv2
-                  << std::endl;
-  // Fix unify to return new representative
-  this->union_find->Unify(tv2, t1);
-  auto rep = this->union_find->Find(tv2);
-  RELAY_LOG(INFO) << "unifyWithIncompleteType: rep =" << rep << std::endl;
-  return rep;
-}
-
-Type TypeUnifierNode::VisitType_(const IncompleteTypeNode* t1, const Type rt2) {
-  IncompleteType tv1 = GetRef<IncompleteType>(t1);
-  RELAY_LOG(INFO) << "VisitType_: IncompleteTypeNode t1=" << t1 << " = " << rt2
-                  << std::endl;
-  this->union_find->Unify(tv1, rt2);
-  auto rep = this->union_find->Find(tv1);
-  RELAY_LOG(INFO) << "VisitType_: IncompleteTypeNode rep=" << rep << std::endl;
-  return rep;
-}
-
-Type TypeUnifierNode::VisitType_(const TypeParamNode* t1, const Type rt2) {
-  TypeParam ti1 = GetRef<TypeParam>(t1);
-
-  if (const TypeParamNode *tin2 = rt2.as<TypeParamNode>()) {
-    TypeParam ti2 = GetRef<TypeParam>(tin2);
-
-    if (ti1 != ti2) {
-      throw UnificationError("Attempting to unify non-matching TypeParams");
-    }
-
-    return ti1;
-  }
-
-  throw UnificationError("Unable to unify TypeParamNode");
-}
-
-Type TypeUnifierNode::VisitType_(const FuncTypeNode* t1, const Type rt2) {
-  FuncType ft1 = GetRef<FuncType>(t1);
-
-  if (const FuncTypeNode *tan2 = rt2.as<FuncTypeNode>()) {
-    FuncType ft2 = GetRef<FuncType>(tan2);
-
-    if (ft1->type_params.size() != ft2->type_params.size()) {
-      throw UnificationError(
-          "unable to unify functions with differing number of type parameters");
-    }
-
-    tvm::Map<TypeParam, Type> subst_map;
-
-    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
-      subst_map.Set(ft1->type_params[i], ft2->type_params[i]);
-    }
-
-    ft1 = Downcast<FuncType>(TypeSubst(ft1, subst_map));
-
-    if (ft1->arg_types.size() != ft2->arg_types.size()) {
-      throw UnificationError("unable to unify functions of different arities");
-    }
-
-    tvm::Array<Type> unified_args;
-    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
-      unified_args.push_back(
-          this->VisitType(ft1->arg_types[i], ft2->arg_types[i]));
-    }
-
-    Type unified_ret_type = this->VisitType(ft1->ret_type, ft2->ret_type);
-
-    return FuncTypeNode::make(unified_args, unified_ret_type, {}, {});
-  }
-
-  throw UnificationError("unable to unify function types");
-}
-
-Type TypeUnifierNode::VisitType_(const TensorTypeNode* t1, const Type rt2) {
-  TensorType tt1 = GetRef<TensorType>(t1);
-
-  if (const TensorTypeNode *ttn2 = rt2.as<TensorTypeNode>()) {
-    TensorType tt2 = GetRef<TensorType>(ttn2);
-
-    if (!AlphaEqual(tt1, tt2)) {
-      throw UnificationError("dtypes do not match");
-    }
-
-    RELAY_LOG(INFO) << "Unify Tensor Shape s1=" << tt1->shape
-                    << " s2= " << tt2->shape << std::endl;
-
-    if (tt1->shape.size() != tt2->shape.size()) {
-      throw UnificationError("shapes are not of the same length");
-    }
-
-    for (size_t i = 0U; i < tt1->shape.size(); i++) {
-      if (!tt1->shape[i].same_as(tt2->shape[i])) {
-        throw UnificationError("shapes do not match at index");
-      }
-    }
-
-    return rt2;
-  }
-
-  throw UnificationError("Cannot unify TensorTypeNode");
-}
-
-Type TypeUnifierNode::VisitType_(const TupleTypeNode* t1, const Type rt2) {
-  TupleType pt1 = GetRef<TupleType>(t1);
-
-  if (const TupleTypeNode *ptn2 = rt2.as<TupleTypeNode>()) {
-    TupleType pt2 = GetRef<TupleType>(ptn2);
-
-    std::vector<Type> unified_fields;
-    if (pt1->fields.size() != pt2->fields.size()) {
-      throw UnificationError("Product types are of different dimensions");
-    }
-
-    for (size_t i = 0U; i < pt1->fields.size(); i++) {
-      Type unified = this->VisitType(pt1->fields[i], pt2->fields[i]);
-      unified_fields.push_back(unified);
-    }
-
-    return TupleTypeNode::make(unified_fields);
-  }
-
-  throw UnificationError("Cannot unify TupleTypeNode");
-}
-
-Type TypeUnifierNode::VisitType_(const TypeRelationNode* tr1, const Type t2) {
-  throw InternalError("Cannot unify different type relations");
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/pass/unifier.h b/src/relay/pass/unifier.h
deleted file mode 100644
index feda644cdd1d..000000000000
--- a/src/relay/pass/unifier.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file include/tvm/relay/pass/unifier.h
- * \brief The type unifier which solves a system of equations between
- * incomplete types.
- */
-#ifndef TVM_RELAY_PASS_UNIFIER_H_
-#define TVM_RELAY_PASS_UNIFIER_H_
-
-#include <tvm/relay/expr.h>
-#include <string>
-#include "./type_functor.h"
-
-namespace tvm {
-namespace relay {
-
-struct UnionFindError : dmlc::Error {
-  explicit UnionFindError(const std::string& msg) : Error(msg) {}
-};
-
-struct UnificationError : dmlc::Error {
-  explicit UnificationError(const std::string& msg) : Error(msg) {}
-};
-
-struct SubstitutionError : dmlc::Error {
-  explicit SubstitutionError(const std::string& msg) : Error(msg) {}
-};
-
-/*! \brief A union-find data structure for the type-checker */
-class UnionFind;
-
-class UnionFindNode : public Node {
- public:
-  /*! \brief The inernal map from incomplete types to their representatives. */
-  tvm::Map<IncompleteType, Type> uf_map;
-
-  UnionFindNode() {}
-
-  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("uf_map", &uf_map); }
-
-  TVM_DLL static UnionFind make(tvm::Map<IncompleteType, Type> uf_map);
-
-  /*! \brief Insert it into the union find.
-  * \param it The type to add to the union find.
-  */
-  void Insert(const IncompleteType& it);
-
-  /*! \brief Union operation, combine two equivalence classes.
-  * \param it The incomplete type to unify.
-  * \param ty The other type.
-  */
-  void Unify(const IncompleteType& it, const Type& t);
-
-  /*! \brief Find operation, returns the representative of the argument.
-  * \param it The element to lookup.
-  */
-  Type Find(const IncompleteType& it);
-
-  void debug();
-
-  void AssertAlphaEqual(const Type& l, const Type& r);
-
-  static constexpr const char* _type_key = "relay.UnionFind";
-  TVM_DECLARE_NODE_TYPE_INFO(UnionFindNode, Node);
-};
-
-class UnionFind : public NodeRef {
- public:
-  UnionFind() {}
-  explicit UnionFind(NodePtr<tvm::Node> p) : NodeRef(p) {}
-
-  // The union find structure is mutable so we do not use the standard macros
-  // and expose the pointer via `->`.
-  UnionFindNode* operator->() const {
-    return static_cast<UnionFindNode*>(node_.get());
-  }
-
-  using ContainerType = UnionFindNode;
-};
-
-class TypeUnifier;
-class TypeUnifierNode : public Node,
-                        private TypeFunctor<Type(const Type&, const Type)> {
- public:
-  UnionFind union_find;
-
-  TypeUnifierNode() {}
-
-  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("union_find", &union_find); }
-
-  TVM_DLL static TypeUnifier make(UnionFind uf);
-
-  /*! \brief Introduces a new type var into the unifier */
-  void Insert(const IncompleteType& v);
-
-  /*! \brief Unifies two types if possible, throws a unification error if it
-   * cannot  */
-  Type Unify(const Type& t1, const Type& t2);
-
-  /*! \brief Attempts to substitute all type vars in t with concrete types,
-   * throws substitution error if it cannot concretize*/
-  Type Subst(const Type& t);
-
-  // /*! \brief Checks the kinds in the given type */
-  // Type CheckKinds(const Type& t);
-
-  static constexpr const char* _type_key = "relay.TypeUnifier";
-  TVM_DECLARE_NODE_TYPE_INFO(TypeUnifierNode, Node);
-
- private:
-  /*! \brief Unify incomplete type with another type. */
-  Type UnifyWithIncompleteType(const Type& t1, const IncompleteType tvn2);
-  /*! \brief Implements unification between two types with incomplete portions.
-   */
-  Type VisitType(const Type& t1, const Type t2) override;
-
-  // Visitor Cases
-  Type VisitType_(const IncompleteTypeNode* t1, const Type t2) override;
-  Type VisitType_(const TensorTypeNode* t1, const Type t2) override;
-  Type VisitType_(const TypeParamNode* t1, const Type t2) override;
-  Type VisitType_(const FuncTypeNode* t1, const Type t2) override;
-  Type VisitType_(const TupleTypeNode* t1, const Type t2) override;
-  Type VisitType_(const TypeRelationNode* s1, const Type t2) override;
-};
-
-class TypeUnifier : public NodeRef {
- public:
-  TypeUnifier() {}
-  explicit TypeUnifier(NodePtr<tvm::Node> p) : NodeRef(p) {}
-
-  // no const so that unifier can be mutable as a member of typechecker
-  inline TypeUnifierNode* operator->() const {
-    return static_cast<TypeUnifierNode*>(node_.get());
-  }
-
-  using ContainerType = TypeUnifierNode;
-};
-
-}  // namespace relay
-}  // namespace tvm
-#endif  // TVM_RELAY_PASS_UNIFIER_H_
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index fc5d8ee0777d..91cbaf73bd2b 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -27,8 +27,8 @@ def test_tensor_type():
 
 
 def test_type_param():
-    tp = relay.TypeParam('name', relay.Kind.Shape)
-    assert tp.kind == relay.Kind.Shape
+    tp = relay.TypeParam('name', relay.Kind.Type)
+    assert tp.kind == relay.Kind.Type
     # assert tp.span  # TODO allow us to set span
     str(tp)
 
diff --git a/tests/python/relay/test_tyck_eval_integration.py b/tests/python/relay/test_type_infer.py
similarity index 91%
rename from tests/python/relay/test_tyck_eval_integration.py
rename to tests/python/relay/test_type_infer.py
index d95cda0ba819..18cf4b940c4f 100644
--- a/tests/python/relay/test_tyck_eval_integration.py
+++ b/tests/python/relay/test_type_infer.py
@@ -76,10 +76,10 @@ def test_add_broadcast_op():
     assert_has_type(func.to_func(), expected_ty)
 
 def test_dual_op():
-    """Program: 
-       fn (x : Tensor[f32, (10, 10)]) { 
-         let t1 = log(x); 
-         let t2 = add(t1, x); 
+    """Program:
+       fn (x : Tensor[f32, (10, 10)]) {
+         let t1 = log(x);
+         let t2 = add(t1, x);
          return t1;
        }
     """
@@ -93,8 +93,8 @@ def test_dual_op():
 
 
 def test_decl():
-    """Program: 
-       def f(x : Tensor[f32, (10, 10)]) { 
+    """Program:
+       def f(x : Tensor[f32, (10, 10)]) {
            let lx = log(x);
            return lx;
        }
@@ -125,7 +125,7 @@ def f(n: i32, data: f32) -> f32 {
     n = b.param('n', ty='int32')
     data = b.param('data', ty='float32')
     with b.decl(f, n, data):
-        with b.if_scope(equal(n, convert(0.0))):
+        with b.if_scope(equal(n, convert(0))):
             b.ret(f(subtract(n, convert(1)), log(data)))
         with b.else_scope():
             b.ret(data)
@@ -152,11 +152,12 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     assert_decl_has_type(ib.env, try_concat2, fn_ty)
 
 if __name__ == "__main__":
-    # test_monomorphic_let()
-    # test_single_op()
-    # test_add_op()
-    # test_add_broadcast_op()
-    # test_dual_op()
-    # test_decl()
-    # test_recursion()
+    test_recursion()
+
+    test_monomorphic_let()
+    test_single_op()
+    test_add_op()
+    test_add_broadcast_op()
+    test_dual_op()
+    test_decl()
     test_concat()
diff --git a/tests/python/relay/test_type_solver.py b/tests/python/relay/test_type_solver.py
new file mode 100644
index 000000000000..c96ca59d2c8d
--- /dev/null
+++ b/tests/python/relay/test_type_solver.py
@@ -0,0 +1,56 @@
+import tvm
+
+from tvm import relay
+from tvm.relay.ir_builder import scalar_type, convert, tensor_type
+
+
+def make_rel(name, args, num_inputs=None, attrs=None):
+    func = tvm.get_env_func("tvm.relay.type_relation." + name)
+    if num_inputs is None:
+        num_inputs = len(args) - 1
+    return relay.ty.TypeRelation(func, args, num_inputs, attrs)
+
+def make_solver():
+    solver = relay._ir_pass._test_type_solver()
+    solver.Solve = solver("Solve")
+    solver.Unify = solver("Unify")
+    solver.Resolve = solver("Resolve")
+    solver.AddConstraint = solver("AddConstraint")
+
+    def gen_type(name, args, out=None):
+        out = out if out else relay.ty.IncompleteType()
+        solver.AddConstraint(make_rel(name, args + [out]))
+        return out
+
+    solver.gen_type = gen_type
+    return solver
+
+
+def test_bcast():
+    solver = make_solver()
+    t0 = relay.ty.TensorType((10, 20), "float32")
+    t1 = relay.ty.TensorType((10, 1), "float32")
+    tc = relay.ty.TensorType((10, 1, 1), "float32")
+    t2 = solver.gen_type("Broadcast", [t0, t1])
+    t3 = solver.gen_type("Identity", [t2])
+    t4 = solver.gen_type("Broadcast", [t3, tc])
+    assert solver.Solve()
+    assert solver.Resolve(t2) == relay.ty.TensorType((10, 20), "float32")
+    assert solver.Resolve(t4) == relay.ty.TensorType((10, 10, 20), "float32")
+
+
+def test_backward_solving():
+    solver = make_solver()
+    t0 = relay.ty.TensorType((10, 20), "float32")
+    tc = relay.ty.TensorType((10, 1, 1), "float32")
+    t1 = relay.ty.IncompleteType()
+    t3 = solver.gen_type("Broadcast", [t0, t1])
+    t2 = solver.gen_type("Identity", [t1], out=tc)
+    assert solver.Solve()
+    assert solver.Resolve(t3) == relay.ty.TensorType((10, 10, 20), "float32")
+
+
+
+if __name__ == "__main__":
+    test_bcast()
+    test_backward_solving()
diff --git a/tutorials/nnvm/.gitignore b/tutorials/nnvm/.gitignore
new file mode 100644
index 000000000000..5f8a03c46b89
--- /dev/null
+++ b/tutorials/nnvm/.gitignore
@@ -0,0 +1,11 @@
+*.pb
+*.mlmodel
+*.ttf
+*.txt
+*synset*txt
+*.cfg
+ssd_model
+*.names
+*.jpg
+*.pbtxt
+*.weights

From 9db35729ec6d7c2bfe352ca5b21e73710f7848a4 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Sat, 29 Sep 2018 14:21:43 -0700
Subject: [PATCH 139/529] [Relay][DOC] Add tutorial for adding an operator to
 Relay (#1778)

---
 docs/dev/relay_add_op.rst | 148 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 docs/dev/relay_add_op.rst

diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst
new file mode 100644
index 000000000000..6bd6e67700bd
--- /dev/null
+++ b/docs/dev/relay_add_op.rst
@@ -0,0 +1,148 @@
+Adding an Operator to Relay
+===========================
+
+In order to use TVM operators from within the Relay IR, the
+operators need to be registered in Relay in order to ensure
+that they will be integrated into Relay's type system.
+
+Registering an operator requires three steps:
+- Using the ``RELAY_REGISTER_OP`` macro in C++ to
+register the operator's arity and type information
+- Defining a C++ function to produce a call node for the
+operator and registering a Python API hook for the function
+- Wrapping the above Python API hook in a neater interface
+
+The file ``src/relay/op/tensor/elemwise.cc`` provides
+examples of the first two steps, while
+``python/tvm/relay/op/tensor.py`` gives examples of the
+last.
+
+Registering an Operator
+-----------------------
+
+TVM already has an operator registry, but Relay cannot properly
+incorporate TVM operators without additional type information.
+
+To allow for flexibility in registering operators and greater
+expressivity and granularity in expressing types in Relay, operators
+are typed using relations between input and output types. These relations
+are represented as functions that take in a list of input types and
+output types (any of these types may be incomplete) and return a list
+of input and output types that satisfies the relation. Essentially, a
+relation for an operator can enforce all the necessary typing rules
+(namely by inspecting the input types) in addition to computing the
+output type.
+
+For example, see ``src/relay/op/type_relations.h`` and their
+implementations. E.g., ``BroadcastRel`` takes two input types and an
+output type, checks that they are all tensor types with the same underlyin
+data type, and finally ensures that the shape of the output type is the
+broadcast of the input types' shapes.
+
+It may be necessary to add another type relation to ``type_relations.h``
+if the existing ones do not capture the behavior of the desired operator.
+
+The ``RELAY_REGISTER_OP`` macro in C++ allows a developer
+to specify the following information about an operator in Relay:
+- Arity (number of arguments)
+- Names and descriptions for positional arguments
+- Support level (1 indicating an internal intrinsic, higher numbers
+indicating operators that are not as integral to the framework or are
+supported externally)
+- A type relation for the operator
+
+The below example is from ``elemwise.cc`` and uses a broadcasting
+add for tensors:
+
+.. code:: c
+
+    RELAY_REGISTER_OP("add")
+        .set_num_inputs(2)
+        .add_argument("lhs", "Tensor", "The left hand side tensor.")
+        .add_argument("rhs", "Tensor", "The right hand side tensor.")
+        .set_support_level(1)
+        .add_type_rel("Broadcast", BroadcastRel);
+
+Creating a Call Node
+--------------------
+
+This step requires simply writing a function that takes
+the arguments to the operator (as Relay expressions) and
+returning a call node to the operator (i.e., the node that
+should be placed into the Relay AST where the call to the
+operator is intended).
+
+At present call attributes and type arguments (the last two fields)
+are not supported, so it suffices to use ``Op::Get`` to fetch
+the operator's information from the operator registry and pass in
+the arguments to the call node, as below.
+
+.. code:: c
+
+    TVM_REGISTER_API("relay.op._make.add")
+        .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
+            static const Op& op = Op::Get("add");
+          return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+        });
+
+Including a Python API Hook
+---------------------------
+
+It is generally the convention in Relay, that functions exported
+through ``TVM_REGISTER_API`` should be wrapped in a separate
+Python function rather than called directly in Python. In the case
+of the functions that produce calls to operators, it may be convenient
+to bundle them, as in ``python/tvm/relay/op/tensor.py``, where
+elementwise operators on tensors are all provided. For example,
+the following is how the add function from the previous section is
+exposed in Python:
+
+.. code:: python
+
+    def add(lhs, rhs):
+        """Elementwise addition.
+
+        Parameters
+        ----------
+        lhs : relay.Expr
+            The left hand side input data
+        rhs : relay.Expr
+            The right hand side input data
+
+        Returns
+        -------
+        result : relay.Expr
+            The computed result.
+        """
+        return _make.add(lhs, rhs)
+
+Note that these Python wrappers might also be good opportunities to
+provide an easier interface to the operator. For example, the
+``concat`` operator is registered as taking only one operator,
+namely a tuple with the tensors to be concatenated, but the Python
+wrapper takes the tensors as arguments and combines them into a tuple
+before producing the call node:
+
+.. code:: python
+
+    def concat(*args):
+        """Concatenate the input tensors along the zero axis.
+
+        Parameters
+        ----------
+        args: list of Tensor
+
+        Returns
+        -------
+        tensor: The concatenated tensor.
+        """
+        tup = Tuple(list(args))
+        return _make.concat(tup)
+
+Summary
+-------
+- A TVM operator can be registered in Relay using a relation to express
+the appropriate type information.
+- Using an operator in Relay requires a function to produce a
+call node for the operator.
+- It is best to have a simple Python wrapper for producing the call node.

From a47db64e4ff80d69c69a03147e14b6e4f26eed24 Mon Sep 17 00:00:00 2001
From: Albin Joy <albin.joy@huawei.com>
Date: Sun, 30 Sep 2018 03:34:37 +0530
Subject: [PATCH 140/529] [NNVM][TENSORFLOW]Fix lstm testcase to support
 get_output without size input (#1731)

* [NNVM][TENSORFLOW]Fix lstm testcase issue to support get_output without size input

* removed redundant

* Enabled inceptionV1 testcase
---
 .../frontend/tensorflow/test_forward.py       | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 2d965a6540a1..031d934006d1 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -26,7 +26,7 @@
 #######################################################################
 # Generic run functions for TVM & tensorflow
 # ------------------------------------------
-def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype, target='llvm'):
+def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'):
     """ Generic function to compile on nnvm and execute on tvm """
 
     layout = None
@@ -62,10 +62,10 @@ def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype,
     # execute
     m.run()
     # get outputs
-    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+    if num_output > 1:
         tvm_output_list = []
-        for i, s in enumerate(output_shape):
-            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+        for i in range(0, num_output):
+            tvm_output = m.get_output(i)
             tvm_output_list.append(tvm_output.asnumpy())
         return tvm_output_list
     else:
@@ -119,8 +119,7 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False,
             if no_gpu and device == 'cuda':
                 continue
 
-            tvm_output = run_tvm_graph(final_graph_def, in_data,
-                                       in_node, tf_output.shape, tf_output.dtype, target=device)
+            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target=device)
             np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
 
         sess.close()
@@ -572,14 +571,12 @@ def _get_tensorflow_output():
     graph_def, tf_out = _get_tensorflow_output()
     tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
                                ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
-                                'root/lstm_cell/LSTMBlockCell_h'],
-                               [tf_out[0].shape, (2, batch_size, num_hidden)],
-                               [tf_out[0].dtype, tf_out[1].dtype])
+                                'root/lstm_cell/LSTMBlockCell_h'], num_output=2)
     assert isinstance(tvm_output, list)
 
     out = tvm_output[0]
     out_state = tvm_output[1]
-    out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
+    out_state_tup = np.split(out_state, indices_or_sections=2, axis=1)
     out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
     out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
     tvm_out = [out, out_state_c, out_state_h]
@@ -587,7 +584,6 @@ def _get_tensorflow_output():
 
 def test_forward_lstm():
     '''test LSTM block cell'''
-    return
     _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
 
@@ -656,7 +652,7 @@ def test_forward_inception_v3():
 
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', 'InceptionV3/Predictions/Reshape_1:0')
-            tvm_output = run_tvm_graph(graph_def, data, 'input', tf_output.shape, 'float32')
+            tvm_output = run_tvm_graph(graph_def, data, 'input')
             np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
 
 #######################################################################
@@ -692,7 +688,7 @@ def test_forward_inception_v1():
 
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0')
-            tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents', tf_output.shape, 'float32')
+            tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents')
             np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
 
 #######################################################################
@@ -710,7 +706,7 @@ def test_forward_mobilenet():
 
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0')
-            tvm_output = run_tvm_graph(graph_def, data, 'input', tf_output.shape, 'float32')
+            tvm_output = run_tvm_graph(graph_def, data, 'input')
             np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
 
 #######################################################################
@@ -1029,7 +1025,7 @@ def check_mean(ishape, **kwargs):
     test_forward_ptb()
 
     # RNN
-    #test_forward_lstm()
+    test_forward_lstm()
 
     # Elementwise
     test_forward_ceil()

From 6868d7507e73b0e51e010d417c9ad9e2f6661451 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Sat, 29 Sep 2018 15:06:06 -0700
Subject: [PATCH 141/529] support of multiple devices for tvm.build (#1773)

---
 python/tvm/build_module.py                    | 255 +++++++++++-------
 .../unittest/test_runtime_heterogeneous.py    |  30 +--
 2 files changed, 168 insertions(+), 117 deletions(-)

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 8e0d16286d6a..2bb7442bab76 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -379,92 +379,32 @@ def lower(sch,
         return stmt
     return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
 
-def build(sch,
-          args=None,
-          target=None,
-          target_host=None,
-          name="default_function",
-          binds=None,
-          postpone_host_codegen=False):
-    """Build a function with arguments as signature. Code will be generated
-    for a device specified by the target. For homogeneous execution, a module
-    that contains both host and device code is returned. For heterogeneous
-    execution, a list of lowered functions for the host and a module containing
-    device code are returned, but actual code generation for the host module is
-    postponed after code generation is finished for all devices.
+
+def _build_for_device(flist, target, target_host):
+    """Build the lowered functions for a device with the given compilation
+    target.
 
     Parameters
     ----------
-    sch : tvm.Schedule, or LoweredFunc
-        The schedule to be builded
-
-    args : list of Buffer or Tensor or Var, optional
-        The argument lists to the function.
+    flist : list of LoweredFunc
+        The schedule to be built.
 
-    target : str or :any:`tvm.target.Target`, optional
+    target : str or :any:`tvm.target.Target`
         The target and option of the compilation.
 
-    target_host : str or :any:`tvm.target.Target` optional
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm intepreter is used.
-
-    name : str, optional
-        The name of result function.
-
-    binds : dict, optional
-        Dictionary that maps the binding of symbolic buffer to Tensor.
-        By default, a new buffer is created for each tensor in the argument.
-
-    postpone_host_codegen : bool, optional
-        A bool value that indicates if code generation for the host module
-        should be postponed. This variable is set to be true for heterogeneous
-        execution. Otherwise, it is defaulted to false.
+    target_host : str or :any:`tvm.target.Target`
+        The host compilation target.
 
     Returns
     -------
-    ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple
-        A module that combines both host and device code is returned when
-        postpone_host_codegen is not set. Otherwise, a list of lowered
-        functions for the host and a module contains only device code are
-        returned.
+    fhost : list of LoweredFunc
+        A list of lowered functions for the host.
 
-    Note
-    ----
-    See the note on :any:`tvm.target` on target string format.
+    mdev : tvm.module
+        A module that contains device code.
     """
-    if isinstance(sch, schedule.Schedule):
-        if args is None:
-            raise ValueError("args must be given for build from schedule")
-        flist = lower(sch, args,
-                      name=name,
-                      binds=binds)
-        if isinstance(flist, container.LoweredFunc):
-            flist = [flist]
-    elif isinstance(sch, container.LoweredFunc):
-        if args:
-            raise ValueError("args must be done when build from LoweredFunc")
-        flist = [sch]
-    elif isinstance(sch, (list, tuple, container.Array)):
-        flist = sch
-    else:
-        raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-    fname_set = set()
-    for x in flist:
-        if not isinstance(x, container.LoweredFunc):
-            raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-        if x.name in fname_set:
-            raise ValueError("Duplicate function name %s" % x.name)
-        fname_set.add(x.name)
-
-    target = _target.current_target() if target is None else target
-    target = _target.create(target) if target else _target.create("llvm")
+    target = _target.create(target)
     device_type = ndarray.context(target.target_name, 0).device_type
-
     fhost = []
     fdevice = []
     for func in flist:
@@ -496,31 +436,162 @@ def build(sch,
 
     if "gpu" in target.keys and not fdevice:
         warnings.warn(
-            "Specified target %s, but cannot find device code, did you do bind?" % target)
+            "Specified target %s, but cannot find device code, did you do "
+            "bind?" % target)
 
     fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost]
     fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost]
 
-    if not target_host:
-        if device_type == ndarray.cpu(0).device_type:
-            target_host = target
-            assert not fdevice
-        else:
-            target_host = "llvm" if module.enabled("llvm") else "stackvm"
+    if device_type == ndarray.cpu(0).device_type and target_host == target:
+        assert not fdevice
+
     target_host = _target.create(target_host)
-    target_device = target
-    fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice]
+    fdevice = [ir_pass.LowerIntrin(x, target.target_name) for x in fdevice]
     fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost]
     fhost = [ir_pass.CombineContextCall(x) for x in fhost]
+    mdev = codegen.build_module(fdevice, str(target)) if fdevice else None
+
+    return fhost, mdev
+
+
+def build(inputs,
+          args=None,
+          target=None,
+          target_host=None,
+          name="default_function",
+          binds=None):
+    """Build a function with arguments as signature. Code will be generated
+    for devices coupled with target information.
+
+    Parameters
+    ----------
+    inputs : tvm.Schedule, LoweredFunc, or dict of target to LoweredFunc list
+        The schedule to be built
+
+    args : list of Buffer or Tensor or Var, optional
+        The argument lists to the function.
+
+    target : str or :any:`tvm.target.Target`, optional
+        The target and option of the compilation.
+
+    target_host : str or :any:`tvm.target.Target` optional
+        Host compilation target, if target is device.
+        When TVM compiles device specific program such as CUDA,
+        we also need host(CPU) side code to interact with the driver
+        setup the dimensions and parameters correctly.
+        target_host is used to specify the host side codegen target.
+        By default, llvm is used if it is enabled,
+        otherwise a stackvm intepreter is used.
 
-    # Append fhost to the device module and return the updated module. All
-    # device modules will be imported to the host module after all of them are
-    # collected.
-    mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
-    if postpone_host_codegen:
-        return fhost, mdev
+    name : str, optional
+        The name of result function.
+
+    binds : dict, optional
+        Dictionary that maps the binding of symbolic buffer to Tensor.
+        By default, a new buffer is created for each tensor in the argument.
 
-    mhost = codegen.build_module(fhost, str(target_host))
-    if fdevice:
-        mhost.import_module(mdev)
+    Returns
+    -------
+    ret : tvm.module
+        A module that combines both host and device code.
+
+    Examples
+    ________
+    There are two typical example uses of this function depending on the type
+    of the argument `inputs`:
+    1. it is a list of lowered functions:
+
+    .. code-block:: python
+
+        n = 2
+        A = tvm.placeholder((n,), name='A')
+        B = tvm.placeholder((n,), name='B')
+        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+        s = tvm.create_schedule(C.op)
+        f = tvm.lower(s, [A, B, C], name="test_add")
+        m = tvm.build(f, target="llvm")
+
+    2. it is a dict of compilation target to list of lowered functions:
+
+    .. code-block:: python
+
+        n = 2
+        A = tvm.placeholder((n,), name='A')
+        B = tvm.placeholder((n,), name='B')
+        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+        s1 = tvm.create_schedule(C.op)
+        s2 = topi.cpp.cuda.schedule_injective("cuda", [C])
+        f1 = tvm.lower(s1, [A, B, C], name="test_add1")
+        f2 = tvm.lower(s2, [A, B, C], name="test_add2")
+        m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm")
+
+    Note
+    ----
+    See the note on :any:`tvm.target` on target string format.
+    """
+    if isinstance(inputs, schedule.Schedule):
+        if args is None:
+            raise ValueError("args must be given for build from schedule")
+        flist = lower(inputs, args,
+                      name=name,
+                      binds=binds)
+        if isinstance(flist, container.LoweredFunc):
+            flist = [flist]
+    elif isinstance(inputs, container.LoweredFunc):
+        if args:
+            raise ValueError("args must be done when build from LoweredFunc.")
+        flist = [inputs]
+    elif isinstance(inputs, (list, tuple, container.Array)):
+        flist = inputs
+    elif not isinstance(inputs, (dict, container.Map)):
+        raise ValueError("inputs must be Schedule, LoweredFunc, list of "
+                         "LoweredFunc, or dict of target to list of "
+                         "LoweredFunc.")
+
+    if not isinstance(inputs, (dict, container.Map)):
+        target = _target.current_target() if target is None else target
+        target = target if target else "llvm"
+        target_flist = {target: flist}
+    else:
+        target_flist = inputs
+
+    for tar, flist in target_flist.items():
+        if not isinstance(tar, (str, _target.Target)):
+            raise ValueError("The key of inputs must be str or "
+                             "_target.Target when inputs is dict.")
+        fname_set = set()
+        for x in flist:
+            if not isinstance(x, container.LoweredFunc):
+                raise ValueError("inputs must be Schedule, LoweredFunc, list "
+                                 "of LoweredFunc, or dict of str to list of "
+                                 "LoweredFunc.")
+            if x.name in fname_set:
+                raise ValueError("Duplicate function name %s" % x.name)
+            fname_set.add(x.name)
+
+    if not target_host:
+        for tar, _ in target_flist.items():
+            tar = _target.create(tar)
+            device_type = ndarray.context(tar.target_name, 0).device_type
+            if device_type == ndarray.cpu(0).device_type:
+                target_host = tar
+                break
+    if not target_host:
+        target_host = "llvm" if module.enabled("llvm") else "stackvm"
+
+    fhost_all = []
+    device_modules = []
+    for tar, flist in target_flist.items():
+        fhost, mdev = _build_for_device(flist, tar, target_host)
+        # Save the current lowered functions of the host and the device module.
+        fhost_all += fhost
+        device_modules.append(mdev)
+
+    # Generate a unified host module.
+    mhost = codegen.build_module(fhost_all, str(target_host))
+
+    # Import all modules.
+    for mdev in device_modules:
+        if mdev:
+            mhost.import_module(mdev)
     return mhost
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
index b916ee285717..3272165f0b02 100644
--- a/tests/python/unittest/test_runtime_heterogeneous.py
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -124,9 +124,6 @@ def check_device(device, target_device):
         schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
         lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
                               name="elemwise_add")
-        host_funcs_add, lib_add = tvm.build(lower_add, target=target_device,
-                                            name="elemwise_add",
-                                            postpone_host_codegen=True)
 
         # Insert copy. Neither compute nor schedule is required for the copy
         # node. The compute will be performed at runtime which is just data
@@ -142,16 +139,8 @@ def check_device(device, target_device):
                                              elemwise_sub],
                               name="elemwise_sub")
 
-        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
-                                            name="elemwise_sub",
-                                            postpone_host_codegen=True)
-        host_funcs = host_funcs_add + host_funcs_sub
-        mhost = tvm.codegen.build_module(host_funcs, target_host)
-        if lib_add:
-            mhost.import_module(lib_add)
-        if lib_sub:
-            mhost.import_module(lib_sub)
-
+        target_flist = {target_device: [lower_add], target_host: [lower_sub]}
+        mhost = tvm.build(target_flist, target_host=target_host)
         ctx = [host_ctx, device_ctx]
         mod = graph_runtime.create(graph, mhost, ctx)
         params = {}
@@ -338,10 +327,6 @@ def check_device(device, target_device):
         lower_add1 = tvm.lower(
             add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
             name="elemwise_add1")
-        host_funcs_add, lib_add = tvm.build([lower_add0, lower_add1],
-                                            target=target_device,
-                                            postpone_host_codegen=True)
-
         # Create module for sub whose target is the host.
         tensor_c = tvm.placeholder(shape, name="C")
         elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i)
@@ -350,15 +335,10 @@ def check_device(device, target_device):
         lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
                                              elemwise_sub],
                               name="elemwise_sub")
-        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
-                                            postpone_host_codegen=True)
-        host_funcs = host_funcs_add + host_funcs_sub
-        mhost = tvm.codegen.build_module(host_funcs, target_host)
-        if lib_add:
-            mhost.import_module(lib_add)
-        if lib_sub:
-            mhost.import_module(lib_sub)
 
+        target_flist = {target_device: [lower_add0, lower_add1], target_host:
+                        [lower_sub]}
+        mhost = tvm.build(target_flist, target_host=target_host)
         ctx = [host_ctx, device_ctx]
         params = {}
         params["A"] = tensor_a = np.random.uniform(

From 7f2ca0a717e49940c93c90b80c0d9d36335a8a70 Mon Sep 17 00:00:00 2001
From: Stilistik <philipp@rundumeli.ch>
Date: Sun, 30 Sep 2018 00:33:54 +0200
Subject: [PATCH 142/529] Implement tensorflow relational operators and related
 tests (#1714)

---
 nnvm/python/nnvm/frontend/tensorflow.py       | 15 +++++++++++
 .../frontend/tensorflow/test_forward.py       | 27 ++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 3bd3ee079ee0..ad7c4fc6796f 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -759,6 +759,15 @@ def _impl(inputs, attr, params):
                        extras={'axis': tuple(axis.asnumpy())})(inputs[0], attr)
     return _impl
 
+def _broadcast(name):
+    def _impl(inputs, attr, params):
+        op_name = _math_name_picker(name)(attr)
+        return AttrCvt(
+            op_name=op_name,
+            ignores=['name', 'Tidx']
+        )(inputs, attr)
+    return _impl
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -819,6 +828,12 @@ def _impl(inputs, attr, params):
     'Transpose'                         : _transpose(),
     'Tanh'                              : AttrCvt('tanh'),
     'Mean'                              : _mean(),
+    'Less'                              : _broadcast('less'),
+    'Greater'                           : _broadcast('greater'),
+    'LessEqual'                         : _broadcast('less_equal'),
+    'GreaterEqual'                      : _broadcast('greater_equal'),
+    'Equal'                             : _broadcast('equal'),
+    'NotEqual'                          : _broadcast('not_equal'),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 031d934006d1..d73080d1cb00 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -378,7 +378,7 @@ def test_forward_reduce():
     data = np.random.uniform(size=(8,4,9)).astype('float32')
     _test_reduce(tf.reduce_sum, data=data)
     _test_reduce(tf.reduce_sum, data=data, axis=0)
-    _test_reduce(tf.reduce_sum, data=data, axis=(0,1))    
+    _test_reduce(tf.reduce_sum, data=data, axis=(0,1))
 
 
 #######################################################################
@@ -978,6 +978,28 @@ def check_mean(ishape, **kwargs):
     check_mean((10, 8, 16, 32), axis=(2,3))
     check_mean((10, 8, 16, 32), axis=(1,2), keepdims=True)
 
+#######################################################################
+# Relational operators
+# --------------------
+def _test_forward_rel_op(data, func):
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=data[0].shape, dtype=data[0].dtype, name='in1')
+        in2 = tf.placeholder(shape=data[1].shape, dtype=data[1].dtype, name='in2')
+        op = func(in1, in2, name='op')
+        out = tf.cast(op, tf.int32, name='out1')
+        compare_tf_with_tvm([data[0], data[1]], ['in1:0', 'in2:0'], 'out1:0')
+
+def test_forward_rel_ops():
+    t1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    t2 = np.array([[9, 8, 7], [6, 5, 4], [3, 2, 1]])
+    _test_forward_rel_op([t1, t2], math_ops.less)
+    _test_forward_rel_op([t1, t2], math_ops.greater)
+    _test_forward_rel_op([t1, t2], math_ops.less_equal)
+    _test_forward_rel_op([t1, t2], math_ops.greater_equal)
+    _test_forward_rel_op([t1, t2], math_ops.equal)
+    _test_forward_rel_op([t1, t2], math_ops.not_equal)
+
+
 #######################################################################
 # Main
 # ----
@@ -1030,3 +1052,6 @@ def check_mean(ishape, **kwargs):
     # Elementwise
     test_forward_ceil()
     test_forward_floor()
+
+    # Relational ops
+    test_forward_rel_ops()

From c784de1525a808f668da9018735da70f12d2b0f0 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sun, 30 Sep 2018 23:29:33 -0400
Subject: [PATCH 143/529] [BUG] Fix incorrect libcuda.so found by cmake when
 multiple versions of CUDA exist (#1788)

---
 cmake/util/FindCUDA.cmake   | 12 ++++++++++--
 cmake/util/FindLLVM.cmake   |  7 ++++++-
 cmake/util/FindROCM.cmake   | 38 +++++++++++++++++++++----------------
 cmake/util/FindVulkan.cmake |  3 +++
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
index 3ce0cc40a5e5..e715ad2efd2f 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/util/FindCUDA.cmake
@@ -56,13 +56,15 @@ macro(find_cuda use_cuda)
     else(MSVC)
       find_library(_CUDA_CUDA_LIBRARY cuda
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs
+        NO_DEFAULT_PATH)
       if(_CUDA_CUDA_LIBRARY)
         set(CUDA_CUDA_LIBRARY ${_CUDA_CUDA_LIBRARY})
       endif()
       find_library(CUDA_NVRTC_LIBRARY nvrtc
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs
+        NO_DEFAULT_PATH)
       find_library(CUDA_CUDNN_LIBRARY cudnn
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib)
@@ -70,5 +72,11 @@ macro(find_cuda use_cuda)
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib)
     endif(MSVC)
+    message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
+    message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
+    message(STATUS "Found CUDA_CUDART_LIBRARY=" ${CUDA_CUDART_LIBRARY})
+    message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY})
+    message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
+    message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
   endif(CUDA_FOUND)
 endmacro(find_cuda)
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
index 4bb58d462d12..8497761a7116 100644
--- a/cmake/util/FindLLVM.cmake
+++ b/cmake/util/FindLLVM.cmake
@@ -11,7 +11,7 @@
 # - LLVM_INCLUDE_DIRS
 # - LLVM_LIBS
 # - LLVM_DEFINITIONS
-# - TVM_LLVM_VERISON
+# - TVM_LLVM_VERSION
 #
 macro(find_llvm use_llvm)
   set(LLVM_CONFIG ${use_llvm})
@@ -56,4 +56,9 @@ macro(find_llvm use_llvm)
     separate_arguments(LLVM_LIBS)
     string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION)
   endif()
+  if(NOT LLVM_CONFIG STREQUAL "OFF")
+    message(STATUS "Found LLVM_INCLUDE_DIRS=" ${LLVM_INCLUDE_DIRS})
+    message(STATUS "Found LLVM_DEFINITIONS=" ${LLVM_DEFINITIONS})
+    message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+  endif()
 endmacro(find_llvm)
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
index 235969813382..317fea1b8f4e 100644
--- a/cmake/util/FindROCM.cmake
+++ b/cmake/util/FindROCM.cmake
@@ -21,21 +21,27 @@ macro(find_rocm use_rocm)
   if(IS_DIRECTORY ${__use_rocm})
     set(__rocm_sdk ${__use_rocm})
     message(STATUS "Custom ROCM SDK PATH=" ${__use_rocm})
-   elseif(IS_DIRECTORY $ENV{ROCM_PATH})
-     set(__rocm_sdk $ENV{ROCM_PATH})
-   elseif(IS_DIRECTORY /opt/rocm)
-     set(__rocm_sdk /opt/rocm)
-   else()
-     set(__rocm_sdk "")
-   endif()
+  elseif(IS_DIRECTORY $ENV{ROCM_PATH})
+    set(__rocm_sdk $ENV{ROCM_PATH})
+  elseif(IS_DIRECTORY /opt/rocm)
+    set(__rocm_sdk /opt/rocm)
+  else()
+    set(__rocm_sdk "")
+  endif()
 
-   if(__rocm_sdk)
-     set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
-     find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
-     find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
-     find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
-     if(ROCM_HIPHCC_LIBRARY)
-       set(ROCM_FOUND TRUE)
-     endif()
-   endif(__rocm_sdk)
+  if(__rocm_sdk)
+    set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
+    find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
+    find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
+    find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
+    if(ROCM_HIPHCC_LIBRARY)
+      set(ROCM_FOUND TRUE)
+    endif()
+  endif(__rocm_sdk)
+  if(ROCM_FOUND)
+    message(STATUS "Found ROCM_INCLUDE_DIRS=" ${ROCM_INCLUDE_DIRS})
+    message(STATUS "Found ROCM_HIPHCC_LIBRARY=" ${ROCM_HIPHCC_LIBRARY})
+    message(STATUS "Found ROCM_MIOPEN_LIBRARY=" ${ROCM_MIOPEN_LIBRARY})
+    message(STATUS "Found ROCM_ROCBLAS_LIBRARY=" ${ROCM_ROCBLAS_LIBRARY})
+  endif(ROCM_FOUND)
 endmacro(find_rocm)
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
index 0b85e8f47d79..15c85bfe27c3 100644
--- a/cmake/util/FindVulkan.cmake
+++ b/cmake/util/FindVulkan.cmake
@@ -51,5 +51,8 @@ macro(find_vulkan use_vulkan)
     find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
     find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
     list(APPEND Vulkan_INCLUDE_DIRS ${_libspirv} ${_spirv} ${_glsl_std})
+    message(STATUS "Vulkan_INCLUDE_DIRS=" ${Vulkan_INCLUDE_DIRS})
+    message(STATUS "Vulkan_LIBRARY=" ${Vulkan_LIBRARY})
+    message(STATUS "Vulkan_SPIRV_TOOLS_LIBRARY=" ${Vulkan_SPIRV_TOOLS_LIBRARY})
   endif(Vulkan_FOUND)
 endmacro(find_vulkan)

From 2ef7081220732065af290953d16f7113d1b40866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sun, 30 Sep 2018 20:32:21 -0700
Subject: [PATCH 144/529] [Relay] Free Variables (#1786)

---
 include/tvm/relay/pass.h             |  30 +++++++
 python/tvm/relay/ir_pass.py          |   4 +
 src/relay/pass/type_visitor.h        |  12 +--
 src/relay/pass/util.cc               | 118 +++++++++++++++++++++++++++
 tests/python/relay/test_free_vars.py |  29 +++++++
 5 files changed, 187 insertions(+), 6 deletions(-)
 create mode 100644 src/relay/pass/util.cc
 create mode 100644 tests/python/relay/test_free_vars.py

diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index d3747c214859..8b2a5fafd8f0 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -92,6 +92,36 @@ bool AlphaEqual(const Type& t1, const Type& t2);
  */
 bool WellFormed(const Expr & e);
 
+/*! \brief Get free variables from expression e.
+ *
+ * Free variables are variables that are not bound by a let or a function parameter in the context.
+ *
+ * \param e the expression.
+ *
+ * \return the set of free variable.
+ */
+tvm::Array<Var> FreeVariables(const Expr & e);
+
+/*! \brief Get free type parameters from expression e.
+ *
+ * Free type parameters are type parameters that are not bound by a function type in the context.
+ *
+ * \param e the expression.
+ *
+ * \return the set of free type variables.
+ */
+tvm::Array<TypeParam> FreeTypeVariables(const Expr & e);
+
+/*! \brief Get free type parameters from type t.
+ *
+ * Free type parameters are type parameters that are not bound by a function type in the context.
+ *
+ * \param t the type.
+ *
+ * \return the set of free type variables.
+ */
+tvm::Array<TypeParam> FreeTypeVariables(const Type & t);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_H_
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 8a9612420327..339b9f74d8d4 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -14,3 +14,7 @@
 well_formed = _ir_pass.well_formed
 
 check_kind = _ir_pass.check_kind
+
+free_vars = _ir_pass.free_vars
+
+free_type_vars = _ir_pass.free_type_vars
diff --git a/src/relay/pass/type_visitor.h b/src/relay/pass/type_visitor.h
index 8148942657ba..6468269686e8 100644
--- a/src/relay/pass/type_visitor.h
+++ b/src/relay/pass/type_visitor.h
@@ -95,13 +95,13 @@ struct TypeMutator : TypeFunctor<Type(const Type& n)> {
                               type_params, type_constraints);
   }
 
-    Type VisitType_(const TupleTypeNode* op) override {
-      std::vector<Type> new_fields;
-      for (const Type& t : op->fields) {
-        new_fields.push_back(this->VisitType(t));
-      }
-      return TupleTypeNode::make(new_fields);
+  Type VisitType_(const TupleTypeNode* op) override {
+    std::vector<Type> new_fields;
+    for (const Type& t : op->fields) {
+      new_fields.push_back(this->VisitType(t));
     }
+    return TupleTypeNode::make(new_fields);
+  }
 
   Type VisitType_(const TypeRelationNode* type_rel) override {
     std::vector<Type> new_args;
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
new file mode 100644
index 000000000000..5f87c3d4cb89
--- /dev/null
+++ b/src/relay/pass/util.cc
@@ -0,0 +1,118 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file util.cc
+ *
+ * \brief simple util for relay.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include "./type_visitor.h"
+
+namespace tvm {
+namespace relay {
+
+class FreeVar;
+class FreeTypeVar : private TypeVisitor<> {
+  std::unordered_set<TypeParam, NodeHash, NodeEqual> * free_vars;
+  std::unordered_set<TypeParam, NodeHash, NodeEqual> * bound_vars;
+  FreeTypeVar(std::unordered_set<TypeParam, NodeHash, NodeEqual> * free_vars,
+              std::unordered_set<TypeParam, NodeHash, NodeEqual> * bound_vars) :
+    free_vars(free_vars), bound_vars(bound_vars) { }
+
+  void VisitType_(const TypeParamNode* tp) final {
+    auto var = GetRef<TypeParam>(tp);
+    if (bound_vars->count(var) == 0) {
+      free_vars->insert(var);
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* f) final {
+    for (auto type_param : f->type_params) {
+      bound_vars->insert(type_param);
+    }
+
+    for (auto type_cs : f->type_constraints) {
+      this->VisitType(type_cs);
+    }
+
+    for (auto arg_type : f->arg_types) {
+      this->VisitType(arg_type);
+    }
+    this->VisitType(f->ret_type);
+  }
+  friend FreeVar;
+};
+
+class FreeVar : public ExprVisitor {
+  void VisitExpr_(const VarNode *v) final {
+    auto var = GetRef<Var>(v);
+    if (bound_vars.count(var) == 0) {
+      free_vars.insert(var);
+    }
+  }
+
+  void VisitExpr_(const FunctionNode *f) final {
+    for (const auto& tp : f->type_params) {
+      bound_types.insert(tp);
+    }
+    for (const auto& p : f->params) {
+      bound_vars.insert(p->var);
+    }
+    VisitExpr(f->body);
+    VisitType(f->ret_type);
+  }
+
+  void VisitExpr_(const LetNode *l) final {
+    bound_vars.insert(l->var);
+    VisitExpr(l->value);
+    VisitExpr(l->body);
+    VisitType(l->value_type);
+  }
+
+ public:
+  std::unordered_set<Var, NodeHash, NodeEqual> free_vars;
+  std::unordered_set<Var, NodeHash, NodeEqual> bound_vars;
+  std::unordered_set<TypeParam, NodeHash, NodeEqual> free_types;
+  std::unordered_set<TypeParam, NodeHash, NodeEqual> bound_types;
+
+  void VisitType(const Type& t) final {
+    FreeTypeVar(&free_types, &bound_types)(t);
+  }
+};
+
+tvm::Array<Var> FreeVariables(const Expr& e) {
+  FreeVar fv;
+  fv.VisitExpr(e);
+  return tvm::Array<Var>(fv.free_vars.begin(), fv.free_vars.end());
+}
+
+tvm::Array<TypeParam> FreeTypeVariables(const Expr& e) {
+  FreeVar fv;
+  fv.VisitExpr(e);
+  return tvm::Array<TypeParam>(fv.free_types.begin(), fv.free_types.end());
+}
+
+tvm::Array<TypeParam> FreeTypeVariables(const Type& t) {
+  FreeVar fv;
+  fv.VisitType(t);
+  return tvm::Array<TypeParam>(fv.free_types.begin(), fv.free_types.end());
+}
+
+TVM_REGISTER_API("relay._ir_pass.free_vars")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = FreeVariables(args[0]);
+  });
+
+TVM_REGISTER_API("relay._ir_pass.free_type_vars")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    NodeRef x = args[0];
+    if (x.as<TypeNode>()) {
+      *ret = FreeTypeVariables(Downcast<Type>(x));
+    } else {
+      *ret = FreeTypeVariables(Downcast<Expr>(x));
+    }
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_free_vars.py b/tests/python/relay/test_free_vars.py
new file mode 100644
index 000000000000..002646ada582
--- /dev/null
+++ b/tests/python/relay/test_free_vars.py
@@ -0,0 +1,29 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import free_vars, free_type_vars
+
+def test_free_vars():
+    x = relay.Var("x")
+    fvx = free_vars(x)
+    assert len(fvx) == 1
+    assert fvx[0] == x
+    v = relay.Constant(tvm.nd.array(10))
+    ty = relay.TensorType([], "int32")
+    let = relay.Let(x, v, x, ty)
+    fvx = free_vars(let)
+    assert len(free_vars(let)) == 0
+    f = relay.Function([relay.Param(x, ty)], ty, x)
+    assert len(free_vars(f)) == 0
+
+def test_free_type_vars():
+    tp = relay.TypeParam("")
+    ty = relay.TupleType([tp, relay.TensorType([], "int32")])
+    x = relay.Var("x")
+    y = relay.Var("y")
+    let = relay.Let(x, y, x, ty)
+    fvl = free_vars(let)
+    assert len(fvl) == 1
+    assert fvl[0] == y
+    ftvl = free_type_vars(let)
+    assert len(ftvl) == 1
+    assert ftvl[0] == tp

From 3cae1357e94f436fc990c5d373c7aa4642506574 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sun, 30 Sep 2018 23:38:13 -0400
Subject: [PATCH 145/529] Add docs/dev/relay_add_op.rst to docs/dev/index.rst
 (#1790)

---
 docs/dev/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index f3ab322bfe53..cfd79ccde468 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -11,3 +11,4 @@ In this part of documentation, we share the rationale for the specific choices m
    nnvm_json_spec
    nnvm_overview
    hybrid_script
+   relay_add_op

From 8a953d2b23cb77c12079efe111eb222afe45ceaf Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 1 Oct 2018 12:21:39 -0400
Subject: [PATCH 146/529] Add atol=1e-5 to test_topi_matmul.test_matmul (#1791)

---
 topi/tests/python/test_topi_matmul.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py
index a2902e17d40b..407a2859b467 100644
--- a/topi/tests/python/test_topi_matmul.py
+++ b/topi/tests/python/test_topi_matmul.py
@@ -27,7 +27,7 @@ def verify_matmul(sa, sb, transp_a, transp_b):
     c1 = np.matmul(np.transpose(a) if transp_a else a,
                    np.transpose(b) if transp_b else b)
     c2 = with_tvm(lambda A,B: topi.matmul(A,B,transp_a,transp_b), a,b)
-    np.testing.assert_allclose(c1, c2, rtol=1e-5)
+    np.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
 
 def test_matmul():
     verify_matmul((1,1),(1,1),False,False)

From e3ef9f6b5101bcf642ee060636dd7c38e41eb2e5 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 1 Oct 2018 11:54:49 -0700
Subject: [PATCH 147/529] [Relay] Incorporate TypeRelations into more tests
 (#1792)

---
 python/tvm/relay/__init__.py          |  1 +
 python/tvm/relay/ty.pyi               | 23 ++++++++++
 src/relay/pass/kind_check.cc          | 15 +++++--
 tests/python/relay/test_check_kind.py | 61 +++++++++++++++++++++++++--
 tests/python/relay/test_ir_nodes.py   | 17 ++++++++
 5 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 7007028af6c7..fc1fae76ced1 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -21,6 +21,7 @@
 TypeParam = ty.TypeParam
 TypeConstraint = ty.TypeConstraint
 FuncType = ty.FuncType
+TypeRelation = ty.TypeRelation
 
 # Expr
 Constant = expr.Constant
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
index 1aba99e42a27..221fc228081d 100644
--- a/python/tvm/relay/ty.pyi
+++ b/python/tvm/relay/ty.pyi
@@ -158,3 +158,26 @@ class IncompleteType(Type):
 
     def __init__(self, kind):
         self.__init_handle_by_constructor__(_make.IncompleteType, kind)
+
+@register_relay_node
+class TypeRelation(TypeConstraint):
+    """Type relation in relay.
+
+    Parameters
+    ----------
+    func : EnvFunc
+        User defined relation function.
+
+    args : list of types
+        List of types to the func.
+
+    num_inputs: int
+        Number of input arguments in args,
+        this act as a hint for type inference.
+
+    attrs : Attrs
+        The attribute attached to the relation information
+    """
+    def __init__(self, func, args, num_inputs, attrs):
+        self.__init_handle_by_constructor__(_make.TypeRelation,
+                                            func, args, num_inputs, attrs)
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 83f52d8873e3..f649243dbfec 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -45,8 +45,7 @@ struct KindChecker : TypeVisitor<> {
       return true;
     }
 
-    return t.as<TensorTypeNode>() || t.as<BaseTensorTypeNode>()
-      || t.as<TupleTypeNode>() || t.as<FuncTypeNode>();
+    return t.as_derived<BaseTensorTypeNode>() || t.as<TupleTypeNode>() || t.as<FuncTypeNode>();
   }
 
   void VisitType_(const TupleTypeNode* op) override {
@@ -61,8 +60,9 @@ struct KindChecker : TypeVisitor<> {
   }
 
   void VisitType_(const FuncTypeNode* op) override {
-    // func types should only take normal types for arguments
-    // and only return a normal type
+    // Func types should only take normal types for arguments
+    // and only return a normal type. They should also have
+    // well-formed constraints
     for (const Type& t : op->arg_types) {
       this->VisitType(t);
       valid = valid && IsTypeKind(t);
@@ -71,6 +71,13 @@ struct KindChecker : TypeVisitor<> {
       }
     }
 
+    for (const TypeConstraint& tc : op->type_constraints) {
+      this->VisitType(tc);
+      if (!valid) {
+        return;
+      }
+    }
+
     this->VisitType(op->ret_type);
     valid = valid && IsTypeKind(op->ret_type);
   }
diff --git a/tests/python/relay/test_check_kind.py b/tests/python/relay/test_check_kind.py
index 413e6d7051d6..314c8c8b7992 100644
--- a/tests/python/relay/test_check_kind.py
+++ b/tests/python/relay/test_check_kind.py
@@ -2,7 +2,7 @@
 from tvm import relay
 from tvm.relay.ir_pass import check_kind
 
-def test_tuple_kinds():
+def test_tuple_kind():
     # only contain type kinds
     tp = relay.TypeParam('tp', relay.Kind.Type)
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
@@ -12,6 +12,7 @@ def test_tuple_kinds():
     tup_ty = relay.TupleType(fields)
     assert check_kind(tup_ty)
 
+
 def test_func_kind():
     # only contain type kinds
     tp1 = relay.TypeParam('tp1', relay.Kind.Type)
@@ -21,15 +22,29 @@ def test_func_kind():
     dtype = 'float32'
     tensor_type = relay.TensorType(shape, dtype)
 
+    tr = relay.TypeRelation(None, tvm.convert([tensor_type, tp1]) , 1, None)
+
     type_params = tvm.convert([tp1, tp2])
-    type_constraints = tvm.convert([])
+    type_constraints = tvm.convert([tr])
     arg_types = tvm.convert([tp1, tensor_type])
     ret_type = relay.TupleType(tvm.convert([tp2, tensor_type]))
 
     tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
     assert check_kind(tf)
 
-def test_invalid_tuple_kinds():
+
+def test_relation_kind():
+    # only have type kinds for arguments
+    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
+    args = tvm.convert([tf, tt, tp])
+
+    tr = relay.TypeRelation(None, args, 2, None)
+    assert check_kind(tr)
+
+
+def test_invalid_tuple_kind():
     tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
     tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
     tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
@@ -38,6 +53,7 @@ def test_invalid_tuple_kinds():
     tup_ty = relay.TupleType(fields)
     assert not check_kind(tup_ty)
 
+
 def test_invalid_func_kind():
     tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
     tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
@@ -51,16 +67,29 @@ def test_invalid_func_kind():
     tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
     assert not check_kind(tf)
 
+
+def test_invalid_relation_kind():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    args = tvm.convert([tp1, tp2, tp3])
+
+    tr = relay.TypeRelation(None, args, 2, None)
+    assert not check_kind(tr)
+
+
 def test_func_with_invalid_ret_type():
     tp1 = relay.TypeParam('tp1', relay.Kind.Type)
     tp2 = relay.TypeParam('tp2', relay.Kind.Shape)
     tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
 
+
 def test_func_with_invalid_arg_types():
     tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
     tp2 = relay.TypeParam('tp2', relay.Kind.Type)
     tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
 
+
 def test_func_with_invalid_tuple():
     tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
 
@@ -69,6 +98,18 @@ def test_func_with_invalid_tuple():
     tf = relay.FuncType(tvm.convert([]), ret_type, tvm.convert([tp1]), tvm.convert([]))
     assert not check_kind(tf)
 
+
+def test_func_with_invalid_relation():
+    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
+    tp2 = relay.TypeParam('tp2', relay.Kind.Shape)
+    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+
+    tr = relay.TypeRelation(None, tvm.convert([tp2, tp3]), 1, None)
+
+    tf = relay.FuncType(tvm.convert([tp1]), tp1, tvm.convert([tp1, tp2, tp3]), tvm.convert([tr]))
+    assert not check_kind(tf)
+
+
 def test_tuple_with_invalid_func():
     tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
 
@@ -77,3 +118,17 @@ def test_tuple_with_invalid_func():
 
     tup_ty = relay.TupleType(tvm.convert([tensor_type, tf]))
     assert not check_kind(tup_ty)
+
+
+if __name__ == "__main__":
+    test_tuple_kind()
+    test_func_kind()
+    test_relation_kind()
+    test_invalid_tuple_kind()
+    test_invalid_func_kind()
+    test_invalid_relation_kind()
+    test_func_with_invalid_ret_type()
+    test_func_with_invalid_arg_types()
+    test_func_with_invalid_tuple()
+    test_func_with_invalid_relation()
+    test_tuple_with_invalid_func()
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index 91cbaf73bd2b..4505710c06cc 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -58,6 +58,21 @@ def test_tuple_type():
     assert tup_ty.fields == fields
 
 
+def test_type_relation():
+    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    args = tvm.convert([tf, tt, tp])
+
+    num_inputs = 2
+    func = None
+    attrs = None
+
+    tr = relay.TypeRelation(func, args, num_inputs, attrs)
+    assert tr.args == args
+    assert tr.num_inputs == num_inputs
+
+
 def test_constant():
     arr = tvm.nd.array(10)
     const = relay.Constant(arr)
@@ -158,6 +173,8 @@ def test_if():
     test_tensor_type()
     test_type_param()
     test_func_type()
+    test_tuple_type()
+    test_type_relation()
     test_constant()
     test_tuple()
     test_local_var()

From 54dcfe336ca84c6b2d38b6b3f523e743d8648c21 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 1 Oct 2018 13:45:53 -0700
Subject: [PATCH 148/529] [IR]  eager constant folding in operator overloading
 (#1789)

---
 include/tvm/buffer.h                          |   1 +
 include/tvm/expr.h                            |  10 -
 include/tvm/ir.h                              |   2 -
 include/tvm/ir_operator.h                     | 589 ++++++++++++++++--
 include/tvm/tensor.h                          |   1 +
 nnvm/src/top/tensor/reduce.cc                 |   2 +-
 python/tvm/api.py                             |  12 +-
 python/tvm/expr.py                            |  16 +-
 python/tvm/generic.py                         |   8 +-
 src/api/api_ir.cc                             |  99 +--
 src/arithmetic/compute_expr.h                 |  61 +-
 src/arithmetic/detect_linear_equation.cc      |   4 +-
 src/codegen/codegen_cuda.cc                   |   2 +-
 src/codegen/verilog/verilog_ir.cc             |   2 +-
 src/lang/expr.cc                              |   1 +
 src/lang/ir_operator.cc                       | 402 +++++++++++-
 src/pass/ir_util.h                            |   5 +-
 src/pass/split_pipeline.cc                    |   3 +-
 src/pass/storage_rewrite.cc                   |   2 +-
 src/pass/vectorize_loop.cc                    |   1 -
 tests/cpp/ir_mutator_test.cc                  |   1 +
 tests/python/unittest/test_arith_intset.py    |   9 +-
 tests/python/unittest/test_lang_basic.py      |   2 +-
 tests/python/unittest/test_lang_operator.py   |  35 ++
 tests/python/unittest/test_lang_reflection.py |   2 +-
 tests/python/unittest/test_pass_simplify.py   |   1 -
 topi/include/topi/elemwise.h                  |   9 +-
 topi/include/topi/nn/pooling.h                |  10 +-
 topi/python/topi/vision/ssd/multibox.py       |  10 +-
 29 files changed, 1106 insertions(+), 196 deletions(-)
 create mode 100644 tests/python/unittest/test_lang_operator.py

diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 5901a27fe1ce..cda76cd140c5 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -10,6 +10,7 @@
 
 #include "base.h"
 #include "expr.h"
+#include "ir_operator.h"
 #include "node/container.h"
 
 namespace tvm {
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index a199d656caf8..050ab4c334e2 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -7,7 +7,6 @@
 #define TVM_EXPR_H_
 
 #include <ir/Expr.h>
-#include <ir/IROperator.h>
 #include <ir/IRPrinter.h>
 #include <string>
 #include <algorithm>
@@ -34,15 +33,6 @@ using HalideIR::Internal::Stmt;
 using HalideIR::Internal::IRPrinter;
 using HalideIR::Internal::Variable;
 
-using HalideIR::Internal::make_const;
-using HalideIR::Internal::make_zero;
-using HalideIR::Internal::make_one;
-using HalideIR::Internal::as_const_int;
-using HalideIR::Internal::as_const_uint;
-using HalideIR::Internal::const_true;
-using HalideIR::Internal::const_false;
-using HalideIR::Internal::is_no_op;
-
 inline Type TVMShapeIndexType() {
   if (std::is_signed<tvm_index_t>::value) {
     return Int(sizeof(tvm_index_t) * 8);
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index b75d75c18182..14e60146567f 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -495,8 +495,6 @@ using HalideIR::Internal::Block;
 using HalideIR::Internal::IfThenElse;
 using HalideIR::Internal::Evaluate;
 using HalideIR::Internal::Shuffle;
-// ir functions
-using HalideIR::Internal::is_const_power_of_two_integer;
 
 /*!
  * \brief Create a type annotation expression
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index 39588a2228f9..5abd95b8c166 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -1,24 +1,426 @@
 /*!
- *  Copyright (c) 2017 by Contributors
+ *  Copyright (c) 2018 by Contributors
  * \file tvm/ir_operator.h
- * \brief Common operators of Expr
+ * \brief Common operators defined for Expr.
+ *
+ * \note Most of the operator defined here perform simple constant folding
+ *   when the type is int32 or int64 for simplifying the index expressions.
  */
 #ifndef TVM_IR_OPERATOR_H_
 #define TVM_IR_OPERATOR_H_
 
 #include <algorithm>
+#include <type_traits>
 #include "expr.h"
 #include "ir.h"
 
 namespace tvm {
+/*!
+ * \brief Make a const value with certain data type.
+ * \param t The target type.
+ * \param value The input value
+ * \return the result expression.
+ * \tparam ValueType The constant value type
+ */
+template<typename ValueType,
+         typename = typename std::enable_if<std::is_pod<ValueType>::value>::type>
+inline Expr make_const(Type t, ValueType value);
+/*!
+ * \brief Make a const zero expr.
+ * \param t The target type.
+ * \return the result expression.
+ */
+inline Expr make_zero(Type t);
+/*!
+ * \brief Make a constant true expression.
+ * \param lanes The number of lanes in the bool
+ * \return The result expression.
+ */
+inline Expr const_true(int lanes = 1) {
+  return make_const(UInt(1, lanes), 1);
+}
+/*!
+ * \brief Make a constant false expression.
+ * \param lanes The number of lanes in the bool
+ * \return The result expression.
+ */
+inline Expr const_false(int lanes = 1) {
+  return make_const(UInt(1, lanes), 0);
+}
+/*!
+ * \brief Get x as constant int expression.
+ * \param x The expression
+ * \return the address to the int expression,
+ *         return nullptr, if x is not IntImm.
+ */
+inline const int64_t* as_const_int(const Expr& x) {
+  if (!x.defined()) return nullptr;
+  if (const ir::IntImm* op = x.as<ir::IntImm>()) {
+    return &(op->value);
+  } else {
+    return nullptr;
+  }
+}
+
+/*!
+ * \brief Get x as constant uint expression.
+ * \param x The expression
+ * \return the address to the int expression,
+ *         return nullptr, if x is not UIntImm.
+ */
+inline const uint64_t* as_const_uint(const Expr& x) {
+  if (!x.defined()) return nullptr;
+  if (const ir::UIntImm* op = x.as<ir::UIntImm>()) {
+    return &(op->value);
+  } else {
+    return nullptr;
+  }
+}
+
+/*!
+ * \brief Check whether x is a constant integer expression.
+ * \param x The input argument
+ * \param value the value to be compared against.
+ * \return whether x is constant expression.
+ */
+inline bool is_const_int(const Expr& x, int64_t value);
+
+/*!
+ * \brief Check whether stmt is nop.
+ * \param stmt The input statement
+ * \return whether stmt is nop
+ */
+inline bool is_no_op(const Stmt& stmt);
+
+/*!
+ * \brief Check whether x is a constant integer 1
+ * \param x The input argument.
+ * \note This only return true for integer types.
+ * \return whether x is constant 1
+ */
+inline bool is_one(const Expr& x) {
+  return is_const_int(x, 1);
+}
 
-using HalideIR::likely;
-using HalideIR::likely_if_innermost;
-// functions
-using HalideIR::cast;
-using HalideIR::min;
-using HalideIR::max;
-using HalideIR::select;
+/*!
+ * \brief Check whether x is a constant integer 0
+ * \param x The input argument
+ * \return whether x is constant 0
+ * \note This only return true for integer types.
+ */
+inline bool is_zero(const Expr& x) {
+  return is_const_int(x, 0);
+}
+
+/*!
+ * \brief Check whether x is a constant.
+ * \note This only return true for integer types.
+ * \return whether x is constant
+ */
+inline bool is_const(const Expr& x);
+
+/*!
+ * \brief Check whether x is a constant power of two
+ * If x is power of two, write the power to the shift.
+ *
+ * \param x The input expression.
+ * \param shift The output shift if x is power of two.
+ * \return whether x is constant power of two
+ */
+TVM_DLL bool is_const_power_of_two_integer(const Expr& x, int* shift);
+
+/*!
+ * \brief cast value to type.
+ *
+ * \param t the target type.
+ * \param value The value
+ * \return The result expression.
+ * \note This function may return value if the type is the same.
+ */
+TVM_DLL Expr cast(const Type& t, Expr value);
+/*!
+ * \brief perform reinterpret cast value to type.
+ *
+ * \param t the target type.
+ * \param value The value
+ * \return The result expression.
+ * \note This function may return value if the type is the same.
+ */
+TVM_DLL Expr reinterpret(const Type& t, Expr value);
+/*!
+ * \brief add operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator+(Expr a, Expr b);
+/*!
+ * \brief subtraction operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator-(Expr a, Expr b);
+/*!
+ * \brief negation.
+ *
+ * \param a input.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator-(Expr a);
+/*!
+ * \brief multiplication operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator*(Expr a, Expr b);
+/*!
+ * \brief division operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator/(Expr a, Expr b);
+/*!
+ * \brief mod operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator%(Expr a, Expr b);
+/*!
+ * \brief left shift operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<<(Expr a, Expr b);
+/*!
+ * \brief right shift operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>>(Expr a, Expr b);
+/*!
+ * \brief greater
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>(Expr a, Expr b);
+/*!
+ * \brief greater_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>=(Expr a, Expr b);
+/*!
+ * \brief less
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<(Expr a, Expr b);
+/*!
+ * \brief less_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<=(Expr a, Expr b);
+/*!
+ * \brief equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator==(Expr a, Expr b);
+/*!
+ * \brief not_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator!=(Expr a, Expr b);
+/*!
+ * \brief and
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator&&(Expr a, Expr b);
+/*!
+ * \brief or
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator||(Expr a, Expr b);
+/*!
+ * \brief not
+ *
+ * \param a left operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator!(Expr a);
+/*!
+ * \brief take maximum of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr max(Expr a, Expr b);
+/*!
+ * \brief take minimum of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr min(Expr a, Expr b);
+/*!
+ * \brief right shift
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>>(Expr a, Expr b);
+/*!
+ * \brief left shift
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<<(Expr a, Expr b);
+/*!
+ * \brief take bitwise and of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator&(Expr a, Expr b);
+/*!
+ * \brief take bitwise or of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator|(Expr a, Expr b);
+/*!
+ * \brief take bitwise xor of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator^(Expr a, Expr b);
+/*!
+ * \brief take bitwise negation of two values
+ *
+ * \param a the input expression.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator~(Expr a);
+/*!
+ * \brief select result by condition
+ *
+ * \param cond The condition
+ * \param true_value The value when results are true.
+ * \param false_value The value when results are false.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr select(Expr cond, Expr true_value, Expr false_value);
+/*!
+ * \brief Mark condition as likely.
+ * \param cond The condition
+ * \return The marked expression.
+ */
+TVM_DLL Expr likely(Expr cond);
+/*!
+ * \brief Calculate power(x, y)
+ * \param x The left operand.
+ * \param y The right operand.
+ */
+TVM_DLL Expr pow(Expr x, Expr y);
+/*!
+ * \brief Calculate absolute value of x.
+ * \param x The input data
+ *
+ * \return The aboslute value of input data x
+ */
+TVM_DLL Expr abs(Expr x);
 
 /*!
  * \brief sum of of source expression over axis
@@ -48,13 +450,12 @@ TVM_DLL Expr min(Expr source, Array<IterVar> axis);
  */
 TVM_DLL Expr prod(Expr source, Array<IterVar> axis);
 
-// Unary intrinsic operators
+// Intrinsic operators
 #define TVM_DECLARE_INTRIN_UNARY(OpName)                                \
   inline Expr OpName(Expr x) {                                          \
     return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \
   }                                                                     \
 
-
 TVM_DECLARE_INTRIN_UNARY(exp);
 TVM_DECLARE_INTRIN_UNARY(tanh);
 TVM_DECLARE_INTRIN_UNARY(sigmoid);
@@ -64,38 +465,152 @@ TVM_DECLARE_INTRIN_UNARY(floor);
 TVM_DECLARE_INTRIN_UNARY(ceil);
 TVM_DECLARE_INTRIN_UNARY(round);
 TVM_DECLARE_INTRIN_UNARY(trunc);
+TVM_DECLARE_INTRIN_UNARY(popcount);
 
-/*!
- * \brief Calculate power(x, y)
- * \param x The left operand.
- * \param y The right operand.
- */
-inline Expr pow(Expr x, Expr y) {
-  match_types(x, y);
-  CHECK(x.type().is_float()) << "power only applies to float";
-  return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+
+// Implementation details after this
+inline bool is_const(const Expr& x) {
+  if (x.as<ir::IntImm>() || x.as<ir::UIntImm>()) {
+    return true;
+  } else if (const auto* op = x.as<ir::Broadcast>()) {
+    const Expr& val = op->value;
+    if (val.as<ir::IntImm>() || val.as<ir::UIntImm>()) {
+      return true;
+    }
+  }
+  return false;
 }
 
-/*!
- * \brief Calculate absolute value of x, elementwise
- * \param x The input data
- *
- * \return The aboslute value of input data x
- */
-inline Expr abs(Expr x) {
-  if (x.type().is_int()) {
-    return select(x >= make_zero(x.type()), x, -x);
-  } else if (x.type().is_float()) {
-    return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
-  } else if (x.type().is_uint()) {
-    return x;
+inline bool is_positive_const(const Expr& a) {
+  if (const ir::IntImm* op = a.as<ir::IntImm>()) {
+    return op->value > 0;
+  } else if (const ir::UIntImm* op = a.as<ir::UIntImm>()) {
+    return op->value > 0;
   } else {
-    LOG(WARNING) << "Warning: Data type " << x.type()
-      <<" not supported for absolute op. Skipping absolute op...";
-    return x;
+    return false;
   }
 }
 
-}  // namespace tvm
+inline bool is_negative_const(const Expr& a) {
+  if (const ir::IntImm* op = a.as<ir::IntImm>()) {
+    return op->value < 0;
+  } else {
+    return false;
+  }
+}
+
+inline bool is_const_int(const Expr& x, int64_t value) {
+  if (const auto* op = x.as<ir::IntImm>()) {
+    return op->value == value;
+  } else if (const auto* op = x.as<ir::UIntImm>()) {
+    return op->value == static_cast<uint64_t>(value);
+  } else if (const auto* op = x.as<ir::Broadcast>()) {
+    const Expr& val = op->value;
+    if (const auto* opv = val.as<ir::IntImm>()) {
+      return opv->value == value;
+    } else if (const auto* opv = val.as<ir::UIntImm>()) {
+      return opv->value == static_cast<uint64_t>(value);
+    }
+  }
+  return false;
+}
+
+inline bool is_no_op(const Stmt& stmt) {
+  if (!stmt.defined()) return true;
+  if (const auto* op = stmt.as<ir::Evaluate>()) {
+    return is_const(op->value);
+  }
+  return false;
+}
+
+template<typename ValueType>
+inline Expr MakeConstScalar(Type t, ValueType value) {
+  if (t.is_int()) return ir::IntImm::make(t, static_cast<int64_t>(value));
+  if (t.is_uint()) return ir::UIntImm::make(t, static_cast<uint64_t>(value));
+  if (t.is_float()) return ir::FloatImm::make(t, static_cast<double>(value));
+  LOG(FATAL) << "cannot make const for type " << t;
+  return Expr();
+}
+
+template<typename ValueType, typename>
+inline Expr make_const(Type t, ValueType value) {
+  if (t.lanes() == 1) {
+    return MakeConstScalar(t, value);
+  } else {
+    return ir::Broadcast::make(
+        MakeConstScalar(t.element_of(), value), t.lanes());
+  }
+}
+
+inline Expr make_zero(Type t) {
+  if (t.is_handle()) {
+    return reinterpret(t, make_const(UInt(64), 0));
+  }
+  return make_const(t, 0);
+}
+
+// additional const expression overloading
+#define TVM_DEFINE_ASSIGN_OP_OVERLOAD(Name, OpFunc)            \
+  inline Expr Name(Expr& a, Expr b) {                          \
+    a = OpFunc(a, b);                                          \
+    return a;                                                  \
+  }
 
+#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)              \
+  inline Expr Name(const Expr& a, float b) {                   \
+    return Name(a, Expr(b));                                   \
+  }                                                            \
+  inline Expr Name(float a, const Expr& b) {                   \
+    return Name(Expr(a), b);                                   \
+  }                                                            \
+  inline Expr Name(int a, const Expr& b) {                     \
+    return Name(make_const(b.type(), a), b);                   \
+  }                                                            \
+  inline Expr Name(const Expr& a, int b) {                     \
+    return Name(a, make_const(a.type(), b));                   \
+  }
+
+#define TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(Name)                  \
+  inline Expr Name(const Expr& a, bool b) {                             \
+    return Name(a, Expr(b));                                            \
+  }                                                                     \
+  inline Expr Name(bool a, const Expr& b) {                             \
+    return Name(Expr(a), b);                                            \
+  }
+
+#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name)                      \
+  inline Expr Name(const Expr& a, int b) {                              \
+    return Name(a, make_const(a.type(), b));                            \
+  }                                                                     \
+  inline Expr Name(int a, const Expr& b) {                              \
+    return Name(make_const(b.type(), a), b);                            \
+  }
+
+
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator+=, operator+);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator-=, operator-);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator*=, operator*);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator/=, operator/);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator+);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator-);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator*);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator/);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(max);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(min);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator>);  // NOLINT(*)
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator>=);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator<);  // NOLINT(*)
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator<=);
+// integer related ops
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator%);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator>>); // NOLINT(*)
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator<<); // NOLINT(*)
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator&);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator|);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator^);
+// logical ops
+TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator&&);
+TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator||);
+
+}  // namespace tvm
 #endif  // TVM_IR_OPERATOR_H_
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index e205f6b9ff5e..7665e724b236 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -13,6 +13,7 @@
 
 #include "base.h"
 #include "expr.h"
+#include "ir_operator.h"
 #include "arithmetic.h"
 #include "node/container.h"
 
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 91d2ea7202b8..7241c4b4b85a 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -354,7 +354,7 @@ Example::
     if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
     auto axis = ShapeToArray(r_axes);
 
-    Expr count = make_one(inputs[0]->dtype);
+    Expr count = make_const(inputs[0]->dtype, 1);
     for (auto& i : r_axes) {
       count *= inputs[0]->shape[i];
     }
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 34fe2ba49dc8..8cf507de6386 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -156,9 +156,9 @@ def any(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _expr.Or(args[0], args[1])
+    ret = _make._OpOr(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _expr.Or(ret, args[i])
+        ret = _make._OpOr(ret, args[i])
     return ret
 
 
@@ -180,9 +180,9 @@ def all(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _expr.And(args[0], args[1])
+    ret = _make._OpAnd(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _expr.And(ret, args[i])
+        ret = _make._OpAnd(ret, args[i])
     return ret
 
 
@@ -773,5 +773,5 @@ def reducer(expr, axis, where=None, *args):
 _init_api("tvm.api")
 #pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum")
-min = comm_reducer(lambda x, y: _expr.Min(x, y), max_value, name='min')
-max = comm_reducer(lambda x, y: _expr.Max(x, y), min_value, name='max')
+min = comm_reducer(lambda x, y: _make._OpMin(x, y), max_value, name='min')
+max = comm_reducer(lambda x, y: _make._OpMax(x, y), min_value, name='max')
diff --git a/python/tvm/expr.py b/python/tvm/expr.py
index 1c1c9f82cb97..00a523416c85 100644
--- a/python/tvm/expr.py
+++ b/python/tvm/expr.py
@@ -60,7 +60,7 @@ def __rfloordiv__(self, other):
         return self.__rdiv__(other)
 
     def __mod__(self, other):
-        return _make.Mod(self, other)
+        return _make._OpMod(self, other)
 
     def __neg__(self):
         neg_one = _api_internal._const(-1, self.dtype)
@@ -85,10 +85,10 @@ def __invert__(self):
         return _make.Call(self.dtype, "bitwise_not", [self], Call.PureIntrinsic, None, 0)
 
     def __lt__(self, other):
-        return _make.LT(self, other)
+        return _make._OpLT(self, other)
 
     def __le__(self, other):
-        return _make.LE(self, other)
+        return _make._OpLE(self, other)
 
     def __eq__(self, other):
         return EqualOp(self, other)
@@ -97,10 +97,10 @@ def __ne__(self, other):
         return NotEqualOp(self, other)
 
     def __gt__(self, other):
-        return _make.GT(self, other)
+        return _make._OpGT(self, other)
 
     def __ge__(self, other):
-        return _make.GE(self, other)
+        return _make._OpGE(self, other)
 
     def __nonzero__(self):
         raise ValueError("Cannot use and / or / not operator to Expr, hint: " +
@@ -122,7 +122,7 @@ def equal(self, other):
         ret : Expr
             The equality expression.
         """
-        return _make.EQ(self, other)
+        return _make._OpEQ(self, other)
 
     def astype(self, dtype):
         """Cast the expression to other type.
@@ -169,7 +169,7 @@ def __bool__(self):
 
     def asnode(self):
         """Convert node."""
-        return _make.EQ(self.a, self.b)
+        return _make._OpEQ(self.a, self.b)
 
 
 class NotEqualOp(NodeGeneric, ExprOp):
@@ -201,7 +201,7 @@ def __bool__(self):
 
     def asnode(self):
         """Convert node."""
-        return _make.NE(self.a, self.b)
+        return _make._OpNE(self.a, self.b)
 
 
 class Expr(ExprOp, NodeBase):
diff --git a/python/tvm/generic.py b/python/tvm/generic.py
index 2926f73d5a02..ab1a80d3f612 100644
--- a/python/tvm/generic.py
+++ b/python/tvm/generic.py
@@ -24,7 +24,7 @@ def add(lhs, rhs):
     op : tvm.Expr
         The result Expr of add operaton.
     """
-    return _make.Add(lhs, rhs)
+    return _make._OpAdd(lhs, rhs)
 
 
 def subtract(lhs, rhs):
@@ -42,7 +42,7 @@ def subtract(lhs, rhs):
     op : tvm.Expr
         The result Expr of subtract operaton.
     """
-    return _make.Sub(lhs, rhs)
+    return _make._OpSub(lhs, rhs)
 
 
 def multiply(lhs, rhs):
@@ -60,7 +60,7 @@ def multiply(lhs, rhs):
     op : tvm.Expr
         The result Expr of multiply operaton.
     """
-    return _make.Mul(lhs, rhs)
+    return _make._OpMul(lhs, rhs)
 
 
 def divide(lhs, rhs):
@@ -78,7 +78,7 @@ def divide(lhs, rhs):
     op : tvm.Expr
         The result Expr of divide operaton.
     """
-    return _make.Div(lhs, rhs)
+    return _make._OpDiv(lhs, rhs)
 
 
 def cast(src, dtype):
diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc
index 8a65260a0f58..1040f6ce6f66 100644
--- a/src/api/api_ir.cc
+++ b/src/api/api_ir.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/expr.h>
 #include <tvm/ir.h>
-#include <ir/IROperator.h>
+#include <tvm/ir_operator.h>
 #include <tvm/api_registry.h>
 #include <tvm/ir_operator.h>
 
@@ -117,6 +117,50 @@ TVM_REGISTER_API("make.CommReducer")
       *ret = Node::make(args[0], args[1], args[2], args[3], args[4]);   \
     })                                                                  \
 
+
+REGISTER_MAKE5(Reduce);
+REGISTER_MAKE4(AttrStmt);
+
+REGISTER_MAKE2(IntImm);
+REGISTER_MAKE2(UIntImm);
+REGISTER_MAKE2(FloatImm);
+REGISTER_MAKE1(StringImm);
+
+REGISTER_MAKE2(Add);
+REGISTER_MAKE2(Sub);
+REGISTER_MAKE2(Mul);
+REGISTER_MAKE2(Div);
+REGISTER_MAKE2(Mod);
+REGISTER_MAKE2(Min);
+REGISTER_MAKE2(Max);
+REGISTER_MAKE2(EQ);
+REGISTER_MAKE2(NE);
+REGISTER_MAKE2(LT);
+REGISTER_MAKE2(LE);
+REGISTER_MAKE2(GT);
+REGISTER_MAKE2(GE);
+REGISTER_MAKE2(And);
+REGISTER_MAKE2(Or);
+
+REGISTER_MAKE1(Not);
+REGISTER_MAKE3(Select);
+REGISTER_MAKE3(Ramp);
+REGISTER_MAKE2(Cast);
+REGISTER_MAKE2(Broadcast);
+REGISTER_MAKE2(Shuffle);
+REGISTER_MAKE3(Let);
+REGISTER_MAKE3(LetStmt);
+REGISTER_MAKE3(AssertStmt);
+REGISTER_MAKE3(ProducerConsumer);
+REGISTER_MAKE5(Allocate);
+REGISTER_MAKE4(Provide);
+REGISTER_MAKE4(Prefetch);
+REGISTER_MAKE1(Free);
+REGISTER_MAKE2(Block);
+REGISTER_MAKE3(IfThenElse);
+REGISTER_MAKE1(Evaluate);
+
+// operator overloading, smarter than make
 #define REGISTER_MAKE_BINARY_OP(Node, Func)                  \
   TVM_REGISTER_API("make."#Node)                             \
   .set_body([](TVMArgs args,  TVMRetValue *ret) {            \
@@ -138,50 +182,27 @@ TVM_REGISTER_API("make.CommReducer")
       }                                                                 \
     })
 
-REGISTER_MAKE5(Reduce);
-REGISTER_MAKE4(AttrStmt);
 
-REGISTER_MAKE2(IntImm);
-REGISTER_MAKE2(UIntImm);
-REGISTER_MAKE2(FloatImm);
-REGISTER_MAKE1(StringImm);
-REGISTER_MAKE_BINARY_OP(Add, operator+);
-REGISTER_MAKE_BINARY_OP(Sub, operator-);
-REGISTER_MAKE_BINARY_OP(Mul, operator*);
-REGISTER_MAKE_BINARY_OP(Div, operator/);
-REGISTER_MAKE_BINARY_OP(Mod, operator%);
-REGISTER_MAKE_BINARY_OP(Min, min);
-REGISTER_MAKE_BINARY_OP(Max, max);
-REGISTER_MAKE_BINARY_OP(EQ, operator==);
-REGISTER_MAKE_BINARY_OP(NE, operator!=);
-REGISTER_MAKE_BINARY_OP(LT, operator<); // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(LE, operator<=); // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(GT, operator>);  // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(GE, operator>=);
-REGISTER_MAKE_BINARY_OP(And, operator&&);
-REGISTER_MAKE_BINARY_OP(Or, operator||);
+REGISTER_MAKE_BINARY_OP(_OpAdd, operator+);
+REGISTER_MAKE_BINARY_OP(_OpSub, operator-);
+REGISTER_MAKE_BINARY_OP(_OpMul, operator*);
+REGISTER_MAKE_BINARY_OP(_OpDiv, operator/);
+REGISTER_MAKE_BINARY_OP(_OpMod, operator%);
+REGISTER_MAKE_BINARY_OP(_OpMin, min);
+REGISTER_MAKE_BINARY_OP(_OpMax, max);
+REGISTER_MAKE_BINARY_OP(_OpEQ, operator==);
+REGISTER_MAKE_BINARY_OP(_OpNE, operator!=);
+REGISTER_MAKE_BINARY_OP(_OpLT, operator<); // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpLE, operator<=); // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpGT, operator>);  // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpGE, operator>=);
+REGISTER_MAKE_BINARY_OP(_OpAnd, operator&&);
+REGISTER_MAKE_BINARY_OP(_OpOr, operator||);
 REGISTER_MAKE_BIT_OP(bitwise_and, operator&);
 REGISTER_MAKE_BIT_OP(bitwise_or, operator|);
 REGISTER_MAKE_BIT_OP(bitwise_xor, operator^);
 REGISTER_MAKE_BIT_OP(left_shift, operator<<); // NOLINT(*)
 REGISTER_MAKE_BIT_OP(right_shift, operator>>);
-REGISTER_MAKE1(Not);
-REGISTER_MAKE3(Select);
-REGISTER_MAKE3(Ramp);
-REGISTER_MAKE2(Cast);
-REGISTER_MAKE2(Broadcast);
-REGISTER_MAKE2(Shuffle);
-REGISTER_MAKE3(Let);
-REGISTER_MAKE3(LetStmt);
-REGISTER_MAKE3(AssertStmt);
-REGISTER_MAKE3(ProducerConsumer);
-REGISTER_MAKE5(Allocate);
-REGISTER_MAKE4(Provide);
-REGISTER_MAKE4(Prefetch);
-REGISTER_MAKE1(Free);
-REGISTER_MAKE2(Block);
-REGISTER_MAKE3(IfThenElse);
-REGISTER_MAKE1(Evaluate);
 
 }  // namespace ir
 }  // namespace tvm
diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h
index 5f44347f3539..218e9d218a66 100644
--- a/src/arithmetic/compute_expr.h
+++ b/src/arithmetic/compute_expr.h
@@ -14,10 +14,6 @@
 namespace tvm {
 namespace arith {
 
-using HalideIR::Internal::add_would_overflow;
-using HalideIR::Internal::sub_would_overflow;
-using HalideIR::Internal::mul_would_overflow;
-
 /*!
  * \brief Compute the expression with the given binary op.
  * \param lhs The left operand
@@ -42,23 +38,9 @@ template<typename Op>
 inline Expr ComputeReduce(
     const Array<Expr>& values, Expr empty_value);
 
-template<typename T>
-inline bool GetConst(Expr e, T* out);
-
-template<>
-inline bool GetConst<int64_t>(Expr e, int64_t *out) {
-  if (e.type().is_vector()) return false;
-  const int64_t *v = as_const_int(e);
-  if (v) {
-    *out = *v; return true;
-  } else {
-    return false;
-  }
-}
-template<>
-inline bool GetConst<uint64_t>(Expr e, uint64_t *out) {
+inline bool GetConst(Expr e, int64_t* out) {
   if (e.type().is_vector()) return false;
-  const uint64_t *v = as_const_uint(e);
+  const int64_t* v = as_const_int(e);
   if (v) {
     *out = *v; return true;
   } else {
@@ -69,66 +51,37 @@ inline bool GetConst<uint64_t>(Expr e, uint64_t *out) {
 // get a small constant int
 inline bool GetConstInt(Expr e, int* out) {
   int64_t v1 = 0;
-  uint64_t v2 = 0;
   if (GetConst(e, &v1)) {
     if (v1 > static_cast<int64_t>(
             std::numeric_limits<int>::max())) return false;
     *out = static_cast<int>(v1); return true;
   }
-  if (GetConst(e, &v2)) {
-    if (v2 > static_cast<uint64_t>(
-            std::numeric_limits<int>::max())) return false;
-    *out = static_cast<int>(v2); return true;
-  }
   return false;
 }
 
-#define TVM_CONST_PROPAGATION(OP_NAME, OP)                       \
-  int64_t ia = 0, ib = 0;                                        \
-  if (GetConst(a, &ia) && GetConst(b, &ib)) {                    \
-    if (OP_NAME ## _would_overflow(a.type().bits(), ia, ib)) {   \
-      LOG(FATAL) << "signed int overflow";                       \
-    }                                                            \
-    return ir::IntImm::make(a.type(), ia OP ib);                 \
-  }                                                              \
-  uint64_t ua = 0, ub = 0;                                       \
-  if (GetConst(a, &ua) && GetConst(b, &ub)) {                    \
-    return ir::UIntImm::make(a.type(), ua OP ub);                \
-  }                                                              \
-
 template<>
 inline Expr ComputeExpr<ir::Add>(Expr a, Expr b) {
-  if (is_zero(a)) return b;
-  if (is_zero(b)) return a;
-  TVM_CONST_PROPAGATION(add, +);
-  return ir::Add::make(a, b);
+  return a + b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Sub>(Expr a, Expr b) {
-  if (is_zero(b)) return a;
-  TVM_CONST_PROPAGATION(sub, -);
-  return ir::Sub::make(a, b);
+  return a - b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Mul>(Expr a, Expr b) {
-  if (is_one(a)) return b;
-  if (is_one(b)) return a;
-  TVM_CONST_PROPAGATION(mul, *);
-  return ir::Mul::make(a, b);
+  return a * b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Div>(Expr a, Expr b) {
-  if (is_one(b)) return a;
-  return ir::Div::make(a, b);
+  return a / b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Mod>(Expr a, Expr b) {
-  if (is_zero(a)) return make_zero(a.type());
-  return ir::Mod::make(a, b);
+  return a % b;
 }
 
 template<>
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 109cdc6d9146..4e6d8caf3772 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -194,7 +194,7 @@ bool DetectClipBound(
   if (!LinearEqDetector(var).Detect(canonical, &ret)) return false;
   ret.coeff = Simplify(ret.coeff);
   IntervalEntry& p = (*bmap)[var.get()];
-  if (is_one(ret.coeff)) {
+  if (is_const_int(ret.coeff, 1)) {
     // var + shift >=0 -> var >= -shift
     if (p.min_value.defined()) {
       p.min_value = ir::Max::make(p.min_value, -ret.base);
@@ -203,7 +203,7 @@ bool DetectClipBound(
     }
     return true;
   }
-  if (is_const(ret.coeff, -1)) {
+  if (is_const_int(ret.coeff, -1)) {
     // -var + shift >=0 -> var <= shift
     if (p.max_value.defined()) {
       p.max_value = ir::Min::make(p.max_value, ret.base);
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 7c8399cfc7b5..0960106ae471 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -42,7 +42,7 @@ std::string CodeGenCUDA::Finish() {
 }
 
 void CodeGenCUDA::VisitStmt_(const ir::For* op) {
-  CHECK(is_zero(op->min));
+  CHECK(is_const_int(op->min, 0));
   if (op->for_type == ir::ForType::Unrolled) {
     PrintIndent();
     stream << "#pragma unroll\n";
diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc
index dea8ebaebb8d..0cc4b9cf3c21 100644
--- a/src/codegen/verilog/verilog_ir.cc
+++ b/src/codegen/verilog/verilog_ir.cc
@@ -195,7 +195,7 @@ class PipelineExtractor: public IRVisitor {
       ChannelEntry& cb = cmap_.at(ch->handle_var.get());
       trigger->signal_index = static_cast<int>(cb.node->ctrl_signals.size());
       // Grab the advance constant size.
-      int trigger_size;
+      int trigger_size = 0;
       if (attr->attr_key == attr::pipeline_stage_scope) {
         cb.node->ctrl_signals.push_back(
             ControlSignalNode::make(kComputeFinish, 0));
diff --git a/src/lang/expr.cc b/src/lang/expr.cc
index 062ea9217e63..7ac0e372371c 100644
--- a/src/lang/expr.cc
+++ b/src/lang/expr.cc
@@ -5,6 +5,7 @@
 #include <tvm/base.h>
 #include <tvm/expr.h>
 #include <tvm/ir.h>
+#include <tvm/ir_operator.h>
 #include <ir/IRPrinter.h>
 #include <memory>
 
diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index 5cad23e8ce57..30742764351d 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -8,6 +8,406 @@
 
 namespace tvm {
 
+/*!
+ * \brief Check whether type is used to represent index.
+ *
+ * Index types are frequently used in shape computation
+ * and need to be aggressively constant-folded.
+ *
+ * \param type The type to represent index.
+ * \return the checked result.
+ */
+inline bool IsIndexType(const Type& type) {
+  return type.is_int() && type.lanes() == 1 &&
+      (type.bits() == 32 || type.bits() == 64);
+}
+
+// simple cast that only checks if type matches and cast
+inline Expr SimpleCast(const Type& t, Expr value) {
+  if (value.type() == t) return value;
+  return ir::Cast::make(t, value);
+}
+
+// The public function with a quick checking path.
+void BinaryOpMatchTypes(Expr& lhs, Expr& rhs) {  // NOLINT(*)
+  if (lhs.type() == rhs.type()) return;
+  Type ltype = lhs.type();
+  Type rtype = rhs.type();
+  if (ltype.lanes() == 1 && rtype.lanes() != 1) {
+    lhs = ir::Broadcast::make(lhs, rtype.lanes());
+  } else if (rtype.lanes() == 1 && ltype.lanes() != 1) {
+    rhs = ir::Broadcast::make(rhs, ltype.lanes());
+  } else {
+    CHECK(ltype.lanes() == rtype.lanes())
+        << "Cannot match type " << ltype << " vs " << rtype;
+  }
+  if (lhs.type() == rhs.type()) return;
+  // Only do very simple type coversion
+  // int->float, int(32)->int(64)
+  // require the types to be relatively consistent
+  // This will the reduce amount code generated by operators
+  // and also help user to find potential type conversion problems.
+  if (!lhs.type().is_float() && rhs.type().is_float()) {
+    // int->float
+    lhs = ir::Cast::make(rhs.type(), lhs);
+  } else if (lhs.type().is_float() && !rhs.type().is_float()) {
+    // int->float
+    rhs = ir::Cast::make(lhs.type(), rhs);
+  } else if ((lhs.type().is_int() && rhs.type().is_int()) ||
+             (lhs.type().is_uint() && rhs.type().is_uint())) {
+    // promote int to higher bits
+    if (lhs.type().bits() < rhs.type().bits()) {
+      lhs = ir::Cast::make(rhs.type(), lhs);
+    } else {
+      rhs = ir::Cast::make(lhs.type(), rhs);
+    }
+  } else if ((lhs.type().is_int() && rhs.type().is_uint()) ||
+             (lhs.type().is_uint() && rhs.type().is_int())) {
+    int bits = std::max(lhs.type().bits(), rhs.type().bits());
+    lhs = SimpleCast(Int(bits, lhs.type().lanes()), lhs);
+    rhs = SimpleCast(Int(bits, rhs.type().lanes()), rhs);
+  } else {
+    LOG(FATAL) << "Cannot match type " << ltype << " vs " << rtype;
+  }
+}
+
+
+template<typename ValueType>
+inline bool ConstPowerHelper(ValueType val, int *shift) {
+  if (val <= 0) return false;
+  shift[0] = 0;
+  while (val != 0) {
+    if (val & 1) {
+      return (val == 1);
+    }
+    ++shift[0];
+    val = val >> 1;
+  }
+  return true;
+}
+
+bool is_const_power_of_two_integer(const Expr& x, int* shift) {
+  if (const auto* op = x.as<ir::IntImm>()) {
+    return ConstPowerHelper(op->value, shift);
+  } else if (const auto* op = x.as<ir::UIntImm>()) {
+    return ConstPowerHelper(op->value, shift);
+  } else {
+    return false;
+  }
+}
+
+Expr cast(const Type& t, Expr value) {
+  using ir::IntImm;
+  if (value.type() == t) return value;
+  // const fold IntImm as they are used in index computations
+  if (t.lanes() == 1) {
+    if (const IntImm* op = value.as<IntImm>()) {
+      return make_const(t, op->value);
+    }
+    return ir::Cast::make(t, value);
+  } else {
+    if (value.type().lanes() == 1) {
+      // manually unroll cast
+      Type vtype = t.element_of();
+      if (value.type() != vtype) {
+        if (const IntImm* op = value.as<IntImm>()) {
+          value = make_const(vtype, op->value);
+        } else {
+          value = ir::Cast::make(vtype, value);
+        }
+      }
+      return ir::Broadcast::make(value, t.lanes());
+    } else {
+      CHECK(value.type().lanes() == t.lanes());
+      return ir::Cast::make(t, value);
+    }
+  }
+}
+
+Expr reinterpret(const Type& t, Expr value) {
+  if (value.type() == t) return value;
+  return ir::Call::make(t, ir::Call::reinterpret, { value }, ir::Call::PureIntrinsic);
+}
+
+#define TVM_CONST_PROPAGATION(BODY)                                     \
+  using ir::IntImm;                                                     \
+  using ir::UIntImm;                                                    \
+  const IntImm* pa = a.as<IntImm>();                                    \
+  const IntImm* pb = b.as<IntImm>();                                    \
+  const Type& ta = a.type();                                            \
+  const Type& tb = b.type();                                            \
+  if (IsIndexType(ta) && IsIndexType(tb)) {                             \
+    BODY;                                                               \
+  }                                                                     \
+  BinaryOpMatchTypes(a, b);
+
+
+Expr operator+(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value + pb->value);
+      if (pa && pa->value == 0) return SimpleCast(rtype, b);
+      if (pb && pb->value == 0) return SimpleCast(rtype, a);
+    });
+  return ir::Add::make(a, b);
+}
+
+Expr operator-(Expr a) {
+  using ir::IntImm;
+  const IntImm* pa = a.as<IntImm>();
+  if (pa) {
+    return ir::IntImm::make(a.type(), -pa->value);
+  }
+  return make_zero(a.type()) - a;
+}
+
+Expr operator-(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value - pb->value);
+      if (pb && pb->value == 0) return SimpleCast(rtype, a);
+    });
+  return ir::Sub::make(a, b);
+}
+
+Expr operator*(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value * pb->value);
+      if (pa) {
+        if (pa->value == 1) return SimpleCast(rtype, b);
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return SimpleCast(rtype, a);
+        if (pb->value == 0) return SimpleCast(rtype, b);
+      }
+    });
+  return ir::Mul::make(a, b);
+}
+
+Expr operator/(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      // due to division and mod can have different modes
+      // only constant fold positive number where rule is fixed.
+      if (pa && pb && pa->value >= 0 && pb->value > 0) {
+        return IntImm::make(rtype, pa->value / pb->value);
+      }
+      if (pa) {
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return SimpleCast(rtype, a);
+        CHECK_NE(pb->value, 0) << "Divide by zero";
+      }
+    });
+  return ir::Div::make(a, b);
+}
+
+Expr operator%(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      // due to division and mod can have different modes
+      // only constant fold positive number where rule is fixed.
+      if (pa && pb && pa->value >= 0 && pb->value > 0) {
+        return IntImm::make(rtype, pa->value % pb->value);
+      }
+      if (pa) {
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return make_zero(rtype);
+        CHECK_NE(pb->value, 0) << "Divide by zero";
+      }
+    });
+  return ir::Mod::make(a, b);
+}
+
+Expr min(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value));
+    });
+  return ir::Min::make(a, b);
+}
+
+Expr max(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value));
+    });
+  return ir::Max::make(a, b);
+}
+
+Expr select(Expr cond, Expr true_value, Expr false_value) {
+  using ir::IntImm;
+  using ir::UIntImm;
+  CHECK(cond.type().is_bool());
+  BinaryOpMatchTypes(true_value, false_value);
+  if (const UIntImm* op = cond.as<UIntImm>()) {
+    if (op->value != 0) {
+      return true_value;
+    } else {
+      return false_value;
+    }
+  } else if (const IntImm* op = cond.as<IntImm>()) {
+    if (op->value != 0) {
+      return true_value;
+    } else {
+      return false_value;
+    }
+  }
+  return ir::Select::make(cond, true_value, false_value);
+}
+
+Expr likely(Expr cond) {
+  if (is_const(cond)) return cond;
+  return ir::Call::make(cond.type(), ir::Call::likely, { cond }, ir::Call::PureIntrinsic);
+}
+
+Expr operator>(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value);
+    });
+  return ir::GT::make(a, b);
+}
+
+Expr operator>=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value);
+    });
+  return ir::GE::make(a, b);
+}
+
+Expr operator<(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value);
+    });
+  return ir::LT::make(a, b);
+}
+
+Expr operator<=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value);
+    });
+  return ir::LE::make(a, b);
+}
+
+Expr operator==(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value);
+    });
+  return ir::EQ::make(a, b);
+}
+
+Expr operator!=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value);
+    });
+  return ir::NE::make(a, b);
+}
+
+Expr operator&&(Expr a, Expr b) {
+  using ir::UIntImm;
+  const UIntImm* pa = a.as<UIntImm>();
+  const UIntImm* pb = b.as<UIntImm>();
+  if (pa && pb) {
+    return UIntImm::make(UInt(1), pa->value && pb->value);
+  }
+  return ir::And::make(a, b);
+}
+
+Expr operator||(Expr a, Expr b) {
+  using ir::UIntImm;
+  const UIntImm* pa = a.as<UIntImm>();
+  const UIntImm* pb = b.as<UIntImm>();
+  if (pa && pb) {
+    return UIntImm::make(UInt(1), pa->value || pb->value);
+  }
+  return ir::Or::make(a, b);
+}
+
+Expr operator!(Expr a) {
+  using ir::UIntImm;
+  const UIntImm* pa = a.as<UIntImm>();
+  if (pa) {
+    return UIntImm::make(UInt(1), !(pa->value));
+  }
+  return ir::Not::make(a);
+}
+
+Expr operator>>(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value >> pb->value));
+      if (pb) {
+        if (pb->value == 0) return SimpleCast(rtype, a);
+      }
+    });
+  return ir::Call::make(a.type(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator<<(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value << pb->value));
+      if (pb) {
+        if (pb->value == 0) return SimpleCast(rtype, a);
+      }
+    });
+  return ir::Call::make(a.type(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator&(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value & pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator|(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value | pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator^(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value ^ pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator~(Expr a) {
+  CHECK(a.type().is_int() || a.type().is_uint());
+  return ir::Call::make(a.type(), ir::Call::bitwise_not, { a }, ir::Call::PureIntrinsic);
+}
+
+Expr pow(Expr x, Expr y) {
+  BinaryOpMatchTypes(x, y);
+  CHECK(x.type().is_float()) << "power only applies to float";
+  return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+}
+
+Expr abs(Expr x) {
+  if (x.type().is_int()) {
+    return select(x >= make_zero(x.type()), x, -x);
+  } else if (x.type().is_float()) {
+    return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
+  } else if (x.type().is_uint()) {
+    return x;
+  } else {
+    LOG(FATAL) << "Data type " << x.type()
+               <<" not supported for absolute op. Skipping absolute op...";
+    return x;
+  }
+}
+
 Expr sum(Expr source, Array<IterVar> rdom) {
   Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Add::make(x, y);
@@ -38,7 +438,7 @@ Expr min(Expr source, Array<IterVar> rdom) {
 Expr prod(Expr source, Array<IterVar> rdom) {
   Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Mul::make(x, y);
-  Expr identity_element = make_one(source.type());
+  Expr identity_element = make_const(source.type(), 1);
   ir::CommReducer combiner =
     ir::CommReducerNode::make({x}, {y}, {result}, {identity_element});
   return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h
index f871133fb74f..3cef4486ee1b 100644
--- a/src/pass/ir_util.h
+++ b/src/pass/ir_util.h
@@ -7,6 +7,7 @@
 #define TVM_PASS_IR_UTIL_H_
 
 #include <tvm/ir.h>
+#include <tvm/ir_operator.h>
 #include <tvm/runtime/device_api.h>
 #include <vector>
 
@@ -75,7 +76,7 @@ inline Expr TVMStructGet(
   Array<Expr> args ={
     handle,
     make_const(Int(32), index),
-    make_const(Int(32), kind)};
+    make_const(Int(32), static_cast<int>(kind))};
   return Call::make(dtype, intrinsic::tvm_struct_get, args, Call::PureIntrinsic);
 }
 
@@ -125,7 +126,7 @@ inline Stmt TVMStructSet(
   Array<Expr> args ={
     handle,
     make_const(Int(32), index),
-    make_const(Int(32), kind),
+    make_const(Int(32), static_cast<int>(kind)),
     value};
   return Evaluate::make(
       Call::make(Int(32), intrinsic::tvm_struct_set, args, Call::Intrinsic));
diff --git a/src/pass/split_pipeline.cc b/src/pass/split_pipeline.cc
index 0dd5bd65106f..c143a0d19153 100644
--- a/src/pass/split_pipeline.cc
+++ b/src/pass/split_pipeline.cc
@@ -102,9 +102,8 @@ class MarkChannelAccess : public IRMutator {
     } else {
       alloc_size = op->extents[0];
       for (size_t i = 1; i < op->extents.size(); ++i) {
-        alloc_size *= op->extents[i];
+        alloc_size = alloc_size * op->extents[i];
       }
-      alloc_size = ir::Simplify(alloc_size);
     }
 
     if (rw.write_count) {
diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 2bab21d85737..54f5010f1461 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -578,7 +578,7 @@ class StoragePlanRewriter : public IRMutator {
           combo_size = combo_size / type_bits;
           // round up for can not divided
           if (!divided) {
-             combo_size += make_const(Int(32), 1);
+             combo_size = combo_size + make_const(Int(32), 1);
           }
           combo_size = ir::Simplify(combo_size);
           e->new_alloc = Allocate::make(
diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc
index 206b75ed068d..fe2f819809fd 100644
--- a/src/pass/vectorize_loop.cc
+++ b/src/pass/vectorize_loop.cc
@@ -437,7 +437,6 @@ class LoopVectorizer : public IRMutator {
   Stmt Mutate_(const For* op, const Stmt& s) final {
     if (op->for_type == ForType::Vectorized) {
       CHECK(is_zero(op->min));
-      CHECK(is_positive_const(op->extent));
       int lanes = 0;
       bool succ = arith::GetConstInt(op->extent, &lanes);
       if (!succ || lanes < 1) {
diff --git a/tests/cpp/ir_mutator_test.cc b/tests/cpp/ir_mutator_test.cc
index fd5a60756f1c..0802d405bbe4 100644
--- a/tests/cpp/ir_mutator_test.cc
+++ b/tests/cpp/ir_mutator_test.cc
@@ -1,6 +1,7 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
 #include <tvm/ir_mutator.h>
+#include <tvm/ir_operator.h>
 
 namespace {
 using namespace tvm::ir;
diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py
index 78589cf3af0e..9b869feddc9d 100644
--- a/tests/python/unittest/test_arith_intset.py
+++ b/tests/python/unittest/test_arith_intset.py
@@ -35,7 +35,7 @@ def test_deduce():
 
     e1 = (a*4+b < c)
     res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
-    ans1 = (((c - b) + -1)/4) 
+    ans1 = (((c - b) + -1)/4)
     assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)
 
     e2 = (tvm.max(5, a * 4) < 0)
@@ -63,7 +63,7 @@ def test_check():
     assert res1.is_nothing()
 
     # multiple compare operators
-    res2 = tvm.arith.DeduceBound(a, (a+b>3)>c , {b: b_s, c: c_s}, {})
+    res2 = tvm.arith.DeduceBound(a, (a+b>3).astype(c.dtype)>c , {b: b_s, c: c_s}, {})
     assert res2.is_nothing()
 
     # multiple target variable
@@ -88,11 +88,11 @@ def test_basic(a1, a2, coff):
         res1 = tvm.arith.DeduceBound(a, e0<=17, {b: b_s}, {b: b_s})
         [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()]
         assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1
-      
+
         res1 = tvm.arith.DeduceBound(a, e0>=17, {b: b_s}, {b: b_s})
         [x, y] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()]
         assert (tvm.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1
-       
+
     test_basic(0, 4, 4)
     test_basic(1, 5, 4)
     test_basic(2, 6, 4)
@@ -137,4 +137,3 @@ def test_complex(a1, a2, coff):
     test_check()
     test_deduce_basic()
     test_deduce_complex()
-
diff --git a/tests/python/unittest/test_lang_basic.py b/tests/python/unittest/test_lang_basic.py
index c9a04747b56d..bf25ca3dfc85 100644
--- a/tests/python/unittest/test_lang_basic.py
+++ b/tests/python/unittest/test_lang_basic.py
@@ -8,7 +8,7 @@ def test_const():
 
 def test_make():
     x = tvm.const(1)
-    y = tvm.make.IntImm('int32', 1)
+    y = tvm.var("x")
     z = x + y
     assert isinstance(tvm.max(x, y), tvm.expr.Max)
     assert isinstance(tvm.min(x, y), tvm.expr.Min)
diff --git a/tests/python/unittest/test_lang_operator.py b/tests/python/unittest/test_lang_operator.py
new file mode 100644
index 000000000000..9c701ed2abe3
--- /dev/null
+++ b/tests/python/unittest/test_lang_operator.py
@@ -0,0 +1,35 @@
+import tvm
+
+def test_const_fold():
+    def check(f, *args):
+        x = f(*[tvm.const(x) for x in args])
+        y = f(*args)
+        if not isinstance(x, (tvm.expr.IntImm, tvm.expr.UIntImm)) or x.value != int(y):
+            raise ValueError("check error: %s vs %s " % (x, y))
+
+    check(lambda x, y: x + y, 3, 4)
+    check(lambda x, y: x * y, 3, 12)
+    check(lambda x, y: x * y - 10, 3, 12)
+    check(lambda x, y: x - y % 10, 3, 12)
+    check(lambda x, y: x // y + 10, 100, 12)
+    check(lambda x, y: x & y + 10, 112, 128)
+    check(lambda x, y: x > y, 112, 128)
+    check(lambda x, y: x < y, 112, 128)
+    check(lambda x, y: x <= y, 112, 128)
+    check(lambda x, y: x >= y, 112, 128)
+    check(lambda x, y: (x | y) ^ 10, 112, 128)
+
+
+def test_const_fold2():
+    x = tvm.var("x")
+    assert (x + 0).same_as(x)
+    assert (0 + x).same_as(x)
+    assert (x - 0).same_as(x)
+    assert (x % 1).value == 0
+    assert (x * 1).same_as(x)
+    assert (1 * x).same_as(x)
+    assert isinstance((1 / x), tvm.expr.Div)
+
+if __name__ == "__main__":
+    test_const_fold()
+    test_const_fold2()
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index 83b440a2c1d4..3ec760f20c76 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -15,7 +15,7 @@ def test_make_smap():
     # save load json
     x = tvm.const(1)
     y = tvm.const(10)
-    z = x + y
+    z = tvm.expr.Add(x, y)
     smap = tvm.convert({"z": z, "x": x})
     json_str = tvm.save_json(tvm.convert([smap]))
     arr = tvm.load_json(json_str)
diff --git a/tests/python/unittest/test_pass_simplify.py b/tests/python/unittest/test_pass_simplify.py
index c38083822fe2..fce6eaed5a1f 100644
--- a/tests/python/unittest/test_pass_simplify.py
+++ b/tests/python/unittest/test_pass_simplify.py
@@ -53,7 +53,6 @@ def test_canonical():
     assert (tvm.ir_pass.Equal(ret1, ret2))
 
 if __name__ == "__main__":
-    test_modular()
     test_bound()
     test_basic()
     test_simplify()
diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h
index 88c77f0afc52..02bc51515159 100644
--- a/topi/include/topi/elemwise.h
+++ b/topi/include/topi/elemwise.h
@@ -163,7 +163,7 @@ inline Tensor full(const Array<Expr>& shape,
                    const Expr fill_value,
                    std::string name = "tensor",
                    std::string tag = kElementWise) {
-  Expr ev = lossless_cast(dtype, fill_value);
+  Expr ev = cast(dtype, fill_value);
   if (!ev.defined()) {
     LOG(ERROR) << "Can't cast fill_value to " << dtype;
   }
@@ -173,7 +173,7 @@ inline Tensor full(const Array<Expr>& shape,
 }
 
 /*!
-* \brief Creates an operation that construct a tensor with same shape as input tensor, 
+* \brief Creates an operation that construct a tensor with same shape as input tensor,
 * then fill a tensor with fill_value
 *
 * \param x The input tensor
@@ -187,10 +187,7 @@ inline Tensor full_like(const Tensor& x,
                         const Expr fill_value,
                         std::string name = "tensor",
                         std::string tag = kElementWise) {
-  Expr ev = lossless_cast(x->dtype, fill_value);
-  if (!ev.defined()) {
-    LOG(ERROR) << "Can't cast fill_value to " << x->dtype;
-  }
+  Expr ev = cast(x->dtype, fill_value);
   return compute(x->shape, [&](const Array<Var>& i) {
       return ev;
   }, name, tag);
diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h
index ca318adfe6cb..795d04a31a46 100644
--- a/topi/include/topi/nn/pooling.h
+++ b/topi/include/topi/nn/pooling.h
@@ -94,10 +94,10 @@ inline Tensor pool_impl(const Tensor& x,
   out_shape.Set(height_axis, out_height);
   out_shape.Set(width_axis, out_width);
 
-  const int64_t *padding_h0 = HalideIR::Internal::as_const_int(pad_top);
-  const int64_t *padding_w0 = HalideIR::Internal::as_const_int(pad_left);
-  const int64_t *padding_h1 = HalideIR::Internal::as_const_int(pad_bottom);
-  const int64_t *padding_w1 = HalideIR::Internal::as_const_int(pad_right);
+  const int64_t *padding_h0 = as_const_int(pad_top);
+  const int64_t *padding_w0 = as_const_int(pad_left);
+  const int64_t *padding_h1 = as_const_int(pad_bottom);
+  const int64_t *padding_w1 = as_const_int(pad_right);
   const bool do_pad = ((padding_h0 && *padding_h0) || (padding_w0 && *padding_w0)) ||
                       ((padding_h1 && *padding_h1) || (padding_w1 && *padding_w1));
 
@@ -192,7 +192,7 @@ inline bool find_height_width(const std::string& layout,
 *        Since pooling does not care about the factor size of dimensions
 *        other than `H` and `W`, one can pass `NCHWc` as well.
 * \param  count_include_pad Whether include padding in the calculation when pool_type is 'avg'
-*        
+*
 *
 * \return The output tensor in the same layout
 */
diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py
index a8f97146519b..4e6e6ab27fea 100644
--- a/topi/python/topi/vision/ssd/multibox.py
+++ b/topi/python/topi/vision/ssd/multibox.py
@@ -164,10 +164,10 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
         oy = py * vy * ah + ay
         ow = tvm.exp(pw * vw) * aw / 2.0
         oh = tvm.exp(ph * vh) * ah / 2.0
-        return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
+        return tvm.select(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)
 
     batch_size = cls_prob.shape[0]
     num_classes = cls_prob.shape[1]
@@ -191,7 +191,7 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
                 with ib.if_scope(j > 0):
                     temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i]
                     cls_id[0] = tvm.select(temp > score[0], j, cls_id[0])
-                    score[0] = tvm.make.Max(temp, score[0])
+                    score[0] = tvm.max(temp, score[0])
             with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)):
                 cls_id[0] = 0
             # [id, prob, xmin, ymin, xmax, ymax]

From 009f49027cd203ec76dccdc37f0208c5c604c681 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Mon, 1 Oct 2018 13:47:47 -0700
Subject: [PATCH 149/529] [RELAY] First pass at pretty printer (#1749)

---
 include/tvm/relay/base.h                 |   2 +
 include/tvm/relay/expr.h                 |   8 +
 include/tvm/relay/expr_functor.h         |   3 +-
 include/tvm/relay/type.h                 |   8 +
 python/tvm/relay/__init__.py             |   1 -
 python/tvm/relay/_expr.py                |   5 +
 python/tvm/relay/expr.py                 |   3 +
 python/tvm/relay/expr.pyi                |   2 +-
 src/relay/ir/debug_printer.cc            | 303 +++++++++++++
 src/relay/ir/doc.h                       | 514 +++++++++++++++++++++++
 src/relay/ir/expr.cc                     |   2 +-
 src/relay/pass/type_functor.h            |   4 +-
 tests/python/relay/test_debug_printer.py |  93 ++++
 13 files changed, 943 insertions(+), 5 deletions(-)
 create mode 100644 python/tvm/relay/_expr.py
 create mode 100644 src/relay/ir/debug_printer.cc
 create mode 100644 src/relay/ir/doc.h
 create mode 100644 tests/python/relay/test_debug_printer.py

diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index ab55f6f3965f..48fd59c19793 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -158,6 +158,8 @@ class RelayNode : public Node {
   TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
 };
 
+struct Environment;
+
 }  // namespace relay
 }  // namespace tvm
 
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 02753a76b0da..55080c0fddd9 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -376,6 +376,14 @@ class IfNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(If, IfNode, Expr);
 
+/*! \brief Print a debug representation of the expression to the stream.
+ *  \param env The environment.
+ *  \param e The expression
+ *  \param os the stream
+ *  \returns A reference to the stream.
+ */
+std::ostream& DebugPrint(const Environment& env, const Expr& e, std::ostream& os);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index e79535a5034b..1da66bc95f57 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -11,6 +11,7 @@
 #include <string>
 #include "./expr.h"
 #include "./op.h"
+#include "./error.h"
 
 namespace tvm {
 namespace relay {
@@ -89,7 +90,7 @@ class ExprFunctor<R(const Expr& n, Args...)> {
   virtual R VisitExpr_(const OpNode* op,
                        Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Node* op, Args...) {
-    throw dmlc::Error(std::string("Do not have a default for ") + op->type_key());
+    throw Error(std::string("Do not have a default for ") + op->type_key());
   }
 
  private:
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 53f484522518..ac4c0ec747af 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -365,6 +365,14 @@ class TypeRelationNode : public TypeConstraintNode {
 
 RELAY_DEFINE_NODE_REF(TypeRelation, TypeRelationNode, TypeConstraint);
 
+/*! \brief Print a debug representation of the type to the stream.
+ *  \param env The environment.
+ *  \param t The type
+ *  \param os the stream
+ *  \returns A reference to the stream.
+ */
+std::ostream& DebugPrint(const Environment& env, const Type& t, std::ostream& os);
+
 // The following fields contains advanced typing
 // Only keep the class name and reserved for future usage.
 class GenericTensorType;
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index fc1fae76ced1..ef35962b41f2 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -33,4 +33,3 @@
 Call = expr.Call
 Let = expr.Let
 If = expr.If
-Var = Var
diff --git a/python/tvm/relay/_expr.py b/python/tvm/relay/_expr.py
new file mode 100644
index 000000000000..1a27c4efc410
--- /dev/null
+++ b/python/tvm/relay/_expr.py
@@ -0,0 +1,5 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
+"""The interface of expr function exposed from C++."""
+from tvm._ffi.function import _init_api
+
+_init_api("relay._expr", __name__)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index e3b6c9d7e9ff..3f90a3af64a5 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -2,6 +2,7 @@
 """The expression nodes of Relay."""
 from __future__ import absolute_import
 from .base import NodeBase, register_relay_node
+from . import _expr
 from . import _make
 from .. import convert
 
@@ -115,3 +116,5 @@ class If(Expr):
     def __init__(self, cond, true_value, false_value):
         self.__init_handle_by_constructor__(
             _make.If, cond, true_value, false_value)
+
+debug_print = _expr._debug_print
diff --git a/python/tvm/relay/expr.pyi b/python/tvm/relay/expr.pyi
index fd30e3ed25cf..e73a5963e5b1 100644
--- a/python/tvm/relay/expr.pyi
+++ b/python/tvm/relay/expr.pyi
@@ -111,4 +111,4 @@ class If(Expr):
 
     def __init__(self, cond, true_value, false_value):
         # type: (Expr, Expr, Expr) -> None
-        ...
+        ...
\ No newline at end of file
diff --git a/src/relay/ir/debug_printer.cc b/src/relay/ir/debug_printer.cc
new file mode 100644
index 000000000000..e216faa0f195
--- /dev/null
+++ b/src/relay/ir/debug_printer.cc
@@ -0,0 +1,303 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file debug_printer.cc
+ * \brief A pretty printer for the Relay IR.
+ * As we had not determined a formal syntax yet, right now it is only for debug purpose.
+ */
+
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/environment.h>
+#include <tvm/relay/error.h>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include "../pass/type_functor.h"
+#include "doc.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace tvm::runtime;
+
+Doc KindDocify(TypeParamNode::Kind k) {
+  switch (k) {
+    case TypeParamNode::kShapeVar:
+      return DocOfStr("ShapeVar");
+    case TypeParamNode::kShape:
+      return DocOfStr("Shape");
+    case TypeParamNode::kBaseType:
+      return DocOfStr("BaseType");
+    case TypeParamNode::kType:
+      return DocOfStr("Type");
+    default:
+      LOG(FATAL) << "unreachable code: case not handle in kind";
+      throw;  // log fatal throw but compiler doesnt know
+  }
+}
+
+template<typename T>
+std::vector<Doc> MapDocify(const tvm::Array<T>& arr, const std::function<Doc(const T&)>& f) {
+  std::vector<Doc> vec;
+  for (size_t i = 0; i < arr.size(); ++i) {
+    vec.push_back(f(arr[i]));
+  }
+  return vec;
+}
+
+template<typename T, typename Hash = std::hash<T>, typename Eq = std::equal_to<T>>
+class Counter {
+  std::unordered_map<T, size_t, Hash, Eq> cnt_;
+
+ public:
+  Counter() = default;
+  Counter(const Counter&) = delete;
+  size_t operator()(const T& t) {
+    auto v = cnt_.count(t) == 0 ? 0 : cnt_.at(t) + 1;
+    cnt_[t] = v;
+    return v;
+  }
+};
+
+std::string Mangle(const std::string& str, size_t s) {
+  return str + "_" + std::to_string(s);
+  // return s == 0 ? str : str + "_" + std::to_string(s - 1);
+  // the above line look prettier but is dangerous:
+  // suppose we have x, x, x_0. mangling will give x, x_0, x_0!
+  // the save approach give x_0, x_1, x_0_1, and in fact never clash:
+  // stripping _([0-9]*) is invert of mangle under all circumstances.
+  // another problem is we need to prevent Var/TypeParam/GlobalVar clashing each other.
+}
+
+constexpr size_t indent = 2;
+
+struct TypeParamName {
+  bool operator==(const TypeParamName&) const {
+    return true;
+  }
+};
+
+struct mhash {
+  size_t operator()(const ::tvm::relay::TypeParamName&) const noexcept {
+    return 0;
+  }
+};
+
+class TypeDocifier : private TypeFunctor<Doc(const Type& n)> {
+  Environment env;
+  Counter<TypeParamName, mhash> cnt;
+  std::unordered_map<TypeParam, Doc, NodeHash, NodeEqual> map;
+
+  std::vector<Doc> DocifyTypeArray(const tvm::Array<Type>& arr) {
+    return MapDocify<Type>(arr, [=](const Type& t) { return Docify(t); });
+  }
+
+  std::vector<Doc> DocifyTypeParam(const tvm::Array<TypeParam>& arr) {
+    return MapDocify<TypeParam>(arr, [=](const TypeParam& tp) { return Docify(tp); });
+  }
+
+  std::vector<Doc> DocifyTypeConstraint(const tvm::Array<TypeConstraint>& arr) {
+    return MapDocify<TypeConstraint>(arr, [=](const TypeConstraint& tc) { return Docify(tc); });
+  }
+
+  Doc VisitType_(const TensorTypeNode* t) final {
+    return DocOfStr("tensor");
+  }
+
+  Doc VisitType_(const TypeParamNode* p) final {
+    auto tp = GetRef<TypeParam>(p);
+    if (map.count(tp) == 0) {
+      auto name =
+        DocOfStr(Mangle("tp", cnt(TypeParamName())) +
+                 std::string(":")) +
+        KindDocify(p->kind);
+      map.insert(std::pair<TypeParam, Doc>(tp, name));
+    }
+    return map.at(tp);
+  }
+
+  Doc Quantify(const tvm::Array<TypeParam>& tp, const Doc& d) {
+    if (tp.size() == 0) {
+      return d;
+    }
+    return Seq("forall", DocifyTypeParam(tp), ",") + Sep() + d;
+  }
+
+  Doc Constraint(const tvm::Array<TypeConstraint>& tc, const Doc& d) {
+    if (tc.size() == 0) {
+      return d;
+    }
+    return Seq("(", DocifyTypeConstraint(tc), ") =>") + Sep() + d;
+  }
+
+  Doc VisitType_(const FuncTypeNode* f) final {
+    auto inner = Seq("<", DocifyTypeArray(f->arg_types), ">") + Sep() +
+                 DocOfStr("->") + Sep() + Docify(f->ret_type);
+    return Group(Quantify(f->type_params,
+                          Constraint(f->type_constraints, inner)));
+  }
+
+  Doc VisitType_(const TypeRelationNode* r) final {
+    return DocOfStr("Relation") + Seq("(", DocifyTypeArray(r->args), ")");
+  }
+
+  Doc VisitType_(const TupleTypeNode* t) final {
+    return Seq("<", DocifyTypeArray(t->fields), ">");
+  }
+
+  Doc VisitType_(const IncompleteTypeNode* i) final {
+    return DocOfStr("_");
+  }
+
+ public:
+  TypeDocifier(const Environment& env) : env(env) { }
+
+  Doc Docify(const Type& t) { return t.get() ? (*this)(t) : DocOfStr("_"); }
+};
+
+class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
+  Environment env;
+  Counter<std::string> cnt;
+  std::unordered_map<Var, std::string, NodeHash, NodeEqual> map;
+  TypeDocifier td;
+
+  std::string VarName(const Var& v) {
+    if (map.count(v) == 0) {
+      map.insert(std::pair<Var, std::string>(v, Mangle(v->name_hint, cnt(v->name_hint))));
+    }
+    return map.at(v);
+  }
+
+  Doc TypeAnnotation(const Doc& d, const Type& t) {
+    // test for t being null. probably shouldnt has null. should talk to jared.
+    if (!t.get() || t.as<IncompleteTypeNode>()) {
+      return d;
+    } else {
+      return d + DocOfStr(":") + td.Docify(t);
+    }
+  }
+
+  std::vector<Doc> DocifyExprArray(const tvm::Array<Expr>& arr) {
+    std::vector<Doc> vec;
+    for (size_t i = 0; i < arr.size(); ++i) {
+      vec.push_back(Docify(arr[i]));
+    }
+    return vec;
+  }
+
+  std::vector<Doc> DocifyParamArray(const tvm::Array<Param>& arr) {
+    std::vector<Doc> vec;
+    for (size_t i = 0; i < arr.size(); ++i) {
+      vec.push_back(Docify(arr[i]));
+    }
+    return vec;
+  }
+
+  Doc VisitExpr_(const ConstantNode* c) final {
+    return DocOfStr("some_constant");
+  }
+
+  Doc VisitExpr_(const TupleNode* t) final {
+    return Seq("<", DocifyExprArray(t->fields), ">");
+  }
+
+  Doc VisitExpr_(const VarNode* v) final {
+    return DocOfStr(VarName(GetRef<Var>(v)));
+  }
+
+  Doc VisitExpr_(const GlobalVarNode* g) final {
+    return DocOfStr(g->name_hint);
+  }
+
+  Doc VisitExpr_(const ParamNode* p) final {
+    return TypeAnnotation(Docify(p->var), p->type);
+  }
+
+  Doc VisitExpr_(const FunctionNode* f) final {
+    return Group(TypeAnnotation(Seq("(", DocifyParamArray(f->params), ")"), f->ret_type) + Sep() +
+                 DocOfStr("=>") + Sep() +
+                 Block(indent, "{", Docify(f->body), "}"));
+  }
+
+  Doc VisitExpr_(const CallNode* c) final {
+    auto args = DocifyExprArray(c->args);
+    return Docify(c->op) + Seq("<", DocifyExprArray(c->args), ">");
+  }
+
+  Doc VisitExpr_(const LetNode* l) final {
+    return Group(DocOfStr("let") + Sep() + TypeAnnotation(Docify(l->var), l->value_type) + Sep() +
+                 DocOfStr("=") + Sep() + Docify(l->value) + DocOfStr(";") + Endl() +
+                 Docify(l->body));
+  }
+
+  Doc VisitExpr_(const IfNode* i) final {
+    return Group(DocOfStr("if") + Sep() + Docify(i->cond) + Sep() +
+                 Block(indent, "{", Docify(i->true_branch), "}") + Sep() +
+                 DocOfStr("else") + Sep() +
+                 Block(indent, "{", Docify(i->false_branch), "}"));
+  }
+
+  Doc VisitExpr_(const OpNode* o) final {
+    return DocOfStr(o->name);
+  }
+
+ public:
+  ExprDocifier(const Environment& env) : env(env), td(env) { }
+
+  Doc Docify(const Expr& e) { return (*this)(e); }
+};
+
+Doc DocOfExpr(const Environment& env, const Expr& expr) {
+  ExprDocifier d(env);
+  return d.Docify(expr);
+}
+
+Doc DocOfType(const Environment& env, const Type& expr) {
+  TypeDocifier d(env);
+  return d.Docify(expr);
+}
+
+RDoc ExprRDoc(const Environment& env, const Expr& expr) {
+  return Layout(DocOfExpr(env, expr));
+}
+
+RDoc TypeRDoc(const Environment& env, const Type& expr) {
+  return Layout(DocOfType(env, expr));
+}
+
+std::ostream & DebugPrint(const Environment& env, const Expr& e, std::ostream& os) {
+  return os << ExprRDoc(env, e);
+}
+
+std::ostream & DebugPrint(const Environment& env, const Type& t, std::ostream& os) {
+  return os << TypeRDoc(env, t);
+}
+
+std::string PrintExpr(const Environment& env, const Expr& e) {
+  std::stringstream ss;
+  ss << ExprRDoc(env, e);
+  return ss.str();
+}
+
+std::string PrintType(const Environment& env, const Type& t) {
+  std::stringstream ss;
+  ss << TypeRDoc(env, t);
+  return ss.str();
+}
+
+TVM_REGISTER_API("relay._expr._debug_print")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    NodeRef x = args[1];
+    std::cout << x << std::endl;
+    if (x.as<TypeNode>()) {
+      *ret = PrintType(args[0], Downcast<Type>(x));
+    } else {
+      *ret = PrintExpr(args[0], Downcast<Expr>(x));
+    }
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/doc.h b/src/relay/ir/doc.h
new file mode 100644
index 000000000000..15e965e5b818
--- /dev/null
+++ b/src/relay/ir/doc.h
@@ -0,0 +1,514 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file doc.h
+ * \brief A pretty printer DSL for constructing (Doc) and formatting (RDoc) documents.
+ *        It is based heavily on Philip Wadler's "A prettier printer."
+ *        See https://homepages.inf.ed.ac.uk/wadler/papers/prettier/prettier.pdf
+ *        for more details.
+ *
+ * Since the original paper uses call by value for efficiency, everything doc function is maximally lazy.
+ * You can probably yank speed by doing strict analysis and removing some Lazy (if this is bottleneck).
+ */
+#ifndef TVM_RELAY_IR_DOC_H_
+#define TVM_RELAY_IR_DOC_H_
+
+#include <unordered_map>
+#include <utility>
+#include <string>
+#include <functional>
+#include <vector>
+#include <memory>
+#include <ostream>
+#include <map>
+#include "error.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief A Document represent structured text.
+ * beside having unstructured string, it capture different ways to compose them -
+ * line break, space, indentation, representation choice.
+ */
+struct Doc;
+
+/*! \brief RDoc represent rendered document.
+ * all the high level detail on the document, such as indentation, choice, has been removed.
+ * there is only one single, straight forward way to print it.
+ */
+struct RDoc;
+
+//! \brief Empty document
+inline Doc Nil();
+
+//! \brief Concatenate two documents
+inline Doc App(const Doc& l, const Doc& r);
+
+//! \brief Indent a document
+inline Doc Nest(size_t width, const Doc& doc);
+
+//! \brief Lift string to a document
+inline Doc DocOfStr(const std::string& text);
+
+//! \brief New line
+inline Doc Endl();
+
+//! \brief Remove all line break from the Document.
+inline Doc Flatten(const Doc& d);
+
+/*! \brief Choose between two possible layouts.
+ * assume Flatten(l) == Flatten(r), and l need to be more compact.
+ */
+inline Doc Choose(const Doc& l, const Doc& r);
+
+//! \brief Use a single line if possible
+inline Doc Group(const Doc& d);
+
+//! \brief print an RDoc
+inline std::ostream& operator<<(std::ostream& os, const RDoc& rdoc);
+
+/*! \brief Joins a vector of documents with a given separator document
+ *  \example Join(["a", "b, "c"], ", ") => "a, b, c"
+ *  \param vec the vector of documents
+ *  \param sep the separator between documents
+ */
+inline Doc Join(const std::vector<Doc>& vec, const Doc& sep);
+
+/*! \brief Creates an indented block.
+ *  \param indent the indentation size
+ *  \param open the opening string
+ *  \param body the body of the block
+ *  \param close the closing string
+ */
+inline Doc Block(size_t indent, const std::string& open,
+                 const Doc& body, const std::string& close);
+
+/*! \brief Creates a comma-separated sequence with opening and closing strings.
+ *  \param open the opening string
+ *  \param body the body of the Block
+ *  \param close the closing string
+ */
+inline Doc Seq(const std::string& open,
+               const std::vector<Doc>& body, const std::string& close);
+
+//! \brief Either a space or a new line
+inline Doc Sep();
+
+/*! \brief Layout a document to a given width
+ *  \param d the document to render
+ *  \param width the line width
+ */
+inline RDoc Layout(const Doc& d, size_t width = 80);
+
+// end of API, start of implementation
+
+template<typename T>
+struct LazyNode {
+  mutable std::function<T()> thunk;
+  explicit LazyNode(const std::function<T()>& thunk) : thunk(thunk) { }
+};
+
+//! \brief denote a value that will be computed (at most once) on need.
+template<typename T>
+struct Lazy {
+  std::shared_ptr<LazyNode<T> > lazy_node;
+  explicit Lazy(const std::function<T()>& thunk) :
+    lazy_node(std::make_shared<LazyNode<T>>(thunk)) { }
+  explicit Lazy(const T& value) : Lazy([=]() { return value; }) { }
+  explicit Lazy(const Lazy<Lazy<T>>& thunk) : Lazy([=]() { return thunk.get().get(); }) { }
+  // calculate the result.
+  // memoize it by replacing the thunk with a constant function which immediate return.
+  T get() const {
+    T res = lazy_node->thunk();
+    lazy_node->thunk = [=]() { return res; };
+    return res;
+  }
+  template<typename R>
+  Lazy<R> map(const std::function<R(const T&)>& func) const {
+    Lazy<T> self(*this);
+    return Lazy<R>([=]() -> R { return func(self.get()); });
+  }
+};
+
+struct NilNode;
+struct AppNode;
+struct NestNode;
+struct TextNode;
+struct LineNode;
+struct ChoiceNode;
+
+/*! \brief The inner representation of Doc.
+ * a doc represent structured text,
+ * and can be rendered onto screen while keeping the structure.
+ */
+struct DocNode {
+  /* a docnode is a union of the below node.
+   * exactly one of them will be non null.
+   * their meaning is denoted by the construction function of the same name.
+   * so for example, the meaning of AppNode is exactly a node construct by App.
+   */
+  std::shared_ptr<NilNode> nil;
+  std::shared_ptr<AppNode> app;
+  std::shared_ptr<NestNode> nest;
+  std::shared_ptr<TextNode> text;  // construct by DocOfStr
+  std::shared_ptr<LineNode> line;
+  std::shared_ptr<ChoiceNode> choice;
+  DocNode(std::shared_ptr<NilNode> nil,
+           std::shared_ptr<AppNode> app,
+           std::shared_ptr<NestNode> nest,
+           std::shared_ptr<TextNode> text,
+           std::shared_ptr<LineNode> line,
+           std::shared_ptr<ChoiceNode> choice) :
+    nil(nil),
+    app(app),
+    nest(nest),
+    text(text),
+    line(line),
+    choice(choice) { }
+};
+
+struct Doc {
+  Lazy<DocNode> doc;
+  explicit Doc(const DocNode& ed) : doc(ed) { }
+  explicit Doc(const Lazy<Doc>& ldoc) :
+    doc(ldoc.map<Lazy<DocNode> >([](const Doc& d){ return d.doc; })) { }
+
+  Doc operator+(const Doc& r) const {
+    return App(*this, r);
+  }
+
+  template<typename T>
+  Lazy<T> Match(
+    const std::function<T()>& nilf,
+    const std::function<T(const Doc&, const Doc&)>& appf,
+    const std::function<T(size_t, const Doc&)>& nestf,
+    const std::function<T(const std::string&)>& textf,
+    const std::function<T()>& linef,
+    const std::function<T(const Doc&, const Doc&)>& choicef) const;
+};
+
+struct NilNode { };
+
+struct AppNode {
+  Doc left, right;
+  AppNode(const Doc& left, const Doc& right) : left(left), right(right) { }
+};
+
+struct NestNode {
+  size_t space;
+  Doc doc;
+  NestNode(size_t space, const Doc& doc) : space(space), doc(doc) { }
+};
+
+struct TextNode {
+  std::string text;
+  explicit TextNode(const std::string& text) : text(text) { }
+};
+
+struct LineNode { };
+
+struct ChoiceNode {
+  Doc left, right;
+  ChoiceNode(const Doc& left, const Doc& right) : left(left), right(right) { }
+};
+
+template<typename T>
+Lazy<T> Doc::Match(
+    const std::function<T()>& nilf,
+    const std::function<T(const Doc&, const Doc&)>& appf,
+    const std::function<T(size_t, const Doc&)>& nestf,
+    const std::function<T(const std::string&)>& textf,
+    const std::function<T()>& linef,
+    const std::function<T(const Doc&, const Doc&)>& choicef) const {
+    return doc.map<T>([=](const DocNode& d) {
+      if (d.nil) {
+        return nilf();
+      } else if (d.app) {
+        return appf(d.app->left, d.app->right);
+      } else if (d.nest) {
+        return nestf(d.nest->space, d.nest->doc);
+      } else if (d.text) {
+        return textf(d.text->text);
+      } else if (d.line) {
+        return linef();
+      } else {
+        return choicef(d.choice->left, d.choice->right);
+      }
+    });
+}
+
+//! \brief Empty document
+inline Doc Nil() {
+  return Doc(DocNode(std::make_shared<NilNode>(), nullptr, nullptr, nullptr, nullptr, nullptr));
+}
+
+//! \brief Concatenate two documents
+inline Doc App(const Doc& l, const Doc& r) {
+  return Doc(DocNode(
+    nullptr,
+    std::make_shared<AppNode>(l, r),
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr));
+}
+
+//! \brief Indent a document
+inline Doc Nest(size_t width, const Doc& doc) {
+  auto x = std::make_shared<NestNode>(width, doc);
+  return Doc(DocNode(
+    nullptr,
+    nullptr,
+    std::make_shared<NestNode>(width, doc),
+    nullptr,
+    nullptr,
+    nullptr));
+}
+
+//! \brief Lift string to a document
+inline Doc DocOfStr(const std::string& text) {
+  return Doc(DocNode(nullptr, nullptr, nullptr,
+    std::make_shared<TextNode>(text), nullptr, nullptr));
+}
+
+//! \brief New line
+inline Doc Endl() {
+  return Doc(DocNode(nullptr, nullptr, nullptr, nullptr, std::make_shared<LineNode>(), nullptr));
+}
+
+/*! \brief Choose between two possible layouts.
+ * assume Flatten(l) == Flatten(r), and l need to be more compact.
+ */
+inline Doc Choose(const Doc& l, const Doc& r) {
+  return Doc(DocNode(
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    std::make_shared<ChoiceNode>(l, r)));
+}
+
+//! \brief Remove new line from the whole document.
+inline Doc Flatten(const Doc& d) {
+  return Doc(d.Match<Doc>(
+    []() { return Nil(); },
+    [](const Doc& l, const Doc& r) { return Flatten(l) + Flatten(r); },
+    [](size_t space, const Doc& doc) { return Flatten(doc); },
+    [](const std::string& str) { return DocOfStr(str); },
+    []() { return DocOfStr(" "); },
+    [](const Doc& l, const Doc& r) { return Flatten(l); }));
+}
+
+//! \brief Use a single line if possible
+inline Doc Group(const Doc& d) {
+  return Choose(Flatten(d), d);
+}
+
+struct RNilNode;
+struct RTextNode;
+struct RLineNode;
+
+struct RDocNode {
+  std::shared_ptr<RNilNode> rnil;
+  std::shared_ptr<RTextNode> rtext;
+  std::shared_ptr<RLineNode> rline;
+  RDocNode(const std::shared_ptr<RNilNode>& rnil,
+           const std::shared_ptr<RTextNode>& rtext,
+           const std::shared_ptr<RLineNode>& rline) :
+    rnil(rnil), rtext(rtext), rline(rline) { }
+};
+
+/*! \brief RDoc represent rendered document.
+ * all the high level detail on the document, such as indentation, alternative, has been removed.
+ * there is only one single, straight forward way to print it.
+ */
+struct RDoc {
+  Lazy<RDocNode> doc;
+  explicit RDoc(const RDocNode& d) : doc(d) { }
+  explicit RDoc(const Lazy<RDoc>& ldoc) :
+    doc(ldoc.map<Lazy<RDocNode>>([](const RDoc& d){ return d.doc; })) { }
+  template<typename T>
+  Lazy<T> Match(
+    const std::function<T()> &rnilf,
+    const std::function<T(const std::string&, const RDoc&)>& rtextf,
+    const std::function<T(size_t, const RDoc&)>& rlinef) const;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const RDoc& rdoc) {
+  return *rdoc.Match<std::ostream*>(
+    [&]() { return & os; },
+    [&](const std::string& text, const RDoc& r) {
+      return & (os << text << r);
+    },
+    [&](size_t space, const RDoc& r) {
+      return & (os << std::endl << std::string(space, ' ') << r);
+    }).get();
+}
+
+struct RNilNode { };
+
+struct RTextNode {
+  std::string text;
+  RDoc rest;
+  RTextNode(const std::string& text, const RDoc& rest) : text(text), rest(rest) { }
+};
+
+struct RLineNode {
+  size_t space;
+  RDoc rest;
+  RLineNode(size_t space, const RDoc& rest) : space(space), rest(rest) { }
+};
+
+//! \brief Empty RDoc
+inline RDoc RNil() { return RDoc(RDocNode(std::make_shared<RNilNode>(), nullptr, nullptr)); }
+
+//! \brief RDoc that begin with std::string
+inline RDoc RText(const std::string& text, const RDoc& rest) {
+  return RDoc(RDocNode(nullptr, std::make_shared<RTextNode>(text, rest), nullptr));
+}
+
+//! \brief RDoc that begin with a new line, followed by space
+inline RDoc RLine(size_t space, const RDoc& rest) {
+  return RDoc(RDocNode(nullptr, nullptr, std::make_shared<RLineNode>(space, rest)));
+}
+
+template<typename T>
+Lazy<T> RDoc::Match(
+  const std::function<T()>& rnilf,
+  const std::function<T(const std::string&, const RDoc&)>& rtextf,
+  const std::function<T(size_t, const RDoc&)>& rlinef) const {
+  return doc.map<T>([=](const RDocNode& rdoc) {
+    if (rdoc.rnil) {
+      return rnilf();
+    } else if (rdoc.rtext) {
+      return rtextf(rdoc.rtext->text, rdoc.rtext->rest);
+    } else {
+      return rlinef(rdoc.rline->space, rdoc.rline->rest);
+    }
+  });
+}
+
+template<typename T>
+struct List;
+
+template<typename T>
+struct EagerList {
+  const std::shared_ptr<std::pair<T, List<T>>> cons;
+};
+
+//! \brief lazy list
+template<typename T>
+struct List {
+  Lazy<EagerList<T> > l;
+  List() : l([]() { return EagerList<T>({nullptr}); }) { }
+  List(const T& t, const List<T>& l) :
+    l([=]() { return EagerList<T>({std::make_shared<std::pair<T, List<T>>>(t, l)}); }) { }
+  template<typename R>
+  Lazy<R> Match(const std::function<R()>& nullf,
+                const std::function<R(const T&, const List<T>&)>& consf) const {
+    return l.template map<R>([=](const EagerList<T>& l) {
+        if (l.cons) {
+          return consf(l.cons->first, l.cons->second);
+        } else {
+          return nullf();
+        }
+    });
+  }
+};
+
+//! \brief Does x fit into line of size w?
+inline bool Fits(int w, const RDoc& x) {
+  return (w >= 0) && x.Match<bool>(
+    []() { return true; },
+    [=](const std::string& s, const RDoc& x) { return Fits(w - s.size(), x); },
+    [](size_t space, const RDoc& x) { return true; }).get();
+}
+
+//! \brief Choose the one that fits best.
+inline RDoc Better(size_t w, size_t k, const RDoc& x, const RDoc& y) {
+  return Fits(w-k, x) ? x : y;
+}
+
+typedef std::pair<size_t/*indent size*/, Doc> best_arg;
+inline RDoc Best(size_t w/*wrap width*/, size_t k/*space used*/,
+  const List<best_arg>& l/*to be rendered*/) {
+  return RDoc(l.Match<RDoc>(
+    []() { return RNil(); },
+    [=](const best_arg& p, const List<best_arg>& z) {
+      return RDoc(p.second.Match<RDoc>(
+        [=]() { return Best(w, k, z); },
+        [=](const Doc& x, const Doc& y) {
+          return Best(
+            w,
+            k,
+            List<best_arg>(best_arg(p.first, x), List<best_arg>(best_arg(p.first, y), z))); },
+        [=](size_t j, const Doc& x) {
+          return Best(w, k, List<best_arg>(best_arg(p.first + j, x), z)); },
+        [=](const std::string& text) { return RText(text, Best(w, k + text.size(), z)); },
+        [=]() { return RLine(p.first, Best(w, p.first, z)); },
+        [=](const Doc& x, const Doc& y) {
+          return Better(
+            w,
+            k,
+            Best(w, k, List<best_arg>(best_arg(p.first, x), z)),
+            Best(w, k, List<best_arg>(best_arg(p.first, y), z))); }));
+    }));
+}
+
+/*! \brief Joins a vector of documents with a given separator document
+ *  \example Join(["a", "b, "c"], ", ") => "a, b, c"
+ *  \param vec the vector of documents
+ *  \param sep the separator between documents
+ */
+inline Doc Join(const std::vector<Doc>& vec, const Doc& sep) {
+  // https://www.safaribooksonline.com/library/view/c-cookbook/0596007612/ch04s09.html
+  Doc output = Nil();
+  for (auto p = vec.begin(); p != vec.end(); ++p) {
+    output = output + *p;
+    if (p != vec.end() - 1) {
+      output = output + sep;
+    }
+  }
+
+  return output;
+}
+
+/*! \brief Creates an indented block.
+ *  \param indent the indentation size
+ *  \param open the opening string
+ *  \param body the body of the block
+ *  \param close the closing string
+ */
+inline Doc Block(size_t indent, const std::string& open,
+  const Doc& body, const std::string& close) {
+  return DocOfStr(open) + Nest(indent, Endl() + body) + Endl() + DocOfStr(close);
+}
+
+/*! \brief Creates a comma-separated sequence with opening and closing strings.
+ *  \param open the opening string
+ *  \param body the body of the Block
+ *  \param close the closing string
+ */
+inline Doc Seq(const std::string& open,
+  const std::vector<Doc>& body, const std::string& close) {
+  return Group(DocOfStr(open) +
+               Nest(open.size(), Join(body, DocOfStr(",") + Endl())) +
+               DocOfStr(close));
+}
+
+//! \brief Either a space or a new line
+inline Doc Sep() {
+  return Choose(DocOfStr(" "), Endl());
+}
+
+/*! \brief Layout a document to a given width
+ *  \param d the document to render
+ *  \param width the line width
+ */
+inline RDoc Layout(const Doc& d, size_t width) {
+  return Best(width, 0, List<best_arg>(best_arg(0, d), List<best_arg>()));
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_IR_DOC_H_
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 241ccc0b85c3..dbbb5b84fc8b 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -190,7 +190,7 @@ TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue *ret) {
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<IfNode>([](const IfNode *node, tvm::IRPrinter *p) {
   p->stream << "IfNode(" << node->cond << ", " << node->true_branch
-            << node->false_branch << ")";
+            << ", " << node->false_branch << ")";
 });
 
 }  // namespace relay
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index 5152690c17e0..a451fbe16984 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -8,6 +8,8 @@
 
 #include <tvm/node/ir_functor.h>
 #include <tvm/relay/expr.h>
+#include <tvm/relay/error.h>
+#include <string>
 
 namespace tvm {
 namespace relay {
@@ -68,7 +70,7 @@ class TypeFunctor<R(const Type& n, Args...)> {
 
   virtual R VisitTypeDefault_(const Node* op, Args...) {
     LOG(FATAL) << "Do not have a default for " << op->type_key();
-    return R();
+    throw;  // unreachable, written to stop compiler warning
   }
 
  private:
diff --git a/tests/python/relay/test_debug_printer.py b/tests/python/relay/test_debug_printer.py
new file mode 100644
index 000000000000..867d9bb3791f
--- /dev/null
+++ b/tests/python/relay/test_debug_printer.py
@@ -0,0 +1,93 @@
+import tvm
+from tvm import relay
+from tvm.relay.expr import debug_print
+from tvm.relay.ir_builder import IRBuilder
+
+ib = IRBuilder()
+
+def show(e):
+    r = debug_print(ib.env, e)
+    assert r is not None
+    # print(r) # uncomment this line to debug
+
+
+def test_constant():
+    arr = tvm.nd.array(10)
+    const = relay.Constant(arr)
+    show(const)
+    # should print the array inside?
+
+
+def test_tuple():
+    fields = tvm.convert([])
+    tup = relay.Tuple(fields)
+    show(tup)
+
+
+def test_local_var():
+    name_hint = 's'
+    lv = relay.Var(name_hint)
+    show(lv)
+
+
+def test_dup_var():
+    lv = relay.Var('s')
+    rv = relay.Var('s')
+    show(relay.Tuple([lv, rv]))
+
+
+def test_large_dup_var():
+    av = relay.Var('s')
+    bv = relay.Var('s')
+    cv = relay.Var('s')
+    show(relay.Tuple([av, bv, cv]))
+
+
+def test_global_var():
+    name_hint = 'g'
+    gv = relay.GlobalVar(name_hint)
+    gv.name_hint == name_hint
+    show(gv)
+
+
+def test_param():
+    lv = relay.Var('x')
+    ty = None
+    param = relay.Param(lv, ty)
+    show(lv)
+
+
+def test_function():
+    param_names = ['a', 'b', 'c', 'd']
+    params = tvm.convert([relay.Param(relay.Var(n), None) for n in param_names])
+    ret_type = None
+    body = params[0].var
+    type_params = tvm.convert([])
+    fn = relay.Function(params, ret_type, body, type_params)
+    show(fn)
+
+
+
+def test_call():
+    op = relay.Var('f')
+    arg_names = ['a', 'b', 'c', 'd']
+    args = tvm.convert([relay.Var(n) for n in arg_names])
+    call = relay.Call(op, args, None, None)
+    show(call)
+
+
+def test_let():
+    lv = relay.Var('x')
+    ty = relay.ty.TensorType((10, 20), "float32")
+    arr = tvm.nd.array(10)
+    value = relay.Constant(arr)
+    let = relay.Let(lv, value, lv, ty)
+    show(let)
+
+
+def test_if():
+    cond = relay.Var('cond')
+    left = relay.Var('left')
+    right = relay.Var('right')
+    ife = relay.If(cond, left, right)
+    show(ife)

From 5383874c5becf7aa064671451f438e4349ae0703 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 1 Oct 2018 20:16:06 -0700
Subject: [PATCH 150/529] [NODE] Keep base node system in HalideIR (#1793)

---
 3rdparty/HalideIR             |   2 +-
 include/tvm/buffer.h          |   2 +-
 include/tvm/ir_functor_ext.h  |   2 +-
 include/tvm/ir_mutator.h      |   2 +-
 include/tvm/ir_visitor.h      |   2 +-
 include/tvm/lowered_func.h    |   2 +-
 include/tvm/node/container.h  | 586 ----------------------------------
 include/tvm/node/ir_functor.h | 254 ---------------
 include/tvm/node/memory.h     |  59 ----
 include/tvm/node/node.h       | 337 -------------------
 include/tvm/tensor.h          |   2 +-
 src/lang/node.cc              |  58 ----
 12 files changed, 7 insertions(+), 1301 deletions(-)
 delete mode 100644 include/tvm/node/container.h
 delete mode 100644 include/tvm/node/ir_functor.h
 delete mode 100644 include/tvm/node/memory.h
 delete mode 100644 include/tvm/node/node.h
 delete mode 100644 src/lang/node.cc

diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index cf6090aeaeb7..2f3ecdfdedf3 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit cf6090aeaeb782d1daff54b0ca5c2c281d7008db
+Subproject commit 2f3ecdfdedf3efa7e45a3945dca63a25856c4674
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index cda76cd140c5..2c72db169a2d 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -11,7 +11,7 @@
 #include "base.h"
 #include "expr.h"
 #include "ir_operator.h"
-#include "node/container.h"
+#include "tvm/node/container.h"
 
 namespace tvm {
 
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 85d2de75dd99..43868114307d 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -6,7 +6,7 @@
 #ifndef TVM_IR_FUNCTOR_EXT_H_
 #define TVM_IR_FUNCTOR_EXT_H_
 
-#include "node/ir_functor.h"
+#include "tvm/node/ir_functor.h"
 #include "ir.h"
 
 namespace tvm {
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index 6b391caf4b5f..6cc80d55352b 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -9,7 +9,7 @@
 #include <unordered_map>
 #include "expr.h"
 #include "ir.h"
-#include "node/ir_functor.h"
+#include "tvm/node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 265ec0e56efb..755f15078ce2 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -7,7 +7,7 @@
 #define TVM_IR_VISITOR_H_
 
 #include "ir.h"
-#include "node/ir_functor.h"
+#include "tvm/node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 8bd2b1ba84cf..5cb59fd47712 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -13,7 +13,7 @@
 #include "base.h"
 #include "expr.h"
 #include "tensor.h"
-#include "node/container.h"
+#include "tvm/node/container.h"
 
 namespace tvm {
 
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
deleted file mode 100644
index 43adae27671c..000000000000
--- a/include/tvm/node/container.h
+++ /dev/null
@@ -1,586 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/node/container.h
- * \brief Array/Map container in the DSL graph.
- */
-#ifndef TVM_NODE_CONTAINER_H_
-#define TVM_NODE_CONTAINER_H_
-
-#include <type_traits>
-#include <vector>
-#include <initializer_list>
-#include <unordered_map>
-#include <utility>
-#include <string>
-#include "node.h"
-#include "memory.h"
-
-namespace tvm {
-
-/*! \brief array node content in array */
-class ArrayNode : public Node {
- public:
-  /*! \brief the data content */
-  std::vector<NodePtr<Node> > data;
-
-  void VisitAttrs(AttrVisitor* visitor) final {
-     // Visitor to array have no effect.
-  }
-
-  static constexpr const char* _type_key = "Array";
-  TVM_DECLARE_NODE_TYPE_INFO(ArrayNode, Node);
-};
-
-/*! \brief map node content */
-class MapNode : public Node {
- public:
-  void VisitAttrs(AttrVisitor* visitor) final {
-     // Visitor to map have no effect.
-  }
-  // hash function
-  struct Hash {
-    size_t operator()(const NodePtr<Node>& n) const {
-      return std::hash<Node*>()(n.get());
-    }
-  };
-  // comparator
-  struct Equal {
-    bool operator()(
-        const NodePtr<Node>& a,
-        const NodePtr<Node>& b) const {
-      return a.get() == b.get();
-    }
-  };
-
-  /*! \brief The corresponding conatiner type */
-  using ContainerType = std::unordered_map<
-   NodePtr<Node>,
-   NodePtr<Node>,
-   Hash, Equal>;
-
-  /*! \brief the data content */
-  ContainerType data;
-
-  static constexpr const char* _type_key = "Map";
-  TVM_DECLARE_NODE_TYPE_INFO(MapNode, Node);
-};
-
-
-/*! \brief specialized map node with string as key */
-class StrMapNode : public Node {
- public:
-  void VisitAttrs(AttrVisitor* visitor) final {
-     // Visitor to map have no effect.
-  }
-  /*! \brief The corresponding conatiner type */
-  using ContainerType = std::unordered_map<
-    std::string,
-    NodePtr<Node> >;
-
-  /*! \brief the data content */
-  ContainerType data;
-
-  static constexpr const char* _type_key = "StrMap";
-  TVM_DECLARE_NODE_TYPE_INFO(StrMapNode, Node);
-};
-
-/*!
- * \brief iterator adapter that adapts TIter to return another type.
- * \tparam Converter a struct that contains converting function
- * \tparam TIter the content iterator type.
- */
-template<typename Converter,
-         typename TIter>
-class IterAdapter {
- public:
-  explicit IterAdapter(TIter iter) : iter_(iter) {}
-  inline IterAdapter& operator++() {  // NOLINT(*)
-    ++iter_;
-    return *this;
-  }
-  inline IterAdapter& operator++(int) {  // NOLINT(*)
-    ++iter_;
-    return *this;
-  }
-  inline IterAdapter operator+(int offset) const {  // NOLINT(*)
-    return IterAdapter(iter_ + offset);
-  }
-  inline bool operator==(IterAdapter other) const {
-    return iter_ == other.iter_;
-  }
-  inline bool operator!=(IterAdapter other) const {
-    return !(*this == other);
-  }
-  inline const typename Converter::ResultType operator*() const {
-    return Converter::convert(*iter_);
-  }
-
- private:
-  TIter iter_;
-};
-
-/*!
- * \brief Array container of NodeRef in DSL graph.
- *  Array implements copy on write semantics, which means array is mutable
- *  but copy will happen when array is referenced in more than two places.
- *
- * operator[] only provide const acces, use Set to mutate the content.
- * \tparam T The content NodeRef type.
- */
-template<typename T,
-         typename = typename std::enable_if<std::is_base_of<NodeRef, T>::value>::type >
-class Array : public NodeRef {
- public:
-  /*!
-   * \brief default constructor
-   */
-  Array() {
-    node_ = make_node<ArrayNode>();
-  }
-  /*!
-   * \brief move constructor
-   * \param other source
-   */
-  Array(Array<T> && other) {  // NOLINT(*)
-    node_ = std::move(other.node_);
-  }
-  /*!
-   * \brief copy constructor
-   * \param other source
-   */
-  Array(const Array<T> &other) { // NOLINT(*)
-    node_ = other.node_;
-  }
-  /*!
-   * \brief constructor from pointer
-   * \param n the container pointer
-   */
-  explicit Array(NodePtr<Node> n) : NodeRef(n) {}
-  /*!
-   * \brief constructor from iterator
-   * \param begin begin of iterator
-   * \param end end of iterator
-   * \tparam IterType The type of iterator
-   */
-  template<typename IterType>
-  Array(IterType begin, IterType end) {
-    assign(begin, end);
-  }
-  /*!
-   * \brief constructor from initializer list
-   * \param init The initalizer list
-   */
-  Array(std::initializer_list<T> init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-  /*!
-   * \brief constructor from vector
-   * \param init The vector
-   */
-  Array(const std::vector<T>& init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-  /*!
-   * \brief move assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Array<T>& operator=(Array<T> && other) {
-    node_ = std::move(other.node_);
-    return *this;
-  }
-  /*!
-   * \brief copy assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Array<T>& operator=(const Array<T> & other) {
-    node_ = other.node_;
-    return *this;
-  }
-  /*!
-   * \brief reset the array to content from iterator.
-   * \param begin begin of iterator
-   * \param end end of iterator
-   * \tparam IterType The type of iterator
-   */
-  template<typename IterType>
-  void assign(IterType begin, IterType end) {
-    auto n = make_node<ArrayNode>();
-    for (IterType it = begin; it != end; ++it) {
-      n->data.push_back((*it).node_);
-    }
-    node_ = std::move(n);
-  }
-  /*!
-   * \brief Read i-th element from array.
-   * \param i The index
-   * \return the i-th element.
-   */
-  inline const T operator[](size_t i) const {
-    return T(static_cast<const ArrayNode*>(node_.get())->data[i]);
-  }
-  /*! \return The size of the array */
-  inline size_t size() const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const ArrayNode*>(node_.get())->data.size();
-  }
-  /*!
-   * \brief copy on write semantics
-   *  Do nothing if current handle is the unique copy of the array.
-   *  Otherwise make a new copy of the array to ensure the current handle
-   *  hold a unique copy.
-   *
-   * \return Handle to the internal node container(which ganrantees to be unique)
-   */
-  inline ArrayNode* CopyOnWrite() {
-    if (node_.get() == nullptr || !node_.unique())  {
-      NodePtr<ArrayNode> n = make_node<ArrayNode>();
-      n->data = static_cast<ArrayNode*>(node_.get())->data;
-      NodePtr<Node>(std::move(n)).swap(node_);
-    }
-    return static_cast<ArrayNode*>(node_.get());
-  }
-  /*!
-   * \brief push a new item to the back of the list
-   * \param item The item to be pushed.
-   */
-  inline void push_back(const T& item) {
-    ArrayNode* n = this->CopyOnWrite();
-    n->data.push_back(item.node_);
-  }
-  /*!
-   * \brief set i-th element of the array.
-   * \param i The index
-   * \param value The value to be setted.
-   */
-  inline void Set(size_t i, const T& value) {
-    ArrayNode* n = this->CopyOnWrite();
-    n->data[i] = value.node_;
-  }
-  /*! \return whether array is empty */
-  inline bool empty() const {
-    return size() == 0;
-  }
-  /*! \brief specify container node */
-  using ContainerType = ArrayNode;
-
-  struct Ptr2NodeRef {
-    using ResultType = T;
-    static inline T convert(const NodePtr<Node>& n) {
-      return T(n);
-    }
-  };
-  using iterator = IterAdapter<Ptr2NodeRef,
-                               std::vector<NodePtr<Node> >::const_iterator>;
-
-  using reverse_iterator = IterAdapter<
-    Ptr2NodeRef,
-    std::vector<NodePtr<Node> >::const_reverse_iterator>;
-
-  /*! \return begin iterator */
-  inline iterator begin() const {
-    return iterator(static_cast<const ArrayNode*>(node_.get())->data.begin());
-  }
-  /*! \return end iterator */
-  inline iterator end() const {
-    return iterator(static_cast<const ArrayNode*>(node_.get())->data.end());
-  }
-  /*! \return rbegin iterator */
-  inline reverse_iterator rbegin() const {
-    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rbegin());
-  }
-  /*! \return rend iterator */
-  inline reverse_iterator rend() const {
-    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rend());
-  }
-};
-
-/*!
- * \brief Map container of NodeRef->NodeRef in DSL graph.
- *  Map implements copy on write semantics, which means map is mutable
- *  but copy will happen when array is referenced in more than two places.
- *
- * operator[] only provide const acces, use Set to mutate the content.
- * \tparam K The key NodeRef type.
- * \tparam V The value NodeRef type.
- */
-template<typename K,
-         typename V,
-         typename = typename std::enable_if<
-           std::is_base_of<NodeRef, K>::value ||
-           std::is_base_of<std::string, K>::value >::type,
-         typename = typename std::enable_if<std::is_base_of<NodeRef, V>::value>::type>
-class Map : public NodeRef {
- public:
-  /*!
-   * \brief default constructor
-   */
-  Map() {
-    node_ = make_node<MapNode>();
-  }
-  /*!
-   * \brief move constructor
-   * \param other source
-   */
-  Map(Map<K, V> && other) {  // NOLINT(*)
-    node_ = std::move(other.node_);
-  }
-  /*!
-   * \brief copy constructor
-   * \param other source
-   */
-  Map(const Map<K, V> &other) { // NOLINT(*)
-    node_ = other.node_;
-  }
-  /*!
-   * \brief constructor from pointer
-   * \param n the container pointer
-   */
-  explicit Map(NodePtr<Node> n) : NodeRef(n) {}
-  /*!
-   * \brief constructor from iterator
-   * \param begin begin of iterator
-   * \param end end of iterator
-   * \tparam IterType The type of iterator
-   */
-  template<typename IterType>
-  Map(IterType begin, IterType end) {
-    assign(begin, end);
-  }
-  /*!
-   * \brief constructor from initializer list
-   * \param init The initalizer list
-   */
-  Map(std::initializer_list<std::pair<K, V> > init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-  /*!
-   * \brief constructor from vector
-   * \param init The vector
-   */
-  template<typename Hash, typename Equal>
-  Map(const std::unordered_map<K, V, Hash, Equal>& init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-  /*!
-   * \brief move assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Map<K, V>& operator=(Map<K, V> && other) {
-    node_ = std::move(other.node_);
-    return *this;
-  }
-  /*!
-   * \brief copy assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Map<K, V>& operator=(const Map<K, V> & other) {
-    node_ = other.node_;
-    return *this;
-  }
-  /*!
-   * \brief reset the array to content from iterator.
-   * \param begin begin of iterator
-   * \param end end of iterator
-   * \tparam IterType The type of iterator
-   */
-  template<typename IterType>
-  void assign(IterType begin, IterType end) {
-    NodePtr<MapNode> n = make_node<MapNode>();
-    for (IterType i = begin; i != end; ++i) {
-      n->data.emplace(std::make_pair(i->first.node_,
-                                     i->second.node_));
-    }
-    node_ = std::move(n);
-  }
-  /*!
-   * \brief Read element from map.
-   * \param key The key
-   * \return the corresonding element.
-   */
-  inline const V operator[](const K& key) const {
-    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
-  }
-  /*!
-   * \brief Read element from map.
-   * \param key The key
-   * \return the corresonding element.
-   */
-  inline const V at(const K& key) const {
-    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
-  }
-  /*! \return The size of the array */
-  inline size_t size() const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const MapNode*>(node_.get())->data.size();
-  }
-  /*! \return The size of the array */
-  inline size_t count(const K& key) const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const MapNode*>(node_.get())->data.count(key.node_);
-  }
-  /*!
-   * \brief copy on write semantics
-   *  Do nothing if current handle is the unique copy of the array.
-   *  Otherwise make a new copy of the array to ensure the current handle
-   *  hold a unique copy.
-   *
-   * \return Handle to the internal node container(which ganrantees to be unique)
-   */
-  inline MapNode* CopyOnWrite() {
-    if (node_.get() == nullptr || !node_.unique())  {
-      NodePtr<MapNode> n = make_node<MapNode>();
-      n->data = static_cast<const MapNode*>(node_.get())->data;
-      NodePtr<Node>(std::move(n)).swap(node_);
-    }
-    return static_cast<MapNode*>(node_.get());
-  }
-  /*!
-   * \brief set the Map.
-   * \param key The index key.
-   * \param value The value to be setted.
-   */
-  inline void Set(const K& key, const V& value) {
-    MapNode* n = this->CopyOnWrite();
-    n->data[key.node_] = value.node_;
-  }
-
-  /*! \return whether array is empty */
-  inline bool empty() const {
-    return size() == 0;
-  }
-  /*! \brief specify container node */
-  using ContainerType = MapNode;
-
-  struct Ptr2NodeRef {
-    using ResultType = std::pair<K, V>;
-    static inline ResultType convert(const std::pair<
-                            NodePtr<Node>,
-                            NodePtr<Node> >& n) {
-      return std::make_pair(K(n.first), V(n.second));
-    }
-  };
-
-  using iterator = IterAdapter<
-    Ptr2NodeRef, MapNode::ContainerType::const_iterator>;
-
-  /*! \return begin iterator */
-  inline iterator begin() const {
-    return iterator(static_cast<const MapNode*>(node_.get())->data.begin());
-  }
-  /*! \return end iterator */
-  inline iterator end() const {
-    return iterator(static_cast<const MapNode*>(node_.get())->data.end());
-  }
-  /*! \return begin iterator */
-  inline iterator find(const K& key) const {
-    return iterator(static_cast<const MapNode*>(node_.get())->data.find(key.node_));
-  }
-};
-
-// specialize of string map
-template<typename V, typename T1, typename T2>
-class Map<std::string, V, T1, T2> : public NodeRef {
- public:
-  // for code reuse
-  Map() {
-    node_ = make_node<StrMapNode>();
-  }
-  Map(Map<std::string, V> && other) {  // NOLINT(*)
-    node_ = std::move(other.node_);
-  }
-  Map(const Map<std::string, V> &other) { // NOLINT(*)
-    node_ = other.node_;
-  }
-  explicit Map(NodePtr<Node> n) : NodeRef(n) {}
-  template<typename IterType>
-  Map(IterType begin, IterType end) {
-    assign(begin, end);
-  }
-  Map(std::initializer_list<std::pair<std::string, V> > init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-
-  template<typename Hash, typename Equal>
-  Map(const std::unordered_map<std::string, V, Hash, Equal>& init) { // NOLINT(*)
-    assign(init.begin(), init.end());
-  }
-  Map<std::string, V>& operator=(Map<std::string, V> && other) {
-    node_ = std::move(other.node_);
-    return *this;
-  }
-  Map<std::string, V>& operator=(const Map<std::string, V> & other) {
-    node_ = other.node_;
-    return *this;
-  }
-  template<typename IterType>
-  void assign(IterType begin, IterType end) {
-    auto n = make_node<StrMapNode>();
-    for (IterType i = begin; i != end; ++i) {
-      n->data.emplace(std::make_pair(i->first,
-                                     i->second.node_));
-    }
-    node_ = std::move(n);
-  }
-  inline const V operator[](const std::string& key) const {
-    return V(static_cast<const StrMapNode*>(node_.get())->data.at(key));
-  }
-  inline const V at(const std::string& key) const {
-    return V(static_cast<const StrMapNode*>(node_.get())->data.at(key));
-  }
-  inline size_t size() const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const StrMapNode*>(node_.get())->data.size();
-  }
-  inline size_t count(const std::string& key) const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const StrMapNode*>(node_.get())->data.count(key);
-  }
-  inline StrMapNode* CopyOnWrite() {
-    if (node_.get() == nullptr || !node_.unique())  {
-      NodePtr<StrMapNode> n = make_node<StrMapNode>();
-      n->data = static_cast<const StrMapNode*>(node_.get())->data;
-      NodePtr<Node>(std::move(n)).swap(node_);
-    }
-    return static_cast<StrMapNode*>(node_.get());
-  }
-  inline void Set(const std::string& key, const V& value) {
-    StrMapNode* n = this->CopyOnWrite();
-    n->data[key] = value.node_;
-  }
-  inline bool empty() const {
-    return size() == 0;
-  }
-  using ContainerType = StrMapNode;
-
-  struct Ptr2NodeRef {
-    using ResultType = std::pair<std::string, V>;
-    static inline ResultType convert(const std::pair<
-                            std::string,
-                            NodePtr<Node> >& n) {
-      return std::make_pair(n.first, V(n.second));
-    }
-  };
-
-  using iterator = IterAdapter<
-    Ptr2NodeRef, StrMapNode::ContainerType::const_iterator>;
-
-  /*! \return begin iterator */
-  inline iterator begin() const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.begin());
-  }
-  /*! \return end iterator */
-  inline iterator end() const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.end());
-  }
-  /*! \return begin iterator */
-  inline iterator find(const std::string& key) const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.find(key));
-  }
-};
-
-}  // namespace tvm
-#endif  // TVM_NODE_CONTAINER_H_
diff --git a/include/tvm/node/ir_functor.h b/include/tvm/node/ir_functor.h
deleted file mode 100644
index 293bec75bbf5..000000000000
--- a/include/tvm/node/ir_functor.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/node/ir_functor.h
- * \brief Defines the IRFunctor data structures.
- */
-#ifndef TVM_NODE_IR_FUNCTOR_H_
-#define TVM_NODE_IR_FUNCTOR_H_
-
-#include <dmlc/logging.h>
-#include <string>
-#include <vector>
-#include <type_traits>
-#include <functional>
-#include "node.h"
-#include "../runtime/registry.h"
-
-namespace tvm {
-/*!
- * \brief A dynamical dispatched functor on NodeRef in the first argument.
- *
- * \code
- *   IRFunctor<std::string (const NodeRef& n, std::string prefix)> tostr;
- *   tostr.set_dispatch<Add>([](const Add* op, std::string prefix) {
- *     return prefix + "Add";
- *   });
- *   tostr.set_dispatch<IntImm>([](const IntImm* op) {
- *     return prefix + "IntImm"
- *   });
- *
- *   Expr x = make_const(1);
- *   Expr y = x + x;
- *   // dispatch to IntImm, outputs "MyIntImm"
- *   LOG(INFO) << tostr(x, "My");
- *   // dispatch to IntImm, outputs "MyAdd"
- *   LOG(INFO) << tostr(y, "My");
- * \endcode
- *
- * \tparam FType function signiture
- *  This type if only defined for FType with function signiture
- */
-template<typename FType>
-class IRFunctor;
-
-template<typename R, typename ...Args>
-class IRFunctor<R(const NodeRef& n, Args...)> {
- private:
-  using Function = std::function<R (const NodeRef&n, Args...)>;
-  using TSelf = IRFunctor<R (const NodeRef& n, Args...)>;
-  /*! \brief internal function table */
-  std::vector<Function> func_;
-
- public:
-  /*! \brief the result type of this functor */
-  using result_type = R;
-  /*!
-   * \brief Whether the functor can dispatch the corresponding Node
-   * \param n The node to be dispatched
-   * \return Whether dispatching function is registered for n's type.
-   */
-  inline bool can_dispatch(const NodeRef& n) const {
-    uint32_t type_index = n.type_index();
-    return type_index < func_.size() && func_[type_index] != nullptr;
-  }
-  /*!
-   * \brief invoke the functor , dispatch on type of n
-   * \param n The Node argument
-   * \param args The additional arguments
-   * \return The result.
-   */
-  inline R operator()(const NodeRef& n, Args... args) const {
-    uint32_t type_index = n.type_index();
-    CHECK(type_index < func_.size() &&
-          func_[type_index] != nullptr)
-        << "IRFunctor calls un-registered function on type "
-        << Node::TypeIndex2Key(type_index);
-    return func_[type_index](n, std::forward<Args>(args)...);
-  }
-  /*!
-   * \brief set the dispacher for type TNode
-   * \param f The function to be set.
-   * \tparam TNode the type of Node to be dispatched.
-   * \return reference to self.
-   */
-  template<typename TNode>
-  inline TSelf& set_dispatch(Function f) {  // NOLINT(*)
-    uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
-    if (func_.size() <= tindex) {
-      func_.resize(tindex + 1, nullptr);
-    }
-    CHECK(func_[tindex] == nullptr)
-        << "Dispatch for " << Node::TypeIndex2Key(tindex)
-        << " is already set";
-    func_[tindex] = f;
-    return *this;
-  }
-  /*!
-   * \brief set the dispacher for type TNode
-   *  This allows f to used detailed const Node pointer to replace NodeRef
-   *
-   * \param f The function to be set.
-   * \tparam TNode the type of Node to be dispatched.
-   * \return reference to self.
-   */
-  template<typename TNode>
-  inline TSelf& set_dispatch(std::function<R(const TNode* n, Args...)> f) { // NOLINT(*)
-    Function fun = [f](const NodeRef& n, Args... args) {
-      return f(static_cast<const TNode*>(n.node_.get()),
-               std::forward<Args>(args)...);
-    };
-    return this->set_dispatch<TNode>(fun);
-  }
-  /*!
-  * \brief unset the dispacher for type TNode
-  *
-  * \tparam TNode the type of Node to be dispatched.
-  * \return reference to self.
-  */
-  template<typename TNode>
-  inline TSelf& clear_dispatch() {  // NOLINT(*)
-    uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
-    CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range";
-    func_[tindex] = nullptr;
-    return *this;
-  }
-};
-
-#define TVM_REGISTER_VAR_DEF(ClsName)                                 \
-  static TVM_ATTRIBUTE_UNUSED auto & __make_functor ## _ ## ClsName
-
-/*!
- * \brief Useful macro to set IRFunctor dispatch in a global static field.
- *
- * \code
- *  // Use IRFunctor to implement IRPrinter similar to Visitor Pattern.
- *  // vtable allows easy patch in of new Node types, without changing
- *  // interface of IRPrinter.
- *
- *  class IRPrinter {
- *   public:
- *    std::ostream& stream;
- *    // the dispatch function.
- *    void print(Expr e) {
- *      const static FType& f = *vtable();
- *      f(e, this);
- *    }
- *
- *    using FType = IRFunctor<void (const NodeRef&, IRPrinter *)>;
- *    // function to return global function table
- *    static FType& vtable();
- *  };
- *
- *  // in cpp/cc file
- *  IRPrinter::FType& IRPrinter::vtable() { // NOLINT(*0
- *    static FType inst; return inst;
- *  }
- *
- *  TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
- *  .set_dispatch<Add>([](const Add* n, IRPrinter* p) {
- *    p->print(n->a);
- *    p->stream << '+'
- *    p->print(n->b);
- *  });
- *
- *
- * \endcode
- *
- * \param ClsName The name of the class
- * \param FField The static function that returns a singleton of IRFunctor.
- */
-#define TVM_STATIC_IR_FUNCTOR(ClsName, FField)                       \
-  TVM_STR_CONCAT(TVM_REGISTER_VAR_DEF(ClsName), __COUNTER__)  =      \
-                              ClsName::FField()
-
- /*!
- * \brief A container for a list of callbacks. All callbacks are invoked when
- * the object is destructed.
- */
-class IRFunctorCleanList {
- public:
-  ~IRFunctorCleanList() {
-    for (auto &f : clean_items) {
-      f();
-    }
-  }
-
-  void append(std::function<void()> func) {
-    clean_items.push_back(func);
-  }
-
- private:
-  std::vector< std::function<void()> > clean_items;
-};
-
-/*!
-* \brief A wrapper around IRFunctor that will record calls to set_dispatch
-* and make a corresponding call to clear_dispatch when the last copy of
-* the IRFunctorStaticRegistry is destructed. When assigned to a static variable,
-* this can be used by NNVM and other libraries to unregister callbacks when
-* the library is unloaded. This prevents crashes when the underlying IRFunctor
-* is destructed as it will no longer contain std::function instances allocated
-* by a library that has been unloaded.
-*/
-template<typename FType>
-class IRFunctorStaticRegistry;
-
-template<typename R, typename ...Args>
-class IRFunctorStaticRegistry<R(const NodeRef& n, Args...)> {
- private:
-  IRFunctor<R(const NodeRef& n, Args...)> *irf_;
-  std::shared_ptr<IRFunctorCleanList> free_list;
-
-  using TSelf = IRFunctorStaticRegistry<R(const NodeRef& n, Args...)>;
-
- public:
-  IRFunctorStaticRegistry(IRFunctor<R(const NodeRef& n, Args...)> *irf) {
-    irf_ = irf;
-    free_list = std::make_shared<IRFunctorCleanList>();
-  }
-
-  template<typename TNode>
-  inline TSelf& set_dispatch(std::function<R(const TNode* n, Args...)> f) {  // NOLINT(*)
-    irf_->template set_dispatch<TNode>(f);
-    auto irf_copy = irf_;
-    free_list.get()->append([irf_copy] {
-      irf_copy->template clear_dispatch<TNode>();
-      });
-    return *this;
-  }
-};
-
-/*!
-* \brief Helper function for constructing an IRFunctorStaticRegistry. This allows
-* the compiler to deduce the template types.
-*/
-template<typename R, typename ...Args>
-IRFunctorStaticRegistry<R(const NodeRef& n, Args...)> MakeIRFunctorStaticRegistry(
-  IRFunctor<R(const NodeRef& n, Args...)> *irf) {
-  return IRFunctorStaticRegistry<R(const NodeRef& n, Args...)>(irf);
-}
-
-#define TVM_AUTO_REGISTER_VAR_DEF(ClsName)                           \
-  static TVM_ATTRIBUTE_UNUSED auto __make_functor ## _ ## ClsName
-
-/*!
-* \brief Macro to set IRFunctor dispatch in a global static field using an IRFunctorStaticRegistry.
-* Usage is exactly the same as TVM_STATIC_IR_FUNCTOR. Libraries should use this instead of
-* TVM_STATIC_IR_FUNCTOR.
-*/
-#define TVM_STATIC_IR_FUNCTOR_REGISTER(ClsName, FField)                  \
-  TVM_STR_CONCAT(TVM_AUTO_REGISTER_VAR_DEF(ClsName), __COUNTER__)  = \
-                        MakeIRFunctorStaticRegistry(&ClsName::FField())
-
-}  // namespace tvm
-#endif  // TVM_NODE_IR_FUNCTOR_H_
diff --git a/include/tvm/node/memory.h b/include/tvm/node/memory.h
deleted file mode 100644
index c0f791eb597b..000000000000
--- a/include/tvm/node/memory.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/node/memory.h
- * \brief Node memory management.
- */
-#ifndef TVM_NODE_MEMORY_H_
-#define TVM_NODE_MEMORY_H_
-
-#include "node.h"
-
-namespace tvm {
-/*!
- * \brief Allocate a node object.
- * \param args arguments to the constructor.
- * \tparam T the node type.
- * \return The NodePtr to the allocated object.
- */
-template<typename T, typename... Args>
-inline NodePtr<T> make_node(Args&&... args);
-
-// Detail implementations after this
-//
-// The current design allows swapping the
-// allocator pattern when necessary.
-//
-// Possible future allocator optimizations:
-// - Arena allocator that gives ownership of memory to arena (deleter_= nullptr)
-// - Thread-local object pools: one pool per size and alignment requirement.
-// - Can specialize by type of object to give the specific allocator to each object.
-//
-template<typename T>
-class SimpleNodeAllocator {
- public:
-  template<typename... Args>
-  static T* New(Args&&... args) {
-    return new T(std::forward<Args>(args)...);
-  }
-  static NodeBase::FDeleter Deleter() {
-    return Deleter_;
-  }
-
- private:
-  static void Deleter_(NodeBase* ptr) {
-    delete static_cast<T*>(ptr);
-  }
-};
-
-template<typename T, typename... Args>
-inline NodePtr<T> make_node(Args&&... args) {
-  using Allocator = SimpleNodeAllocator<T>;
-  static_assert(std::is_base_of<NodeBase, T>::value,
-                "make_node can only be used to create NodeBase");
-  T* node = Allocator::New(std::forward<Args>(args)...);
-  node->deleter_ = Allocator::Deleter();
-  return NodePtr<T>(node);
-}
-
-}  // namespace tvm
-#endif  // TVM_NODE_MEMORY_H_
diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h
deleted file mode 100644
index efa930568c48..000000000000
--- a/include/tvm/node/node.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/node/node.h
- * \brief Node system data structure.
- */
-#ifndef TVM_NODE_NODE_H_
-#define TVM_NODE_NODE_H_
-
-#include <string>
-#include <vector>
-#include <type_traits>
-#include "base/Type.h"
-#include "../runtime/node_base.h"
-#include "../runtime/c_runtime_api.h"
-
-namespace tvm {
-using HalideIR::Type;
-// forward declaration
-class Node;
-class NodeRef;
-
-namespace runtime {
-// forward declaration
-class NDArray;
-}  // namespace runtime
-
-/*!
- * \brief Visitor class to each node content.
- *  The content is going to be called for each field.
- */
-class TVM_DLL AttrVisitor {
- public:
-//! \cond Doxygen_Suppress
-  virtual void Visit(const char* key, double* value) = 0;
-  virtual void Visit(const char* key, int64_t* value) = 0;
-  virtual void Visit(const char* key, uint64_t* value) = 0;
-  virtual void Visit(const char* key, int* value) = 0;
-  virtual void Visit(const char* key, bool* value) = 0;
-  virtual void Visit(const char* key, std::string* value) = 0;
-  virtual void Visit(const char* key, void** value) = 0;
-  virtual void Visit(const char* key, Type* value) = 0;
-  virtual void Visit(const char* key, NodeRef* value) = 0;
-  virtual void Visit(const char* key, runtime::NDArray* value) = 0;
-  template<typename ENum,
-           typename = typename std::enable_if<std::is_enum<ENum>::value>::type>
-  void Visit(const char* key, ENum* ptr) {
-    static_assert(std::is_same<int, typename std::underlying_type<ENum>::type>::value,
-                  "declare enum to be enum int to use visitor");
-    this->Visit(key, reinterpret_cast<int*>(ptr));
-  }
-//! \endcond
-};
-
-/*!
- * \brief base class of node container in DSL AST.
- *  All object's internal is stored as std::shared_ptr<Node>
- */
-class TVM_DLL Node : public NodeBase {
- public:
-  /*! \brief virtual destructor */
-  virtual ~Node() {}
-  /*! \return The unique type key of the node */
-  virtual const char* type_key() const = 0;
-  /*!
-   * \brief Apply visitor to each field of the Node
-   *  Visitor could mutate the content of the node.
-   *  override if Node contains attribute fields.
-   * \param visitor The visitor
-   */
-  virtual void VisitAttrs(AttrVisitor* visitor) {}
-  /*! \return the type index of the node */
-  virtual const uint32_t type_index() const = 0;
-  /*!
-   * \brief Whether this node derives from node with type_index=tid.
-   *  Implemented by TVM_DECLARE_NODE_TYPE_INFO
-   *
-   * \param tid The type index.
-   * \return the check result.
-   */
-  virtual const bool _DerivedFrom(uint32_t tid) const;
-  /*!
-   * \brief get a runtime unique type index given a type key
-   * \param type_key Type key of a type.
-   * \return the corresponding type index.
-   */
-  static uint32_t TypeKey2Index(const char* type_key);
-  /*!
-   * \brief get type key from type index.
-   * \param index The type index
-   * \return the corresponding type key.
-   */
-  static const char* TypeIndex2Key(uint32_t index);
-  /*!
-   * \return whether the type is derived from
-   */
-  template<typename T>
-  inline bool derived_from() const;
-  /*!
-   * \return whether the node is of type T
-   * \tparam The type to be checked.
-   */
-  template<typename T>
-  inline bool is_type() const;
-  /*!
-   * \brief Get a NodePtr that holds reference to this Node.
-   * \return the NodePtr
-   */
-  inline NodePtr<Node> GetNodePtr() const;
-  // node ref can see this
-  friend class NodeRef;
-  static constexpr const char* _type_key = "Node";
-};
-
-/*! \brief Base class of all node reference object */
-class NodeRef {
- public:
-  /*! \brief type indicate the container type */
-  using ContainerType = Node;
-  /*!
-   * \brief Comparator
-   * \param other Another node ref.
-   * \return the compare result.
-   */
-  inline bool operator==(const NodeRef& other) const;
-  /*!
-   * \brief Comparator
-   * \param other Another node ref.
-   * \return the compare result.
-   */
-  inline bool same_as(const NodeRef& other) const;
-  /*!
-   * \brief Comparator
-   * \param other Another node ref.
-   * \return the compare result.
-   */
-  inline bool operator<(const NodeRef& other) const;
-  /*!
-   * \brief Comparator
-   * \param other Another node ref.
-   * \return the compare result.
-   */
-  inline bool operator!=(const NodeRef& other) const;
-  /*! \return the hash function for NodeRef */
-  inline size_t hash() const;
-  /*! \return whether the expression is null */
-  inline bool defined() const;
-  /*! \return the internal type index of IRNode */
-  inline uint32_t type_index() const;
-  /*! \return the internal node pointer */
-  inline const Node* get() const;
-  /*! \return the internal node pointer */
-  inline const Node* operator->() const;
-  /*!
-   * \brief Downcast this ir node to its actual type (e.g. Add, or
-   * Select). This returns nullptr if the node is not of the requested
-   * type. Example usage:
-   *
-   * if (const Add *add = node->as<Add>()) {
-   *   // This is an add node
-   * }
-   * \tparam T the target type, must be subtype of IRNode
-   */
-  template<typename T>
-  inline const T *as() const;
-  /*!
-   * \brief A more powerful version of as that also works with
-   *  intermediate base types.
-   * \tparam T the target type, must be subtype of IRNode
-   */
-  template<typename T>
-  inline const T *as_derived() const;
-  /*! \brief default constructor */
-  NodeRef() = default;
-  explicit NodeRef(NodePtr<Node> node) : node_(node) {}
-  /*! \brief the internal node object, do not touch  */
-  NodePtr<Node> node_;
-};
-
-/*!
- * \brief Get a reference type from a Node ptr type
- *
- *  It is always important to get a reference type
- *  if we want to return a value as reference or keep
- *  the node alive beyond the scope of the function.
- *
- * \param ptr The node pointer
- * \tparam RefType The reference type
- * \tparam NodeType The node type
- * \return The corresponding RefType
- */
-template <typename RefType, typename NodeType>
-inline RefType GetRef(const NodeType* ptr);
-
-/*!
- * \brief Downcast a base reference type to a more specific type.
- *
- * \param ref The inptut reference
- * \return The corresponding SubRef.
- * \tparam SubRef The target specific reference type.
- * \tparam BaseRef the current reference type.
- */
-template <typename SubRef, typename BaseRef>
-inline SubRef Downcast(BaseRef ref);
-
-/*!
- * \brief helper macro to declare type information in a base node.
- */
-#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent)                    \
-  const bool _DerivedFrom(uint32_t tid) const override {                \
-    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
-    if (tidx == tid) return true;                                       \
-    return Parent::_DerivedFrom(tid);                                   \
-  }
-
-/*!
- * \brief helper macro to declare type information in a terminal node
- */
-#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent)                    \
-  const char* type_key() const final {                                  \
-    return TypeName::_type_key;                                         \
-  }                                                                     \
-  const uint32_t type_index() const final {                             \
-    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
-    return tidx;                                                        \
-  }                                                                     \
-  const bool _DerivedFrom(uint32_t tid) const final {                   \
-    static uint32_t tidx = TypeKey2Index(TypeName::_type_key);          \
-    if (tidx == tid) return true;                                       \
-    return Parent::_DerivedFrom(tid);                                   \
-  }
-
-// implementations of inline functions after this
-template<typename T>
-inline bool Node::is_type() const {
-  // use static field so query only happens once.
-  static uint32_t type_id = Node::TypeKey2Index(T::_type_key);
-  return type_id == this->type_index();
-}
-
-template<typename T>
-inline bool Node::derived_from() const {
-  // use static field so query only happens once.
-  static uint32_t type_id = Node::TypeKey2Index(T::_type_key);
-  return this->_DerivedFrom(type_id);
-}
-
-inline NodePtr<Node> Node::GetNodePtr() const {
-  return NodePtr<Node>(const_cast<Node*>(this));
-}
-
-template <typename RefType, typename NodeType>
-inline RefType GetRef(const NodeType* ptr) {
-  static_assert(std::is_base_of<typename RefType::ContainerType, NodeType>::value,
-                "Can only cast to the ref of same container type");
-  return RefType(ptr->GetNodePtr());
-}
-
-template <typename SubRef, typename BaseRef>
-inline SubRef Downcast(BaseRef ref) {
-  CHECK(ref->template is_type<typename SubRef::ContainerType>() ||
-        ref->template derived_from<typename SubRef::ContainerType>())
-      << "Downcast from " << ref->type_key() << " to "
-      << SubRef::ContainerType::_type_key << " failed.";
-  return SubRef(std::move(ref.node_));
-}
-
-inline const Node* NodeRef::get() const {
-  return node_.get();
-}
-
-inline const Node* NodeRef::operator->() const {
-  return node_.get();
-}
-
-inline bool NodeRef::defined() const {
-  return node_.get() != nullptr;
-}
-
-inline bool NodeRef::operator==(const NodeRef& other) const {
-  return node_.get() == other.node_.get();
-}
-
-inline bool NodeRef::same_as(const NodeRef& other) const {
-  return node_.get() == other.node_.get();
-}
-
-inline bool NodeRef::operator<(const NodeRef& other) const {
-  return node_.get() < other.node_.get();
-}
-
-inline bool NodeRef::operator!=(const NodeRef& other) const {
-  return node_.get() != other.node_.get();
-}
-
-inline size_t NodeRef::hash() const {
-  return std::hash<Node*>()(node_.get());
-}
-
-inline uint32_t NodeRef::type_index() const {
-  CHECK(node_.get() != nullptr)
-      << "null type";
-  return get()->type_index();
-}
-
-template<typename T>
-inline const T* NodeRef::as() const {
-  const Node* ptr = static_cast<const Node*>(get());
-  if (ptr && ptr->is_type<T>()) {
-    return static_cast<const T*>(ptr);
-  }
-  return nullptr;
-}
-
-template<typename T>
-inline const T* NodeRef::as_derived() const {
-  const Node* ptr = static_cast<const Node*>(get());
-  if (ptr && (ptr->is_type<T>() || ptr->derived_from<T>())) {
-    return static_cast<const T*>(ptr);
-  }
-  return nullptr;
-}
-
-/*! \brief The hash function for nodes */
-struct NodeHash {
-  size_t operator()(const NodeRef& a) const {
-    return a.hash();
-  }
-};
-
-/*! \brief The equal comparator for nodes */
-struct NodeEqual {
-  bool operator()(const NodeRef& a, const NodeRef& b) const {
-    return a.get() == b.get();
-  }
-};
-}  // namespace tvm
-#endif  // TVM_NODE_NODE_H_
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index 7665e724b236..16f7363a9e73 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -7,6 +7,7 @@
 #define TVM_TENSOR_H_
 
 #include <ir/FunctionBase.h>
+#include <tvm/node/container.h>
 #include <string>
 #include <vector>
 #include <type_traits>
@@ -15,7 +16,6 @@
 #include "expr.h"
 #include "ir_operator.h"
 #include "arithmetic.h"
-#include "node/container.h"
 
 namespace tvm {
 
diff --git a/src/lang/node.cc b/src/lang/node.cc
deleted file mode 100644
index f7043eaf7b2a..000000000000
--- a/src/lang/node.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- *  Implementation of IR Node API
- * \file node.cc
- */
-#include <tvm/node/node.h>
-#include <memory>
-#include <atomic>
-#include <mutex>
-#include <unordered_map>
-
-namespace tvm {
-
-namespace {
-// single manager of operator information.
-struct TypeManager {
-  // mutex to avoid registration from multiple threads.
-  // recursive is needed for trigger(which calls UpdateAttrMap)
-  std::mutex mutex;
-  std::atomic<uint32_t> type_counter{0};
-  std::unordered_map<std::string, uint32_t> key2index;
-  std::vector<std::string> index2key;
-  // get singleton of the
-  static TypeManager* Global() {
-    static TypeManager inst;
-    return &inst;
-  }
-};
-}  // namespace
-
-const bool Node::_DerivedFrom(uint32_t tid) const {
-  static uint32_t tindex = TypeKey2Index(Node::_type_key);
-  return tid == tindex;
-}
-
-// this is slow, usually caller always hold the result in a static variable.
-uint32_t Node::TypeKey2Index(const char* key) {
-  TypeManager *t = TypeManager::Global();
-  std::lock_guard<std::mutex>(t->mutex);
-  std::string skey = key;
-  auto it = t->key2index.find(skey);
-  if (it != t->key2index.end()) {
-    return it->second;
-  }
-  uint32_t tid = ++(t->type_counter);
-  t->key2index[skey] = tid;
-  t->index2key.push_back(skey);
-  return tid;
-}
-
-const char* Node::TypeIndex2Key(uint32_t index) {
-  TypeManager *t = TypeManager::Global();
-  std::lock_guard<std::mutex>(t->mutex);
-  internal_assert(index != 0);
-  return t->index2key.at(index - 1).c_str();
-}
-
-}  // namespace tvm

From 6b242284ab04dc0b485c71e66c8119eece88f0ac Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 2 Oct 2018 11:17:21 +0800
Subject: [PATCH 151/529] [TOPI] Add conv2d int8 template (#1735)

---
 nnvm/python/nnvm/top/nn.py                 |  15 +-
 nnvm/src/top/nn/convolution.cc             |   1 -
 topi/python/topi/cuda/conv2d.py            |  17 +-
 topi/python/topi/cuda/conv2d_direct.py     |   5 +
 topi/python/topi/cuda/conv2d_int8.py       | 336 +++++++++++++++++++++
 topi/python/topi/cuda/conv2d_winograd.py   |   7 +
 topi/python/topi/generic/nn.py             |  18 ++
 topi/python/topi/nn/conv2d.py              |  34 +++
 topi/tests/python/test_topi_conv2d_int8.py | 177 +++++++++++
 9 files changed, 602 insertions(+), 8 deletions(-)
 create mode 100644 topi/python/topi/cuda/conv2d_int8.py
 create mode 100644 topi/tests/python/test_topi_conv2d_int8.py

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index b452738123c3..49192cacd713 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -90,10 +90,12 @@ def compute_conv2d(attrs, inputs, _):
     kernel_layout = attrs["kernel_layout"]
     out_dtype = attrs["out_dtype"]
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    assert layout == "NCHW" or layout == "NHWC"
+    assert layout in ["NCHW", "NHWC", "NCHW4c"]
     (dilation_h, dilation_w) = dilation
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
+    elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1):
+        raise ValueError("not support dilate now")
     elif dilation == (1, 1):
         kernel = inputs[1]
     elif layout == "NCHW":
@@ -101,7 +103,12 @@ def compute_conv2d(attrs, inputs, _):
     else: #layout == NHWC
         kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])
 
-    if groups == 1:
+    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
+        # pylint: disable=assignment-from-no-return
+        out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding,
+                                                  layout, out_dtype=out_dtype)
+        # pylint: enable=assignment-from-no-return
+    elif groups == 1:
         out = topi.nn.conv2d(
             inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
     elif layout == "NCHW" and \
@@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _):
 
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
-        expand_axis = 1 if layout == "NCHW" else 0
+        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
         bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
         out = topi.add(out, bias)
     return out
@@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target):
     with tvm.target.create(target):
         if groups == 1 and layout == "NCHW":
             return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NCHW4c":
+            return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs)
         elif groups == 1 and layout == "NHWC":
             return topi.generic.schedule_conv2d_nhwc(outs)
         elif groups == channels and layout == "NCHW":
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index d5c9c18f68a6..22bda048a0a2 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -344,7 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
 .set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_support_level(2);
 
-
 NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
 .describe(R"code(Weight transformation of winograd fast convolution algorithm.
 Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py
index a7d5f742d98c..4dac40746419 100644
--- a/topi/python/topi/cuda/conv2d.py
+++ b/topi/python/topi/cuda/conv2d.py
@@ -9,9 +9,10 @@
 
 from .conv2d_direct import schedule_direct_cuda
 from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
+from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8
 
 
-@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd'])
+@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
 def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for cuda backend.
 
@@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
         The config for this template
 
     data : tvm.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]
 
     kernel : tvm.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
 
     strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
@@ -98,6 +102,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
     if cfg.template_key == 'winograd':
         return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
                              pre_computed=False)
+    if cfg.template_key == 'int8':
+        return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype,
+                                 pre_computed=False)
 
     if layout == 'NCHW':
         return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
@@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
 
 
 @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
-                                ["direct", 'winograd'])
+                                ["direct", 'winograd', "int8"])
 def schedule_conv2d_nchw_cuda(cfg, outs):
     """TOPI schedule callback of conv2d for cuda gpu
 
@@ -138,6 +145,8 @@ def _callback(op):
             schedule_direct_cuda(cfg, s, op.output(0))
         if op.tag == 'conv2d_nchw_winograd':
             schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
+        if op.tag == "conv2d_NCHWc_int8":
+            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False)
 
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py
index 19e7ea38f647..9b315a6b0fc1 100644
--- a/topi/python/topi/cuda/conv2d_direct.py
+++ b/topi/python/topi/cuda/conv2d_direct.py
@@ -2,6 +2,7 @@
 """The templates for cuda conv2d operators"""
 import tvm
 from tvm import autotvm
+from ..util import get_const_tuple
 
 def schedule_direct_cuda(cfg, s, conv):
     """schedule optimized for batch size = 1"""
@@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv):
     # unroll
     s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
     s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    N, CO, OH, OW = get_const_tuple(output.shape)
+    _, KH, KW, CI = get_const_tuple(kernel.shape)
+    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
new file mode 100644
index 000000000000..053c9bc6bd31
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -0,0 +1,336 @@
+# pylint: disable=invalid-name
+"""Int8 conv2d in NCHWc layout"""
+import tvm
+from tvm import autotvm
+
+from .injective import _schedule_injective
+from ..generic import schedule_conv2d_NCHWc_int8_prepacked
+from .tensor_intrin import dp4a
+from ..nn.conv2d import conv2d_NCHWc_int8_prepacked
+from ..nn.pad import pad
+from ..nn.util import get_pad_tuple
+from ..util import get_const_tuple, get_const_int, traverse_inline
+
+
+def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype):
+    """convert argument to workload"""
+    shape = get_const_tuple(data.shape)
+    if len(shape) == 5:
+        N, ic_chunk, H, W, ic_block = shape
+        raw_data = tvm.placeholder(
+            (N, ic_chunk*ic_block, H, W), dtype=data.dtype)
+    else:
+        raw_data = data
+
+    shape = get_const_tuple(kernel.shape)
+    if len(shape) == 6:
+        oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape
+        raw_kernel = tvm.placeholder(
+            (oc_chunk*oc_block, ic_chunk*ic_block, KH, KW), dtype=kernel.dtype)
+    else:
+        raw_kernel = kernel
+
+    return ('conv2d', ) + autotvm.task.task.args_to_workload(
+        [raw_data, raw_kernel, stride, padding, "NCHW", out_dtype])
+
+
+def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre_computed):
+    """Convolution operator in NCHW[x]c layout for int8.
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+
+    padding: int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+
+    layout : str
+        layout of data
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    pre_computed : str
+        Whether packed data and kernel are pre-computed
+
+    Returns
+    -------
+    output : tvm.Tensor
+        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
+    """
+    assert layout in ["NCHW", "NCHW4c"]
+
+    ic_block_factor = 4
+    oc_block_factor = 4
+
+    if not pre_computed:
+        batch, channels, height, width = get_const_tuple(data.shape)
+        assert channels % ic_block_factor == 0, \
+            "Number of input channels should be multiple of {}".format(
+                ic_block_factor)
+        packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
+                                   ic_block_factor),
+                                  lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w],
+                                  name="packed_data")
+
+        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(
+            kernel.shape)
+        assert out_channels % 4 == 0, \
+            "Number of output channels should be multiple of {}".format(
+                oc_block_factor)
+        packed_kernel = tvm.compute(
+            (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w,
+             oc_block_factor, ic_block_factor),
+            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block:
+            kernel[oc_chunk * oc_block_factor + oc_block,
+                   ic_chunk * ic_block_factor + ic_block, kh, kw],
+            name="packed_kernel")
+
+    else:
+        packed_data = data
+        packed_kernel = kernel
+
+    batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(
+        packed_data.shape)
+    oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
+        packed_kernel.shape)
+
+    if isinstance(stride, int):
+        stride_h, stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left, 0]
+    pad_after = [0, 0, pad_down, pad_right, 0]
+    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
+
+    # compute the output shape
+    out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1
+
+    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
+
+    icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk')
+    icb = tvm.reduce_axis((0, ic_block), name='ic_block')
+    kh = tvm.reduce_axis((0, kernel_h), name='kh')
+    kw = tvm.reduce_axis((0, kernel_w), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(pad_data[n, icc, oh*stride_h+kh, ow*stride_w+kw, icb]
+                               .astype('int32') *
+                               packed_kernel[oc_chunk, icc,
+                                             kh, kw, oc_block, icb]
+                               .astype('int32'),
+                               axis=[icc, kh, kw, icb]))
+
+    output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                         conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
+                         tag="conv2d_NCHWc_int8",
+                         attrs={"workload": _conv2d_NCHWc_int8_arg_to_workload(
+                             data, kernel, stride, padding, out_dtype)})
+
+    # num flop
+    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
+        ic_chunk * ic_block * kernel_h * kernel_w * 2
+    cfg.add_flop(num_flop)
+
+    return output
+
+
+_dp4a = dp4a('shared', 'shared', 'local')
+
+
+def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
+    """Schedule conv2d int8 NCHWc template"""
+    workload = output.op.attrs["workload"]
+
+    stride = workload[3]
+
+    conv = output.op.input_tensors[0]
+    packed_data, packed_kernel = conv.op.input_tensors
+
+    if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag:
+        pad_data = packed_data
+        packed_data = pad_data.op.input_tensors[0]
+    else:
+        pad_data = packed_data
+
+    if not pre_computed:
+        kernel, = packed_kernel.op.input_tensors
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # skip this part during tuning to make recrods accurate
+            # this part will be pre-computed during NNVM's pre-compute optimization pass
+            s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
+            s[packed_kernel].pragma(
+                s[packed_kernel].op.axis[0], "debug_skip_region")
+        else:
+            _schedule_injective(packed_data.op, s)
+            _schedule_injective(packed_kernel.op, s)
+    else:
+        kernel = packed_data
+
+    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    if pad_data != packed_data:
+        s[pad_data].compute_inline()
+
+    batch = get_const_int(packed_data.shape[0])
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [conv])
+    WW = s.cache_read(packed_kernel, 'shared', [conv])
+
+    s[conv].set_scope('local')
+
+    # handle bias
+    if output.op not in s.outputs:
+        s[output].compute_inline()
+        output = s.outputs[0].output(0)
+
+    # tile and bind spatial axes
+    n, f, y, x, c = s[output].op.axis
+    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
+    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
+    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
+
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    # this is the scope to attach global config inside this kernel
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    max_block_z = 128
+    if batch > max_block_z:
+        _, n = s[output].split(n, factor=max_block_z)
+    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+    fused_byx = s[output].fuse(by, bx)
+    s[output].bind(n, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
+    s[output].bind(fused_byx, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    s[conv].compute_at(s[output], tx)
+
+    # tile and bind reduction axes
+    n, f, y, x, c = s[conv].op.axis
+
+    rc, ry, rx, rc_block = s[conv].op.reduce_axis
+    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
+    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
+    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
+    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
+    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
+    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)
+
+    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
+
+    _, rc_block = s[conv].split(rc_block, factor=4)
+    s[conv].tensorize(rc_block, _dp4a)
+
+    s[AA].compute_at(s[conv], rxo)
+    s[WW].compute_at(s[conv], rxo)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        if load == AA:
+            n, f, y, x, c = s[load].op.axis
+            if pad_data == packed_data and stride_h == 1 and stride_w == 1:
+                s[load].vectorize(c)
+                fused = s[load].fuse(n, f, y, x)
+            else:
+                c, _ = s[load].split(c, factor=4)
+                fused = s[load].fuse(n, f, y, x, c)
+        else:
+            n, f, y, x, oc_chunk, c = s[load].op.axis
+            fused = s[load].fuse(n, f, y, x, oc_chunk)
+            s[load].vectorize(c)
+
+        fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2])
+        fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2])
+        fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2])
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # double buffer
+    cfg.define_knob('AA_double_buffer', [0, 1])
+    cfg.define_knob('WW_double_buffer', [0, 1])
+    if cfg['AA_double_buffer'].val:
+        s[AA].double_buffer()
+    if cfg['WW_double_buffer'].val:
+        s[WW].double_buffer()
+
+    # unroll
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
+                     cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', False)
+
+    return s
+
+
+@conv2d_NCHWc_int8_prepacked.register(["cuda"])
+@autotvm.task.dispatcher
+def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype):
+    assert layout == 'NCHW4c'
+    return _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype)
+
+
+@conv2d_NCHWc_int8_prepacked_dispatcher.register("int8")
+def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype):
+    return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype,
+                             pre_computed=True)
+
+@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda"], ["int8"])
+def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs):
+    """TOPI schedule callback of conv2d for cuda
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_NCHWc_int8' in op.tag:
+            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=True)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index 7e0574ea606b..6a0a126b9e4f 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -375,6 +375,13 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         if cfg.template_key == 'direct':
             return None
 
+        if cfg.template_key == 'int8':
+            assert 'cuda' in tvm.target.current_target().keys
+            new_attrs['layout'] = 'NCHW4c'
+            new_attrs['out_layout'] = 'NCHW4c'
+            new_attrs['kernel_layout'] = 'OIHW4o4i'
+            return sym.conv2d(*copy_inputs, **new_attrs)
+
         # pre-compute weight transformation in winograd
         tile_size = _infer_tile_size(tinfos[0], tinfos[1])
 
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 874decc792ec..e99ce263296b 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -139,6 +139,24 @@ def schedule_conv2d_winograd_without_weight_transform(outs):
     return _default_schedule(outs, False)
 
 
+@tvm.target.generic_func
+def schedule_conv2d_NCHWc_int8_prepacked(outs):
+    """Schedule for conv2d NCHWc int8 with prepacked data and kernel
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of this operator
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_conv2d_transpose_nchw(outs):
     """Schedule for conv2d_transpose_nchw
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 3e06f6f6fed5..4d70c4903a3f 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -423,3 +423,37 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
+
+
+@tvm.target.generic_func
+def conv2d_NCHWc_int8_prepacked(data, kernel, stride, padding, layout, out_dtype):
+    """Convolution operator in NCHW[x]c layout for int8. Data and kernel should be packed in
+    advance.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+
+    padding: int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+
+    layout : str
+        layout of data
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
+    """
+    raise ValueError("missing register for topi.nn.conv2d_NCHWc_int8_prepacked")
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
new file mode 100644
index 000000000000..2b85b2b97cb1
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -0,0 +1,177 @@
+"""Example code to do convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend
+
+oc_block_factor = 4
+
+
+def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8')
+    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8')
+    bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias',
+                            dtype='int8')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
+        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype)
+
+        # convert to NCHWc
+        _, _, out_height, out_width = c_np.shape
+        c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \
+                out_height, out_width)).transpose(0, 1, 3, 4, 2)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
+            C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding),
+                               layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["cuda"]:
+        check_device(device)
+
+
+class NCHWcInt8Fallback(autotvm.FallbackContext):
+    def _query_inside(self, target, workload):
+        key = (target, workload)
+        if key in self.memory:
+            return self.memory[key]
+        cfg = FallbackConfigEntity()
+        cfg.template_key = 'int8'
+        self.memory[key] = cfg
+        return cfg
+
+
+def test_conv2d_nchw():
+    with NCHWcInt8Fallback():
+        # ResNet18 workloads where channels in / out are multiple of oc_block_factor
+        verify_conv2d_NCHWc_int8(1,  64,  56,  64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,  64,  56,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  64,  56, 128, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1,  64,  56, 128, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 128, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 256, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 256, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 256, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 512, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 512, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 512,   7, 512, 3, 1, 1)
+
+        # bias, relu
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
+        # batch size
+        verify_conv2d_NCHWc_int8(4, 64, 56, 64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(9, 64, 56, 64, 3, 1, 1)
+
+        # weird workloads
+        verify_conv2d_NCHWc_int8(4, 4, 4, 4, 4, 4, 4)
+
+        # inception v3 workloads where channels in / out are multiple of oc_block_factor
+        verify_conv2d_NCHWc_int8(1,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   32, 147,  64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,   64,  73,  80, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   80,  73, 192, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   48,  35,  64, 5, 1, 2)
+        verify_conv2d_NCHWc_int8(1,   64,  35,  96, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,   96,  35,  96, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  32, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  256,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  256,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35, 384, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,   96,  35,  96, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 128, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 128, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 128, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 160, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 160, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 160, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 320, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 320, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  384,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  384,   8, 384, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 448, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  448,   8, 384, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 320, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 448, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 1024,  19,  84, 3, 1, 1)
+
+
+if __name__ == "__main__":
+    test_conv2d_nchw()

From 155359e847eafb207a9d4e27ed1f73d9a187d540 Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Tue, 2 Oct 2018 06:43:38 +0300
Subject: [PATCH 152/529] [NNVM][TEST] Numgrad: fix nan and multioutput (#1754)

---
 nnvm/python/nnvm/testing/check_computation.py | 84 ++++++++++---------
 nnvm/tests/python/compiler/test_top_level1.py |  1 +
 2 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
index a207e8eb8ceb..76d7b66b140f 100644
--- a/nnvm/python/nnvm/testing/check_computation.py
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -55,84 +55,84 @@ def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None):
     """
     # Preprocess input parameters
     if shape is None:
-        shape = {}
+        provided_shapes = {}
+    elif isinstance(shape, dict):
+        provided_shapes = shape
+    else:
+        provided_shapes = {x: shape for x in graph.symbol.list_input_variables()}
 
     if dtype is None:
-        dtype = {}
-
-    if not isinstance(shape, dict):
-        shape = {x: shape for x in graph.symbol.list_input_variables()}
-
-    if not isinstance(dtype, dict):
-        dtype = {x: dtype for x in graph.symbol.list_input_variables()}
+        provided_dtypes = {}
+    elif isinstance(dtype, dict):
+        provided_dtypes = dtype
+    else:
+        provided_dtypes = {x: dtype for x in graph.symbol.list_input_variables()}
 
-    shape = _dict_var_to_dict_str(shape)
-    dtype = _dict_var_to_dict_str(dtype)
+    provided_shapes = _dict_var_to_dict_str(provided_shapes)
+    provided_dtypes = _dict_var_to_dict_str(provided_dtypes)
 
     # The graph may already contain shape and dtype info, so extract it and merge with
     # the user-specified shapes and dtypes (use the user-specified one on contradiction)
-    all_initial_shapes = graph.json_attr('shape')
-    all_initial_dtypes = graph.json_attr('dtype')
+    preexisting_shapes = graph.json_attr('shape')
+    preexisting_dtypes = graph.json_attr('dtype')
 
-    if all_initial_shapes:
+    if preexisting_shapes:
         for x in graph.index.input_names:
-            if x not in shape:
-                x_shape = tuple(all_initial_shapes[graph.index.entry_id(x)])
-                shape[x] = x_shape
+            if x not in provided_shapes:
+                x_shape = tuple(preexisting_shapes[graph.index.entry_id(x)])
+                provided_shapes[x] = x_shape
 
-    if all_initial_dtypes:
+    if preexisting_dtypes:
         for x in graph.index.input_names:
-            if x not in dtype:
-                x_dtype = TCODE_TO_DTYPE[all_initial_dtypes[graph.index.entry_id(x)]]
-                dtype[x] = x_dtype
+            if x not in provided_dtypes:
+                x_dtype = TCODE_TO_DTYPE[preexisting_dtypes[graph.index.entry_id(x)]]
+                provided_dtypes[x] = x_dtype
 
     # Perform inference
-    nnvm.compiler.graph_attr.set_shape_inputs(graph, shape)
-    nnvm.compiler.graph_attr.set_dtype_inputs(graph, dtype)
+    nnvm.compiler.graph_attr.set_shape_inputs(graph, provided_shapes)
+    nnvm.compiler.graph_attr.set_dtype_inputs(graph, provided_dtypes)
 
     graph = graph.apply('InferShape').apply('InferType')
 
-    shapes = graph.json_attr('shape')
-    dtypes = graph.json_attr('dtype')
-
-    out_len = len(graph.symbol.list_output_names())
+    inferred_shapes = graph.json_attr('shape')
+    inferred_dtypes = graph.json_attr('dtype')
 
     index = graph.index
 
-    output_shapes = \
-        [tuple(shapes[index.entry_id(index.output_entries[i])]) for i in range(out_len)]
-    output_dtypes = \
-        [TCODE_TO_DTYPE[dtypes[index.entry_id(index.output_entries[i])]] for i in range(out_len)]
+    output_shapes = [tuple(inferred_shapes[index.entry_id(entry)])
+                     for entry in index.output_entries]
+    output_dtypes = [TCODE_TO_DTYPE[inferred_dtypes[index.entry_id(entry)]]
+                     for entry in index.output_entries]
 
     # Postprocess the results
-    input_shapes = shape.copy()
-    input_dtypes = dtype.copy()
+    input_shapes = provided_shapes.copy()
+    input_dtypes = provided_dtypes.copy()
 
     for x in graph.symbol.list_input_variables():
         x_name = x.attr('name')
-        x_node_id = graph.index.node_id(x_name)
-        input_shapes[x_name] = tuple(shapes[x_node_id])
-        input_dtypes[x_name] = TCODE_TO_DTYPE[dtypes[x_node_id]]
+        x_entry_id = graph.index.entry_id(x_name)
+        input_shapes[x_name] = tuple(inferred_shapes[x_entry_id])
+        input_dtypes[x_name] = TCODE_TO_DTYPE[inferred_dtypes[x_entry_id]]
 
     # Merge the original user-specified shapes in case some of them are specified for non-existing
     # variables
-    for x_name, x_shape in shape.items():
+    for x_name, x_shape in provided_shapes.items():
         x_shape = tuple(x_shape)
         if input_shapes.get(x_name, x_shape) != x_shape:
             raise RuntimeError("Inferred shape differs from the provided shape.\n"
                                "Provided shapes: {}\nInferred shapes: {}"
-                               .format(shapes, input_shapes))
+                               .format(provided_shapes, input_shapes))
         else:
             input_shapes[x_name] = x_shape
 
     # Merge the original user-specified dtypes
-    for x_name, x_dtype in dtype.items():
+    for x_name, x_dtype in provided_dtypes.items():
         if not isinstance(x_dtype, str):
             x_dtype = TCODE_TO_DTYPE[x_dtype]
         if input_dtypes.get(x_name, x_dtype) != x_dtype:
             raise RuntimeError("Inferred dtype differs from the provided dtype.\n"
                                "Provided dtypes: {}\nInferred dtypes: {}"
-                               .format(dtypes, input_dtypes))
+                               .format(provided_dtypes, input_dtypes))
         else:
             input_dtypes[x_name] = x_dtype
 
@@ -622,6 +622,12 @@ def compare_derivative(j, n_der, grad):
         dist = np.sqrt(np.sum((ngrad - grad)**2))
         grad_norm = np.sqrt(np.sum(ngrad**2))
 
+        if not (np.isfinite(dist) and np.isfinite(grad_norm)):
+            raise ValueError(
+                "NaN or infinity detected during numerical gradient checking wrt {}\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                .format(x_name, grad, ngrad))
+
         # we multiple atol by this number to make it more universal for different sizes
         sqrt_n = np.sqrt(float(np.prod(grad.shape)))
 
diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py
index ba6280dd9b14..089ae84cd2b8 100644
--- a/nnvm/tests/python/compiler/test_top_level1.py
+++ b/nnvm/tests/python/compiler/test_top_level1.py
@@ -96,6 +96,7 @@ def _check_function_must_fail(*args, **kwargs):
     _check_function_must_fail(sym.block_grad(x + 2*y), numerical_grads=True)
     _check_function_must_fail(x*x, numerical_grads=True,
                               numerical_grads_params={'atol': 0.0, 'rtol': 0.0})
+    _check_function_must_fail(sym.log(-x*x), numerical_grads=True, error=ValueError)
 
     # different styles of returning results from the forward function
     check_function(x + 2*y, lambda x, y: [x + 2*y], numerical_grads=False)

From 08b36630d0681d784da29aa58b4b0631a64ad20b Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Tue, 2 Oct 2018 12:25:11 -0400
Subject: [PATCH 153/529] Change error.h path in doc.h (#1794)

---
 src/relay/ir/doc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/relay/ir/doc.h b/src/relay/ir/doc.h
index 15e965e5b818..1837eedd6006 100644
--- a/src/relay/ir/doc.h
+++ b/src/relay/ir/doc.h
@@ -12,6 +12,7 @@
 #ifndef TVM_RELAY_IR_DOC_H_
 #define TVM_RELAY_IR_DOC_H_
 
+#include <tvm/relay/error.h>
 #include <unordered_map>
 #include <utility>
 #include <string>
@@ -20,7 +21,6 @@
 #include <memory>
 #include <ostream>
 #include <map>
-#include "error.h"
 
 namespace tvm {
 namespace relay {

From 765002b5f909fd99dd52cfcadd02dfb7e36c7583 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 2 Oct 2018 15:59:53 -0700
Subject: [PATCH 154/529] [TOPI] Update TopHub and benchmark (#1796)

---
 apps/benchmark/arm_cpu_imagenet_bench.py        |  8 +++++---
 apps/benchmark/gpu_imagenet_bench.py            |  6 ++++--
 apps/benchmark/mobile_gpu_imagenet_bench.py     |  8 +++++---
 python/tvm/autotvm/tophub.py                    | 10 ++++++----
 topi/python/topi/arm_cpu/conv2d.py              |  4 ++--
 topi/python/topi/mali/conv2d.py                 | 10 ++++++++--
 topi/tests/python/test_topi_conv2d_nchw.py      |  9 ++-------
 topi/tests/python/test_topi_depthwise_conv2d.py | 12 ++++--------
 8 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 2d7116475bc5..5b666bc9d2e0 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -58,8 +58,10 @@ def evaluate_network(network, target, target_host, number):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-                        ['resnet-18', 'resnet-34', 'vgg-16',
-                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.0', 'squeezenet v1.1'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str, choices=
                         ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
                          'pixel2', 'rasp3b', 'pynq'], default='rk3399',
@@ -68,7 +70,7 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=6)
+    parser.add_argument("--number", type=int, default=3)
     args = parser.parse_args()
 
     dtype = 'float32'
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index 873e60f82c59..a0eb4a055103 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -17,8 +17,10 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-        ['resnet-18', 'resnet-34', 'resnet-50', 'vgg-16', 'vgg-19',
-         'inception_v3', 'mobilenet', 'mobilenet_v2', 'densenet-121'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str,
                         choices=['1080ti', 'titanx', 'gfx900'], default='1080ti',
                         help="The model of the test device. If your device is not listed in "
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
index 8e29fa5dab9a..a75620b3fe08 100644
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -58,8 +58,10 @@ def evaluate_network(network, target, target_host, number):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-                        ['resnet-18', 'resnet-34', 'vgg-16',
-                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.1'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str, choices=
                         ['rk3399'], default='rk3399',
                         help="The model of the test device. If your device is not listed in "
@@ -67,7 +69,7 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=10)
+    parser.add_argument("--number", type=int, default=30)
     args = parser.parse_args()
 
     dtype = 'float32'
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index bde706ee6cfb..9a309fd5b338 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -20,12 +20,12 @@
 
 # the version of each package
 PACKAGE_VERSION = {
-    'arm_cpu': "v0.01",
+    'arm_cpu': "v0.03",
 
     'cuda':    "v0.02",
     'rocm':    "v0.01",
     'opencl':  "v0.01",
-    'mali':    "v0.01",
+    'mali':    "v0.02",
 
     'vta':     "v0.01",
 }
@@ -38,7 +38,7 @@ def _alias(name):
         'vtacpu': 'vta',
 
         'metal': 'opencl',
-        'nvptx': 'cuda'
+        'nvptx': 'cuda',
     }
     return table.get(name, name)
 
@@ -61,11 +61,12 @@ def context(target, extra_files=None):
     if isinstance(target, str):
         target = _target.create(target)
 
-    possible_names = [str(target).split()[0]]
+    possible_names = []
     for opt in target.options:
         if opt.startswith("-device"):
             device = _alias(opt[8:])
             possible_names.append(device)
+    possible_names.append(target.target_name)
 
     all_packages = list(PACKAGE_VERSION.keys())
     for name in possible_names:
@@ -75,6 +76,7 @@ def context(target, extra_files=None):
 
             filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
             best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
+            break   # only load one file to avoid some fallback template mismatch problem
 
     if extra_files:
         for filename in extra_files:
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 6a924a4b133c..a193e9acf5cb 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -506,8 +506,8 @@ def _callback(op):
 
 
 ##### REGISTER ALTER OP LAYOUT #####
-@conv2d_alter_layout.register(["arm_cpu", "mali"])
-def _alter_conv2d_layout(attrs, inputs, tinfos):
+@conv2d_alter_layout.register(["arm_cpu"])
+def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     """Alter op layout for pre-computing kernel transformation"""
     import nnvm.symbol as sym
     copy_inputs = [s for s in inputs]
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 6bbf735af18e..d031acdd9a2b 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -9,11 +9,11 @@
 from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
 from ..util import traverse_inline, get_const_int, get_const_tuple, const_matrix
 from ..nn import conv2d, conv2d_winograd_without_weight_transform, \
-    get_pad_tuple, pad
+    get_pad_tuple, pad, conv2d_alter_layout
 
 # reuse some compute declarations from ARM CPU
 from ..arm_cpu.conv2d import _conv_arg_to_workload, _decl_spatial_pack,\
-    _winograd_conv_arg_to_workload
+    _winograd_conv_arg_to_workload, _alter_conv2d_layout_arm
 
 
 @conv2d.register('mali')
@@ -410,6 +410,12 @@ def _schedule_winograd(cfg, s, op):
 
     s[Y].compute_at(s[output], tt)
 
+@conv2d_alter_layout.register(["mali"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):
+    try:
+        return _alter_conv2d_layout_arm(attrs, inputs, tinfos)
+    except KeyError:  # to filter out fallback opencl templates
+        return None
 
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @conv2d_winograd_without_weight_transform.register(['mali'])
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index f65832a14bdb..14aa0b742a8a 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -69,16 +69,11 @@ def check_device(device):
         np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
 
 def test_conv2d_nchw():
-    # load tophub
-    ctx = autotvm.apply_history_best([])
-    for device in get_all_backend():
-        context = autotvm.tophub.context(device)
-        context.__enter__()
-
     # ResNet18 workloads
     verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
     verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 4d3c45763dfb..b03916b9ba09 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -102,7 +102,8 @@ def get_ref_data():
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
 
 def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
@@ -201,16 +202,11 @@ def get_ref_data():
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
 
 def test_depthwise_conv2d():
-    # load tophub
-    ctx = autotvm.apply_history_best([])
-    for device in get_all_backend():
-        context = autotvm.tophub.context(device)
-        context.__enter__()
-
     # mobilenet workloads
     depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nchw(1, 64, 112, 1, 3, 2, "SAME")

From 39564de4ff328bf161eea2e76d49d04e4ed7e194 Mon Sep 17 00:00:00 2001
From: James Gilles <jhgilles@mit.edu>
Date: Wed, 3 Oct 2018 13:15:06 -0400
Subject: [PATCH 155/529] Fix vulkan build with homebrew install of vulkan-sdk
 (#1802)

---
 cmake/util/FindVulkan.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
index 15c85bfe27c3..504058c66b62 100644
--- a/cmake/util/FindVulkan.cmake
+++ b/cmake/util/FindVulkan.cmake
@@ -45,11 +45,11 @@ macro(find_vulkan use_vulkan)
   if(Vulkan_FOUND)
     get_filename_component(VULKAN_LIBRARY_PATH ${Vulkan_LIBRARY} DIRECTORY)
     find_library(Vulkan_SPIRV_TOOLS_LIBRARY SPIRV-Tools
-      ${VULKAN_LIBRARY_PATH}/spirv-tools)
+        HINTS ${VULKAN_LIBRARY_PATH} ${VULKAN_LIBRARY_PATH}/spirv-tools)
 
     find_path(_libspirv libspirv.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv-tools)
-    find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
-    find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+    find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan SPIRV spirv/unified1)
+    find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan SPIRV spirv/unified1)
     list(APPEND Vulkan_INCLUDE_DIRS ${_libspirv} ${_spirv} ${_glsl_std})
     message(STATUS "Vulkan_INCLUDE_DIRS=" ${Vulkan_INCLUDE_DIRS})
     message(STATUS "Vulkan_LIBRARY=" ${Vulkan_LIBRARY})

From 6f5590035d7d2db92cda5f080dc6f6a2919fc80d Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 3 Oct 2018 10:15:36 -0700
Subject: [PATCH 156/529] Use ctx instead of tvm.gpu(0) in nnvm_quick_start
 tutorial (#1801)

---
 tutorials/nnvm_quick_start.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py
index e16184300e2f..0244cbe81e5e 100644
--- a/tutorials/nnvm_quick_start.py
+++ b/tutorials/nnvm_quick_start.py
@@ -133,7 +133,7 @@
 loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
 input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
 
-module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
+module = graph_runtime.create(loaded_json, loaded_lib, ctx)
 module.load_params(loaded_params)
 module.run(data=input_data)
 out = module.get_output(0).asnumpy()

From c5d9c8f82b64e049b9806018273021fc85ff739c Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 3 Oct 2018 10:16:09 -0700
Subject: [PATCH 157/529] Change cat image extension to png to match its
 download URL (#1800)

---
 .gitignore                                              | 1 +
 nnvm/tests/python/frontend/coreml/model_zoo/__init__.py | 2 +-
 tutorials/nnvm/deploy_model_on_mali_gpu.py              | 2 +-
 tutorials/nnvm/deploy_model_on_rasp.py                  | 2 +-
 tutorials/nnvm/from_keras.py                            | 4 ++--
 tutorials/nnvm/from_mxnet.py                            | 2 +-
 tutorials/nnvm/from_mxnet_to_webgl.py                   | 2 +-
 7 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1a04e291302e..833eee1a0774 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,6 +179,7 @@ perf
 *.h5
 synset.txt
 cat.jpg
+cat.png
 docs.tgz
 cat.png
 *.mlmodel
diff --git a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
index 87b9b5668432..0a39053b6d47 100644
--- a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
+++ b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
@@ -25,7 +25,7 @@ def get_resnet50():
 
 def get_cat_image():
     url = 'https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png'
-    dst = 'cat.jpg'
+    dst = 'cat.png'
     real_dst = os.path.abspath(os.path.join(os.path.dirname(__file__), dst))
     download(url, real_dst)
     img = Image.open(real_dst).resize((224, 224))
diff --git a/tutorials/nnvm/deploy_model_on_mali_gpu.py b/tutorials/nnvm/deploy_model_on_mali_gpu.py
index 10aac3a67b94..6e3962a6609f 100644
--- a/tutorials/nnvm/deploy_model_on_mali_gpu.py
+++ b/tutorials/nnvm/deploy_model_on_mali_gpu.py
@@ -91,7 +91,7 @@
 ######################################################################
 # In order to test our model, here we download an image of cat and
 # transform its format.
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name)
 image = Image.open(img_name).resize((224, 224))
 
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index 807365829809..fa5fd2b0952f 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -88,7 +88,7 @@
 ######################################################################
 # In order to test our model, here we download an image of cat and
 # transform its format.
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name)
 image = Image.open(img_name).resize((224, 224))
 
diff --git a/tutorials/nnvm/from_keras.py b/tutorials/nnvm/from_keras.py
index 5c13b8b1d30a..fcac3adc79e1 100644
--- a/tutorials/nnvm/from_keras.py
+++ b/tutorials/nnvm/from_keras.py
@@ -56,8 +56,8 @@ def download(url, path, overwrite=False):
 from matplotlib import pyplot as plt
 from keras.applications.resnet50 import preprocess_input
 img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-download(img_url, 'cat.jpg')
-img = Image.open('cat.jpg').resize((224, 224))
+download(img_url, 'cat.png')
+img = Image.open('cat.png').resize((224, 224))
 plt.imshow(img)
 plt.show()
 # input preprocess
diff --git a/tutorials/nnvm/from_mxnet.py b/tutorials/nnvm/from_mxnet.py
index 78247dbe2b0a..dcecf3c42bcc 100644
--- a/tutorials/nnvm/from_mxnet.py
+++ b/tutorials/nnvm/from_mxnet.py
@@ -33,7 +33,7 @@
 from PIL import Image
 from matplotlib import pyplot as plt
 block = get_model('resnet18_v1', pretrained=True)
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
diff --git a/tutorials/nnvm/from_mxnet_to_webgl.py b/tutorials/nnvm/from_mxnet_to_webgl.py
index 75279839bfb3..4e7b57706de6 100644
--- a/tutorials/nnvm/from_mxnet_to_webgl.py
+++ b/tutorials/nnvm/from_mxnet_to_webgl.py
@@ -148,7 +148,7 @@ def download_image():
     from PIL import Image
 
     url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
-    img_name = "cat.jpg"
+    img_name = "cat.png"
 
     gluon.utils.download(url, img_name)
     image = Image.open(img_name).resize((224, 224))

From 475881fa36a2e514bc8db5038176598772ed2962 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 3 Oct 2018 10:16:58 -0700
Subject: [PATCH 158/529] [AUTOTVM] Support multiple targets load in tophub
 (#1803)

---
 python/tvm/autotvm/tophub.py | 43 +++++++++++++++++++-----------------
 tutorials/nnvm/deploy_ssd.py | 12 ++++++++--
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 9a309fd5b338..64295d158af5 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -51,32 +51,35 @@ def context(target, extra_files=None):
 
     Parameters
     ----------
-    target: Target
+    target: Target or List of Target
         The compilation target
     extra_files: list of str, optional
         Extra log files to load
     """
     best_context = ApplyHistoryBest([])
 
-    if isinstance(target, str):
-        target = _target.create(target)
-
-    possible_names = []
-    for opt in target.options:
-        if opt.startswith("-device"):
-            device = _alias(opt[8:])
-            possible_names.append(device)
-    possible_names.append(target.target_name)
-
-    all_packages = list(PACKAGE_VERSION.keys())
-    for name in possible_names:
-        name = _alias(name)
-        if name in all_packages:
-            check_backend(name)
-
-            filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
-            best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
-            break   # only load one file to avoid some fallback template mismatch problem
+    targets = target if isinstance(target, (list, tuple)) else [target]
+
+    for tgt in targets:
+        if isinstance(tgt, str):
+            tgt = _target.create(tgt)
+
+        possible_names = []
+        for opt in tgt.options:
+            if opt.startswith("-device"):
+                device = _alias(opt[8:])
+                possible_names.append(device)
+        possible_names.append(tgt.target_name)
+
+        all_packages = list(PACKAGE_VERSION.keys())
+        for name in possible_names:
+            name = _alias(name)
+            if name in all_packages:
+                check_backend(name)
+
+                filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
+                best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
+                break   # only load one file to avoid some fallback template mismatch problem
 
     if extra_files:
         for filename in extra_files:
diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py
index 26591b86c692..3f5f89a632b6 100644
--- a/tutorials/nnvm/deploy_ssd.py
+++ b/tutorials/nnvm/deploy_ssd.py
@@ -22,12 +22,20 @@
 
 
 ######################################################################
-# Set the parameters here
-# -----------------------
+# Preliminary and Set parameters
+# ------------------------------
+# We should build TVM with sort support, in TVM root directory
+#
+# .. code-block:: bash
+#
+#   echo "set(USE_SORT ON)" > config.mk
+#   make -j8
+#
 # .. note::
 #
 #   Currently we support compiling SSD on CPU only.
 #   GPU support is in progress.
+#
 
 model_name = "ssd_resnet50_512"
 model_file = "%s.zip" % model_name

From e198c15c5bac3a618036f433d19ee7c0446a7e2f Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 3 Oct 2018 18:13:45 -0700
Subject: [PATCH 159/529] [Tutorial] tutorial for tensorize (#1774)

---
 tutorials/language/tensorize.py | 286 ++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)
 create mode 100644 tutorials/language/tensorize.py

diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
new file mode 100644
index 000000000000..675306de064e
--- /dev/null
+++ b/tutorials/language/tensorize.py
@@ -0,0 +1,286 @@
+"""
+Use Tensorize to Leverage Hardware Intrinsics
+=============================================
+**Author**: `Yizhi Liu <https://github.com/yzhliu>`_
+
+This is an introduction material on how to perform tensorization in TVM.
+
+By using schedule primitive :code:`tensorize`,
+people can replace a unit of computation with the corresponding intrinsics,
+making it easy to leverage handcrafted micro-kernels,
+as well as extend TVM to support new hardware architectures.
+
+The purpose of this tutorial is to show the functionality
+and usage of tensorize instead of providing an efficient solution.
+
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+import numpy as np
+
+######################################################################
+# Define Matrix Multiplication
+# ----------------------------
+# Take matrix multiplication as our example.
+# Matmul first multiply the corresponding elements between two matrix,
+# then accumulate across a certain axis.
+# The following lines describe the computation :code:`A * B^T` in TVM.
+#
+N, M, L = 1024, 512, 64
+A = tvm.placeholder((N, L), name='A')
+B = tvm.placeholder((M, L), name='B')
+k = tvm.reduce_axis((0, L), name='k')
+C = tvm.compute((N, M), lambda i, j:
+                tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
+s = tvm.create_schedule(C.op)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Schedule the Matmul
+# -------------------
+# Now, suppose we have an accelerator that supports
+# matrix-vector multiplication (GEMV) as a hardware primitive,
+# which can take arbitrary size of reduce axis,
+# but another axis needs to be no larger than 16.
+# Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV.
+#
+factor = 16
+x, y = C.op.axis
+z, = C.op.reduce_axis
+yo, yi = s[C].split(y, factor=factor)
+s[C].reorder(x, yo, yi, z)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# As showed in the IR printed above,
+# the inner loops :code:`j.inner` along with :code:`k` together form a computation of GEMV
+# - within the inner most two loops, the index :code:`i` is fixed,
+# the access to the matrix :code:`A` only varies by :code:`k`,
+# which makes the access pattern of :code:`A` a "vector".
+# In order to leverage our hypothetical hardware's GEMV instruction,
+# we can tensorize over :code:`j.inner`.
+#
+# Define GEMV Tensorization Intrinsic
+# -----------------------------------
+# Before scheduling the tensorization, we need to first define the intrinsic function for GEMV.
+# It includes two parts, the first is a compute definition of GEMV.
+# TVM uses it to match the computing pattern in the original Matmul schedule.
+# The second is to specify how to execute GEMV on the device,
+# which is done in :code:`intrin_func` below.
+#
+def intrin_gemv(m, l):
+    a = tvm.placeholder((l,), name='a')
+    b = tvm.placeholder((m, l), name='b')
+    k = tvm.reduce_axis((0, l), name='k')
+    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
+    Ab = tvm.decl_buffer(a.shape, a.dtype,
+                         name="A",
+                         offset_factor=1,
+                         strides=[1])
+    Bb = tvm.decl_buffer(b.shape, b.dtype,
+                         name="B",
+                         offset_factor=1,
+                         strides=[tvm.var("s1"), 1])
+    Cb = tvm.decl_buffer(c.shape, c.dtype,
+                         name="C",
+                         offset_factor=1,
+                         strides=[1])
+    def intrin_func(ins, outs):
+        ib = tvm.ir_builder.create()
+        aa, bb = ins
+        cc = outs[0]
+        ib.emit(tvm.call_extern("int32", "gemv_update",
+                                cc.access_ptr("w"),
+                                aa.access_ptr("r"),
+                                bb.access_ptr("r"),
+                                m, l, bb.strides[0]))
+        return ib.get()
+    with tvm.build_config(offset_factor=1):
+        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
+
+######################################################################
+# Here :code:`tvm.decl_tensor_intrin` declares how to execute the computation :code:`c.op`.
+# Our implementation simply takes the inputs and outputs,
+# converts them to pointers and emit an external function call.
+# Note that tensorization requires user to specify :code:`offset_factor`,
+# with this information, TVM has knowledge of whether the data is aligned
+# between the start address of the original data structure
+# and the offset being passed to tensorize,
+# so that it has chance to optimize with vectorized loading.
+# We set the factor to 1 for simplification.
+#
+# Buffers are also declared for inputs and outputs, though this is not required,
+# we benefit from the extra information provided by buffers. For example, we pass
+# :code:`bb.strides[0]` as an argument to the external function :code:`gemv_update`.
+# For now :code:`bb.strides[0] == l`,
+# but later we will see how they can differ with more complicated schedules.
+#
+# Note that we use :code:`tvm.var("s1")` as the first stride dimension for :code:`B`.
+# If the strides can be inferred
+# - in this case, TVM knows tensor B is compact thus the strides are :code:`[L, 1]` -
+# such placeholder can be put to let TVM automatically bind the inferred value for us.
+#
+gemv = intrin_gemv(factor, L)
+s[C].tensorize(yi, gemv)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# By tensorizing over :code:`yi`, the inner most two loops are
+# now replaced by the intrinsic function we defined before.
+# In order to build and run the module, let's define the external function :code:`gemv_update`,
+# it is a naive implementation of GEMV, just for demonstration.
+#
+def gemv_impl():
+    cc_code = """
+      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
+        for (int i = 0; i < m; ++i) {
+            for (int j = 0; j < l; ++j) {
+                cc[i] += aa[j] * bb[i * stride + j];
+            }
+        }
+        return 0;
+      }
+    """
+    from tvm.contrib import util, clang
+    temp = util.tempdir()
+    ll_path = temp.relpath("temp.ll")
+    # Create LLVM ir from c source code
+    ll_code = clang.create_llvm(cc_code, output=ll_path)
+    return ll_code
+
+######################################################################
+# Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline.
+# The importing needs to happen before the tensorized GEMV being executed.
+#
+s[C].pragma(x, "import_llvm", gemv_impl())
+func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
+
+from topi.util import get_const_tuple
+dtype = A.dtype
+ctx = tvm.context("cpu", 0)
+a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
+b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
+c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
+func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
+np.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+
+######################################################################
+# We compare the tensorize version with that :code:`numpy.dot` produces,
+# ensure our implementation is correct.
+#
+# Reduce-update for Tensorize
+# ------------------------------------
+# Let's then move one step forward.
+# Assume our accelerator could only multiply a vector by a square matrix,
+# in which the vector size needs to be no larger than 16.
+# Given such hardware constrain, now we need to split the reduce axis as following,
+#
+zo, zi = s[C].split(z, factor=factor)
+s[C].reorder(x, yo, zo, yi, zi)
+
+######################################################################
+# However, since the tensorize intrinsic now only covers a part of the reduce axis,
+# instead of using one "body" function, TVM requires a :code:`reduce_reset` function,
+# which will be invoked before the reduce for-loop, and a :code:`reduce_update` function,
+# which defines the "update" computing strategy.
+#
+def gemv_impl():
+    cc_code = """
+      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
+        for (int i = 0; i < m; ++i) {
+            for (int j = 0; j < l; ++j) {
+                cc[i] += aa[j] * bb[i * stride + j];
+            }
+        }
+        return 0;
+      }
+      extern "C" int gemv_reset(float *cc, int m) {
+        for (int i = 0; i < m; ++i) {
+            cc[i] = 0.0;
+        }
+        return 0;
+      }
+    """
+    from tvm.contrib import util, clang
+    temp = util.tempdir()
+    ll_path = temp.relpath("temp.ll")
+    # Create LLVM ir from c source code
+    ll_code = clang.create_llvm(cc_code, output=ll_path)
+    return ll_code
+
+def intrin_gemv(m, l):
+    a = tvm.placeholder((l,), name='a')
+    b = tvm.placeholder((m, l), name='b')
+    k = tvm.reduce_axis((0, l), name='k')
+    c = tvm.compute((m,), lambda i:
+    tvm.sum(a[k] * b[i, k], axis=k), name='c')
+    Ab = tvm.decl_buffer(a.shape, a.dtype,
+                         name="A",
+                         offset_factor=1,
+                         strides=[1])
+    Bb = tvm.decl_buffer(b.shape, b.dtype,
+                         name="B",
+                         offset_factor=1,
+                         strides=[tvm.var("s1"), 1])
+    Cb = tvm.decl_buffer(c.shape, c.dtype,
+                         name="C",
+                         offset_factor=1,
+                         strides=[1])
+    def intrin_func(ins, outs):
+        aa, bb = ins
+        cc = outs[0]
+        def _body():
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern("int32", "gemv_update",
+                                    cc.access_ptr("w"),
+                                    aa.access_ptr("r"),
+                                    bb.access_ptr("r"),
+                                    m, l, bb.strides[0]))
+            return ib.get()
+        def _reduce_reset():
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m))
+            return ib.get()
+        def _reduce_update():
+            return _body()
+        return _body(), _reduce_reset(), _reduce_update()
+    with tvm.build_config(offset_factor=1):
+        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
+
+######################################################################
+# Note that :code:`intrin_func` now returns a triplet:
+# :code:`(body, reduce_reset, reduce_update)`.
+# If tensorization includes all the reduce axes, function :code:`body()` will be invoked,
+# otherwise :code:`reduce_reset()` and :code:`reduce_update()` together will be used.
+# In our example :code:`body()` and :code:`reduce_update()`
+# share the same implementation,
+# while in other cases, hardware may have different instructions for these two functions.
+# Moreover, we can see now :code:`bb.strides[0]` is different from :code:`l`
+# due to the tiling.
+#
+# Tensorize for squared GEMV, build and check the results,
+#
+gemv = intrin_gemv(factor, factor)
+s[C].tensorize(yi, gemv)
+s[C].pragma(yo, "import_llvm", gemv_impl())
+
+func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
+a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
+b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
+c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
+func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
+np.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+
+######################################################################
+# Summary
+# -------
+# This tutorial demonstrates the usage of tensorize intrinsic in TVM.
+# Tensorize provides a way for users to get fully optimized schedule via micro-kernels.
+# For example, INT8 quantization on Intel CPUs uses tensorization
+# to invoke AVX instruction directly.
+# It also enables TVM to compile to ASICs -
+# checkout `VTA <https://docs.tvm.ai/vta/index.html>`_ for details.
+# We also demonstrates how to use inline assembly importing,
+# which helps users inject asm easily into the schedule.
+#

From c5cff989c044769dc78888a66fa1a6ea9f8ba9d3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 3 Oct 2018 21:58:25 -0700
Subject: [PATCH 160/529] [RELAY][OP] conv2d, ShapeExpr->IndexExpr (#1798)

---
 include/tvm/attrs.h                       |  16 +
 include/tvm/packed_func_ext.h             |   2 +-
 include/tvm/relay/attrs/nn.h              |  72 +++
 include/tvm/relay/base.h                  |   2 +-
 include/tvm/relay/expr.h                  |   4 +-
 include/tvm/relay/type.h                  |  10 +-
 include/tvm/runtime/packed_func.h         |  12 +
 python/tvm/relay/__init__.py              |   1 +
 python/tvm/relay/env.py                   |   6 +-
 python/tvm/relay/ir_pass.py               |  22 +-
 python/tvm/relay/op/__init__.py           |   2 +
 python/tvm/relay/op/nn.py                 |  54 +++
 src/op/compute_op.cc                      |   3 +
 src/op/extern_op.cc                       |   3 +
 src/op/scan_op.cc                         |   3 +
 src/pass/ir_deep_compare.cc               |  13 +
 src/relay/ir/type.cc                      |   4 +-
 src/relay/op/nn/convolution.cc            | 158 +++++++
 src/relay/op/nn/layout.h                  | 538 ++++++++++++++++++++++
 src/relay/op/type_relations.cc            |   6 +-
 src/relay/pass/alpha_eq.cc                |  21 +-
 src/relay/pass/type_infer.cc              |  10 +-
 src/relay/pass/type_solver.cc             |   9 +-
 tests/python/relay/test_debug_printer.py  |   1 -
 tests/python/relay/test_op_level2.py      |  62 +++
 tests/python/relay/test_pass_alpha_eq.py  |  17 +
 tests/python/relay/test_type_infer.py     |  23 +-
 topi/include/topi/detail/constant_utils.h |   2 +
 28 files changed, 1039 insertions(+), 37 deletions(-)
 create mode 100644 include/tvm/relay/attrs/nn.h
 create mode 100644 python/tvm/relay/op/nn.py
 create mode 100644 src/relay/op/nn/convolution.cc
 create mode 100644 src/relay/op/nn/layout.h
 create mode 100644 tests/python/relay/test_op_level2.py
 create mode 100644 tests/python/relay/test_pass_alpha_eq.py

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 7071dad07214..9e56b45932dc 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -56,6 +56,22 @@ namespace tvm {
   __fvisit__(#FieldName, &FieldName)
 
 
+/*!
+ * \brief Create a NodeRef type that represents null.
+ * \tparam TNodeRef the type to be created.
+ * \return A instance that will represent None.
+ */
+template<typename TNodeRef>
+inline TNodeRef NullValue() {
+  return TNodeRef(NodePtr<Node>(nullptr));
+}
+
+template<>
+inline Type NullValue<Type>() {
+  return Type(Type::Handle, 0, 0);
+}
+
+
 /*! \brief Error thrown during attribute checking. */
 struct AttrError : public dmlc::Error {
   /*!
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 8528eeaa5fa3..0491f3057815 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -114,7 +114,7 @@ inline TNodeRef TVMArgValue::AsNodeRef() const {
   static_assert(
       std::is_base_of<NodeRef, TNodeRef>::value,
       "Conversion only works for NodeRef");
-  if (type_code_ == kNull) return TNodeRef();
+  if (type_code_ == kNull) return TNodeRef(NodePtr<Node>(nullptr));
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
   NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
new file mode 100644
index 000000000000..b364079f06fc
--- /dev/null
+++ b/include/tvm/relay/attrs/nn.h
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/nn.h
+ * \brief Auxiliary attributes for nn operators.
+ */
+#ifndef TVM_RELAY_ATTRS_NN_H_
+#define TVM_RELAY_ATTRS_NN_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes used in convolution operators */
+struct ConvAttrs : public tvm::AttrsNode<ConvAttrs> {
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> dilation;
+  int groups;
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  std::string data_layout;
+  std::string weight_layout;
+  std::string out_layout;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(ConvAttrs, "relay.attrs.ConvAttrs") {
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+        .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                  "on both sides for padding number of points");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+        .describe("Controls the connections between inputs and outputs."
+                  "At groups=1, all inputs are convolved to all outputs."
+                  "At groups=2, the operation becomes equivalent to having two convolution"
+                  "layers side by side, each seeing half the input channels, and producing"
+                  "half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(channels)
+        .describe("The number of output channels in the convolution."
+                  " If it is not set, inferred by shape of the weight.")
+        .set_default(NullValue<IndexExpr>());
+    TVM_ATTR_FIELD(kernel_size)
+        .describe("Specifies the dimensions of the convolution window.")
+        .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Convolution is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(weight_layout).set_default("OIHW")
+        .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                  "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                  "dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("__undef__")
+        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Default to be same as input layout.");
+
+    // use 0 bits to indicate none.
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(Int(0))
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_NN_H_
diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index 48fd59c19793..4ae35f585c6f 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -37,7 +37,7 @@ using DataType = ::tvm::Type;
 /*!
  * \brief Symbolic expression for tensor shape.
  */
-using ShapeExpr = ::tvm::Expr;
+using IndexExpr = ::tvm::Expr;
 
 /*!
  * \brief Hash function for nodes.
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 55080c0fddd9..909b702bc1a1 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -286,7 +286,9 @@ class CallNode : public ExprNode {
     v->Visit("_checked_type_", &checked_type_);
   }
 
-  TVM_DLL static Call make(Expr op, Array<Expr> args, Attrs attrs = Attrs(),
+  TVM_DLL static Call make(Expr op,
+                           Array<Expr> args,
+                           Attrs attrs = Attrs(),
                            Array<Type> ty_args = Array<Type>());
 
   static constexpr const char* _type_key = "relay.Call";
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index ac4c0ec747af..4a187824f7f7 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -70,9 +70,9 @@ class TensorTypeNode : public BaseTensorTypeNode {
  public:
   /*!
    * \brief The shape of the tensor,
-   *  represented by ShapeExpr(tvm::Expr).
+   *  represented by IndexExpr(tvm::Expr).
    */
-  Array<ShapeExpr> shape;
+  Array<IndexExpr> shape;
   /*! \brief The content data type */
   DataType dtype;
 
@@ -82,7 +82,7 @@ class TensorTypeNode : public BaseTensorTypeNode {
     v->Visit("span", &span);
   }
 
-  TVM_DLL static TensorType make(Array<ShapeExpr> shape, DataType dtype);
+  TVM_DLL static TensorType make(Array<IndexExpr> shape, DataType dtype);
 
   /*! \brief Construct an scalar containing elements of dtype.  */
   TVM_DLL static TensorType Scalar(DataType dtype);
@@ -273,8 +273,10 @@ class TypeReporterNode : public Node {
    * \brief assert shape expression equals each other.
    * \param lhs The left operand.
    * \param rhs The right operand.
+   * \return false if assertation can be proven to have failed
+   *      true if solver can still proceed.
    */
-  TVM_DLL virtual void AssertEQ(const ShapeExpr& lhs, const ShapeExpr& rhs) = 0;
+  TVM_DLL virtual bool AssertEQ(const IndexExpr& lhs, const IndexExpr& rhs) = 0;
 
   // solver is not serializable.
   void VisitAttrs(tvm::AttrVisitor* v) final {}
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 2ef4b0a64d3f..d204f8624a64 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -521,6 +521,12 @@ class TVMArgValue : public TVMPODValue_ {
     if (type_code_ == kStr) {
       return String2TVMType(operator std::string());
     }
+    // None type
+    if (type_code_ == kNull) {
+      TVMType t;
+      t.code = kHandle; t.bits = 0; t.lanes = 0;
+      return t;
+    }
     TVM_CHECK_TYPE_CODE(type_code_, kTVMType);
     return value_.v_type;
   }
@@ -878,6 +884,7 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
 #endif
 
 inline std::string TVMType2String(TVMType t) {
+  if (t.bits == 0) return "";
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
   std::ostringstream os;
   os << t;
@@ -896,6 +903,11 @@ inline std::string TVMType2String(TVMType t) {
 
 inline TVMType String2TVMType(std::string s) {
   TVMType t;
+  // handle None type
+  if (s.length() == 0) {
+    t.bits = 0; t.lanes = 0; t.code = kHandle;
+    return t;
+  }
   t.bits = 32; t.lanes = 1;
   const char* scan;
   if (s.substr(0, 3) == "int") {
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index ef35962b41f2..7add619c203c 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -9,6 +9,7 @@
 # Operators
 from .op import Op
 from .op.tensor import *
+from .op import nn
 
 # Span
 Span = base.Span
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/env.py
index 62afef76425a..8dd95d39b327 100644
--- a/python/tvm/relay/env.py
+++ b/python/tvm/relay/env.py
@@ -11,17 +11,19 @@ class Environment(NodeBase):
     options and more.
     """
 
-    def __init__(self, funcs):
+    def __init__(self, funcs=None):
         """Construct an environment.
 
         Parameters
         ------
-        funcs: list of relay.Function
+        funcs : optional, dict
+            Map of global var to Function
 
         Returns
         ------
         env: A new environment containing :py:class:`~relay.env.Environment`.
         """
+        funcs = funcs if funcs else {}
         self.__init_handle_by_constructor__(_make.Environment, funcs)
 
     def add(self, var, func):
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 339b9f74d8d4..78cc5027c32c 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -6,10 +6,26 @@
 them in Python.
 """
 from . import _ir_pass
-
-# Expose checking expression, should rename to infer_type.
 # pylint: disable=invalid-name
-check_expr = _ir_pass.check_expr
+
+def infer_type(env, expr):
+    """Infer the type of expr under the context of env
+
+    Parameters
+    ----------
+    env : relay.Environment
+        The global environmemt.
+
+    expr : relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    checked_expr : relay.Expr
+         The checked expression.
+    """
+    return _ir_pass.infer_type(env, expr)
+
 
 well_formed = _ir_pass.well_formed
 
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 0646a8326db6..4e6314001394 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -5,6 +5,8 @@
 
 # Operators
 from .tensor import *
+from . import nn
+
 
 # operator registry
 from . import _tensor
diff --git a/python/tvm/relay/op/nn.py b/python/tvm/relay/op/nn.py
new file mode 100644
index 000000000000..9d1714a82c67
--- /dev/null
+++ b/python/tvm/relay/op/nn.py
@@ -0,0 +1,54 @@
+"""Neural network operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+
+def conv2d(data,
+           weight,
+           strides=(1, 1),
+           padding=(0, 0),
+           dilation=(1, 1),
+           groups=1,
+           channels=None,
+           kernel_size=None,
+           data_layout="NCHW",
+           weight_layout="OIHW",
+           out_layout="",
+           out_dtype=""):
+    """Two dimensional convolution operator.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    weight : relay.Expr
+        The weight expressions.
+
+    strides : tuple of int, optional
+        The strides of convoltution.
+
+    padding : tuple of int, optional
+        The padding of convolution on both sides of inputs.
+
+    dilation : tuple of int, optional
+        Specifies the dilation rate to be used for dilated convolution.
+
+    groups : int, optional
+        Number of groups for grouped convolution.
+
+    data_layout : str, optional
+        Layout of the input.
+
+    weight_layout : str, optional
+        Layout of the weight.
+
+    out_layout : str, optional
+        Layout of the output.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+    """
+    return _make.conv2d(data, weight, strides, padding, dilation,
+                        groups, channels, kernel_size, data_layout,
+                        weight_layout, out_layout, out_dtype)
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 6100c957e473..daafac21b180 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -117,6 +117,9 @@ Operation ComputeOpNode::make(std::string name,
                               Map<std::string, NodeRef> attrs,
                               Array<IterVar> axis,
                               Array<Expr> body) {
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
   auto n = make_node<ComputeOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc
index 952e52a852bd..cc6d57092f2a 100644
--- a/src/op/extern_op.cc
+++ b/src/op/extern_op.cc
@@ -43,6 +43,9 @@ Operation ExternOpNode::make(std::string name,
                              Array<Buffer> input_placeholders,
                              Array<Buffer> output_placeholders,
                              Stmt body) {
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
   auto n = make_node<ExternOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc
index 60369aaabb33..610d4619390d 100644
--- a/src/op/scan_op.cc
+++ b/src/op/scan_op.cc
@@ -51,6 +51,9 @@ Operation ScanOpNode::make(std::string name,
                            Array<Tensor> update,
                            Array<Tensor> state_placeholder,
                            Array<Tensor> inputs) {
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
   auto n = make_node<ScanOpNode>();
   CHECK_EQ(init.size(), update.size());
   CHECK_EQ(init.size(), state_placeholder.size());
diff --git a/src/pass/ir_deep_compare.cc b/src/pass/ir_deep_compare.cc
index 8a1b09e49339..2c0168ea5460 100644
--- a/src/pass/ir_deep_compare.cc
+++ b/src/pass/ir_deep_compare.cc
@@ -418,6 +418,19 @@ bool Equal(const Stmt& lhs, const Stmt& rhs) {
 }
 
 bool Equal(const Expr& lhs, const Expr& rhs) {
+  // quick pass for constant expressions.
+  if (const int64_t *a = as_const_int(lhs)) {
+    if (const int64_t *b = as_const_int(rhs)) {
+      return a[0] == b[0];
+    }
+  }
+  if (!lhs.defined()) {
+    if (rhs.defined()) return false;
+    if (!rhs.defined()) return true;
+  } else {
+    if (!rhs.defined()) return false;
+  }
+  // deep comparison.
   return IRDeepCompare().Equal(lhs, rhs);
 }
 
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index 7d96d04cb514..a5af539947f0 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -11,7 +11,7 @@ namespace relay {
 using tvm::IRPrinter;
 using namespace tvm::runtime;
 
-TensorType TensorTypeNode::make(Array<ShapeExpr> shape, DataType dtype) {
+TensorType TensorTypeNode::make(Array<IndexExpr> shape, DataType dtype) {
   NodePtr<TensorTypeNode> n = make_node<TensorTypeNode>();
   n->shape = std::move(shape);
   n->dtype = std::move(dtype);
@@ -24,7 +24,7 @@ TensorType TensorTypeNode::Scalar(DataType dtype) {
 
 TVM_REGISTER_API("relay._make.TensorType")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-  Array<ShapeExpr> shape = args[0];
+  Array<IndexExpr> shape = args[0];
   *ret = TensorTypeNode::make(shape, args[1]);
 });
 
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
new file mode 100644
index 000000000000..f61a34151408
--- /dev/null
+++ b/src/relay/op/nn/convolution.cc
@@ -0,0 +1,158 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file convolution.cc
+ * \brief Convolution operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <vector>
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(ConvAttrs);
+
+bool Conv2DRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+  static const Layout kOIHW("OIHW");
+
+  const ConvAttrs* param = attrs.as<ConvAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->data_layout);
+  const Layout kernel_layout(param->weight_layout);
+  CHECK(in_layout.convertible(kNCHW))
+    << "Conv only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+  CHECK(kernel_layout.convertible(kOIHW))
+    << "Conv only support kernel layouts that are convertible from OIHW."
+    << " But got "<< kernel_layout;
+
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) out_layout = in_layout;
+  CHECK(out_layout.convertible(kNCHW))
+      << "Conv only support output layouts that are convertible from NCHW."
+      << " But got " << out_layout;
+
+  IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
+  // infer weight if the kernel_size and channels are defined
+  if (param->kernel_size.defined() && param->channels.defined()) {
+    CHECK_EQ(param->kernel_size.size(), 2);
+    CHECK_EQ(param->dilation.size(), 2);
+    std::vector<IndexExpr> wshape(
+        {param->channels / param->groups,
+              data->shape[1] / param->groups,
+              param->kernel_size[0],
+              param->kernel_size[1]});
+    wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
+    wshape[kernel_layout.indexof('O')] *= param->groups;
+    channels = param->channels;
+    dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
+    // assign result to reporter
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+  } else {
+    // use weight to infer the conv shape.
+    if (weight == nullptr) return false;
+    auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW);
+    if (param->kernel_size.defined()) {
+      CHECK_EQ(param->kernel_size.size(), 2);
+      // check the size
+      CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) &&
+            reporter->AssertEQ(param->kernel_size[1], wshape[3]))
+          << "Conv2D: shape of weight is inconsistent with kernel_size, "
+          << " kernel_size=" << param->kernel_size
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    if (param->channels.defined()) {
+      CHECK(reporter->AssertEQ(param->channels, wshape[0]))
+          << "Conv2D: shape of weight is inconsistent with channels, "
+          << " channels=" << param->channels
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    CHECK(reporter->AssertEQ(data->shape[1] / param->groups, wshape[1]));
+    channels = wshape[0];
+    dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1];
+  }
+  // dilation
+  std::vector<IndexExpr> oshape({data->shape[0], channels, 0, 0});
+
+  oshape[2] = (data->shape[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1;
+  oshape[3] = (data->shape[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1;
+  DataType out_dtype = param->out_dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = data->dtype;
+  }
+  oshape = ConvertLayout(oshape, kNCHW, out_layout);
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+
+// Positional relay function to create conv2d operator
+// used by frontend FFI.
+Expr MakeConv2D(Expr data,
+                Expr weight,
+                Array<IndexExpr> strides,
+                Array<IndexExpr> padding,
+                Array<IndexExpr> dilation,
+                int groups,
+                IndexExpr channels,
+                Array<IndexExpr> kernel_size,
+                std::string data_layout,
+                std::string weight_layout,
+                std::string out_layout,
+                DataType out_dtype) {
+  auto attrs = make_node<ConvAttrs>();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->groups = groups;
+  attrs->channels = channels;
+  attrs->kernel_size = kernel_size;
+  attrs->data_layout = std::move(data_layout);
+  attrs->weight_layout = std::move(weight_layout);
+  attrs->out_layout = std::move(out_layout);
+  attrs->out_dtype = std::move(out_dtype);
+  static const Op& op = Op::Get("conv2d");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op._make.conv2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 12>(MakeConv2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("conv2d")
+.describe(R"code(2D convolution layer (e.g. spatial convolution over images).
+
+This layer creates a convolution kernel that is convolved
+with the layer input to produce a tensor of outputs.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, in_channels, height, width) if `layout` is `NCHW`.
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
+- **out**:  This depends on the `layout` parameter. Output is 4D array of shape
+            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("weight", "Tensor", "The weight tensor.")
+.set_support_level(2)
+.add_type_rel("Conv2D", Conv2DRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/layout.h b/src/relay/op/nn/layout.h
new file mode 100644
index 000000000000..b1dc4a71af1c
--- /dev/null
+++ b/src/relay/op/nn/layout.h
@@ -0,0 +1,538 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/op/nn/layout.h
+ * \brief Layout expression.
+ *
+ *  This file is adapted from its nnvm counterpart and will keep involving
+ *  to the new layout system
+ *
+ *  The layout is composed of upper cases, lower cases and numbers,
+ *  where upper case indicates a (super-)dimension and
+ *  the corresponding lower case with factor size indicates the split (sub-)dimension.
+ *  For example, NCHW16c can describe a 5-D tensor of
+ *  [batch_size, channel, height, width, channel_block].
+ *  Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
+ */
+#ifndef TVM_RELAY_OP_NN_LAYOUT_H_
+#define TVM_RELAY_OP_NN_LAYOUT_H_
+
+#include <string>
+#include <sstream>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief layout auxiliary structure */
+class Layout {
+ public:
+  using LayoutDim = char;
+
+  /*! \brief default constructor */
+  Layout() : name_("__undef__") {} // NOLINT(*)
+
+  /*!
+   * \brief construct from a string.
+   * \param layout input in layout convention:
+   *        upper case indicates a dimension and
+   *        the corresponding lower case with factor size
+   *        indicates the split dimension.
+   *        return undefined layout if "__undef__" is passed.
+   */
+  Layout(const std::string& layout) { // NOLINT(*)
+    if (layout.length() != 0) {
+      parse(layout);
+    } else {
+      parse("__undef__");
+    }
+  }
+  /*!
+   * \brief copy constructor from another layout
+   * \param s the source layout
+   */
+  Layout(const Layout& s) { // NOLINT(*)
+    this->parse(s.name_);
+  }
+  /*!
+   * \brief move constructor from Layout
+   * \param src the source layout
+   */
+  Layout(Layout&& src) { // NOLINT(*)
+    this->swap(src);
+  }
+  /*!
+   * \brief assignment from another layout.
+   * \param src source layout
+   * \return reference of self
+   */
+  Layout& operator=(const Layout& src) {
+    this->parse(src.name_);
+    return *this;
+  }
+  /*!
+   * \brief assignment from rvalue of another layout.
+   * \param src source layout
+   * \return reference of self
+   */
+  Layout& operator=(Layout&& src) {
+    Layout(std::move(src)).swap(*this); // NOLINT(*)
+    return *this;
+  }
+  /*!
+   * \brief assignment from string.
+   * \param src source layout
+   * \return reference of self
+   */
+  Layout& operator=(const std::string& src) {
+    this->parse(src);
+    return *this;
+  }
+  /*!
+   * \return whether two layout equals
+   * \param s the layout to compare against
+   */
+  bool operator==(const Layout& s) const {
+    return name_ == s.name_;
+  }
+  /*!
+   * \return whether two layout not equal
+   * \param s the layout to compare against
+   */
+  bool operator!=(const Layout& s) const {
+    return !(*this == s);
+  }
+
+  /*!
+   * \brief Append the current layout by another.
+   * @param other the layout to be appended
+   * @return a new layout
+   */
+  Layout operator+(const Layout& other) const {
+    if (!this->defined() && !other.defined()) {
+      return Layout::Undef();
+    } else if (!this->defined()) {
+      return other;
+    } else if (!other.defined()) {
+      return *this;
+    }
+    return Layout(this->name_ + other.name_);
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a super-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a super-dimension.
+   */
+  static bool is_superdim(LayoutDim dim) {
+    return dim >= 'A' && dim <= 'Z';
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a sub-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a sub-dimension.
+   */
+  static bool is_subdim(LayoutDim dim) {
+    return dim >= 'a' && dim <= 'z';
+  }
+
+  /*!
+   * \brief Convert a given dimension to super-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim to_superdim(LayoutDim dim) {
+    if (is_subdim(dim)) {
+      return dim - 'a' + 'A';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Convert a given dimension to sub-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim to_subdim(LayoutDim dim) {
+    if (is_superdim(dim)) {
+      return dim - 'A' + 'a';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Return an undefined layout.
+   * \return a (global) undefined layout.
+   */
+  static const Layout& Undef() {
+    static Layout undef;
+    return undef;
+  }
+
+  /*!
+   * \brief Swap current object with other
+   * \param other another object to be swapped.
+   */
+  void swap(Layout& other) {  // NOLINT(*)
+    std::swap(name_, other.name_);
+    std::swap(superdim_pos_, other.superdim_pos_);
+    std::swap(subdim_pos_, other.subdim_pos_);
+    std::swap(subdim_size_, other.subdim_size_);
+    std::swap(layout_simplified_, other.layout_simplified_);
+  }
+
+  /*!
+   * \brief Two layouts are convertible only if
+   *        they have same set of super-dimensions.
+   *        e.g., NCHW, NCHW16c, NHWC are convertible between each other,
+   *        but NCHW, CHW, OIHW are not.
+   * \param dst the target layout
+   * \return Whether can be converted to dst layout.
+   */
+  bool convertible(const Layout &dst) const {
+    if (!this->defined() || !dst.defined()) return false;
+    for (size_t i = 0; i < kUniqueDim; ++i) {
+      if ((superdim_pos_[i] >= 0 && dst.superdim_pos_[i] < 0) ||
+          (superdim_pos_[i] < 0 && dst.superdim_pos_[i] >= 0)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Returns a sublayout which is the portion of the object
+   *        that starts at dimension \p pos and spans \p len dimensions
+   *        (or until the end of the layout, whichever comes first).
+   * \param pos The start position.
+   * \param len The length of the sub-layout.
+   * \return A newly constructed Layout object.
+   */
+  Layout sublayout(size_t pos, size_t len) const {
+    if (pos > ndim()) return Layout::Undef();
+    if (pos + len > ndim()) len = ndim() - pos;
+    if (len == 0) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (size_t i = pos; i < pos + len; ++i) {
+      if (is_subdim(layout_simplified_[i])) {
+        auto block_size = this->subsizeof(layout_simplified_[i]);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << layout_simplified_[i];
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*! \return A newly constructed reversed Layout object. */
+  Layout reverse() const {
+    if (!this->defined()) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (int64_t i = this->ndim() - 1; i >= 0; --i) {
+      if (is_subdim(layout_simplified_[i])) {
+        auto block_size = this->subsizeof(layout_simplified_[i]);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << layout_simplified_[i];
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*!
+   * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos.
+   * \param dim The source dimension to be split. It must be a super-dimension.
+   * \param target_pos The target position of the newly split sub-dimension.
+   * \param size size of the sub-dimension.
+   * \return A newly constructed Layout object.
+   */
+  Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const {
+    CHECK(target_pos <= this->ndim()) << "Invalid split position "
+                                      << target_pos << " for layout " << name_;
+    CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim;
+    CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_;
+    CHECK(!this->contains(to_subdim(dim))) << "Dimension " << dim
+                                           << " has already been split in "
+                                           << name_;
+    CHECK(size > 0) << "Invalid split size " << size;
+    std::ostringstream new_layout;
+    for (size_t i = 0; i <= this->ndim(); ++i) {
+      if (i == target_pos) {
+        new_layout << size << Layout::to_subdim(dim);
+      }
+      if (i == this->ndim()) break;
+      new_layout << this->at(i);
+    }
+    Layout x(new_layout.str());
+    return x;
+  }
+
+  using iterator = std::vector<LayoutDim>::const_iterator;
+  using reverse_iterator = std::vector<LayoutDim>::const_reverse_iterator;
+
+  /*! \return begin iterator */
+  iterator begin() const {
+    return layout_simplified_.begin();
+  }
+  /*! \return end iterator */
+  iterator end() const {
+    return layout_simplified_.end();
+  }
+  /*! \return rbegin iterator */
+  reverse_iterator rbegin() const {
+    return layout_simplified_.rbegin();
+  }
+  /*! \return rend iterator */
+  reverse_iterator rend() const {
+    return layout_simplified_.rend();
+  }
+
+  /*! \return number of dimensions */
+  size_t ndim() const {
+    return layout_simplified_.size();
+  }
+
+  /*!
+   * \brief The description of the \p i-th dimension.
+   *        If it is a sub-dimension, the size will be returned as well,
+   *        e.g., 16c. Otherwise a single character is returned, e.g., C.
+   * \param i The position
+   * \return the description of the dimension.
+   */
+  std::string at(size_t i) const {
+    CHECK_LT(i, this->ndim()) << "position " << i
+                              << " exceeds ndim=" << this->ndim();
+    std::ostringstream repr;
+    if (is_subdim(layout_simplified_[i])) {
+      auto factor = subsizeof(layout_simplified_[i]);
+      CHECK_GT(factor, 0);
+      repr << factor;
+    }
+    repr << layout_simplified_[i];
+    return repr.str();
+  }
+
+  /*!
+   * \brief return the index of the input dimension.
+   *        If it is not found in the layout or the layout is undefined,
+   *        return -1.
+   * \param dim the input dimension.
+   * \return the index or -1 if not found.
+   */
+  int32_t indexof(LayoutDim dim) const {
+    if (!this->defined()) return -1;
+    else if (is_superdim(dim)) return superdim_pos_[dim - 'A'];
+    else if (is_subdim(dim)) return subdim_pos_[dim - 'a'];
+    return -1;
+  }
+
+  /*!
+   * \param dim the input super-dimension or sub-dimension.
+   * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension),
+   *         or the size of \p dim itself (if \p dim is a sub-dimension).
+   *         Return -1 if \p dim is not in the layout or the layout is undefined.
+   */
+  int64_t subsizeof(LayoutDim dim) const {
+    CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim;
+    if (!this->defined() || !this->contains(to_subdim(dim))) {
+      return -1;
+    }
+    int idx = to_subdim(dim) - 'a';
+    return subdim_size_[idx];
+  }
+
+  /*!
+   * \brief Whether the layout contains a dimension.
+   * \param dim dimension to be checked.
+   * \return Whether the layout contains the dimension.
+   */
+  bool contains(LayoutDim dim) const {
+    if (is_superdim(dim)) {
+      return superdim_pos_[dim-'A'] >= 0;
+    } else if (is_subdim(dim)) {
+      return subdim_pos_[dim-'a'] >= 0;
+    }
+    return false;
+  }
+
+  LayoutDim operator[](size_t i) const {
+    return layout_simplified_[i];
+  }
+
+  /*! \return whether the layout is defined */
+  bool defined() const {
+    return name_ != "__undef__";
+  }
+
+  /*! \return the string description of the layout */
+  const std::string& name() const {
+    return name_;
+  }
+
+  /*!
+   * \brief Write layout in JSON format.
+   * \param writer JSONWriter
+   */
+  void Save(dmlc::JSONWriter* writer) const {
+    writer->Write(name_);
+  }
+
+  /*!
+   * \brief Load layout from JSON.
+   * \param reader JSONReader
+   */
+  void Load(dmlc::JSONReader* reader) {
+    std::string tmp;
+    reader->Read(&tmp);
+    this->parse(tmp);
+  }
+
+  /*!
+   * \brief allow output string of layout to ostream
+   * \param os the output stream
+   * \param l the layout
+   * \return the ostream
+   */
+  friend std::ostream& operator<<(std::ostream& os, const Layout& l) {
+    os << l.name_;
+    return os;
+  }
+
+ private:
+  static const uint32_t kUniqueDim = 26;
+
+  std::string name_;
+  int32_t superdim_pos_[kUniqueDim];
+  int32_t subdim_pos_[kUniqueDim];
+  int64_t subdim_size_[kUniqueDim];
+  std::vector<LayoutDim> layout_simplified_;
+
+  void parse(const std::string& layout) {
+    name_ = layout;
+    std::fill_n(superdim_pos_, kUniqueDim, -1);
+    std::fill_n(subdim_pos_, kUniqueDim, -1);
+    std::fill_n(subdim_size_, kUniqueDim, -1);
+    layout_simplified_.clear();
+
+    if (layout == "__undef__") return;
+
+    int32_t factor = 0;
+    uint32_t curr = 0;
+    for (size_t i = 0; i < layout.size(); ++i) {
+      const LayoutDim c = layout.at(i);
+      if (is_superdim(c)) {
+        int pos = c - 'A';
+        CHECK_EQ(factor, 0) << "Invalid layout " << layout
+                            << ": invalid factor size " << factor
+                            << " before dimension " << c;
+        CHECK_EQ(superdim_pos_[pos], -1) << "Invalid layout " << layout
+                                         << ": duplicate dimension " << c;
+        superdim_pos_[pos] = curr++;
+        layout_simplified_.push_back(c);
+      } else if (is_subdim(c)) {
+        int pos = c - 'a';
+        CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size "
+                            << factor << " for dimension " << c;
+        CHECK_EQ(subdim_pos_[pos], -1) << "Invalid layout " << layout
+                                       << ": duplicate dimension " << c;
+        CHECK_EQ(subdim_size_[pos], -1) << "Invalid layout " << layout
+                                        << ": duplicate dimension " << c;
+        subdim_pos_[pos] = curr++;
+        subdim_size_[pos] = factor;
+        layout_simplified_.push_back(c);
+        factor = 0;
+      } else if (c >= '0' && c <= '9') {
+        CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
+        factor = factor * 10 + c - '0';
+      } else {
+        LOG(FATAL) << "Invalid layout " << layout;
+      }
+    }
+    CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout;
+    for (LayoutDim dim : layout_simplified_) {
+      CHECK(is_superdim(dim) || superdim_pos_[dim-'a'] >= 0)
+        << "Invalid layout " << layout << ": missing axis "
+        << static_cast<char>(dim - 'a' + 'A');
+    }
+  }
+};
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+inline std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  CHECK_EQ(src_layout.ndim(), src.size());
+  if (src_layout == dst_layout) {
+    return src;
+  } else if (!src_layout.defined()) {
+    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
+  } else if (!dst_layout.defined()) {
+    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
+  }
+
+  CHECK(src_layout.convertible(dst_layout))
+      << "cannot convert from "
+      << src_layout << " to " << dst_layout;
+
+  std::vector<IndexExpr> dst(dst_layout.ndim());
+  for (size_t i = 0; i < src_layout.ndim(); ++i) {
+    Layout::LayoutDim src_dim = src_layout[i];
+    if (Layout::is_superdim(src_dim)) {
+      int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_dim));
+      int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_dim));
+      int src_minor_pos = src_layout.indexof(Layout::to_subdim(src_dim));
+      int src_factor = src_layout.subsizeof(src_dim);
+      int dst_factor = dst_layout.subsizeof(src_dim);
+      IndexExpr src_dim_size = src[i];
+
+      if (src_minor_pos >= 0) {
+        const int64_t* minor_size = as_const_int(src[src_minor_pos]);
+        CHECK(minor_size == nullptr &&
+              src_factor == minor_size[0])
+            << "src shape " << Array<IndexExpr>(src)
+            << " does not agree with layout "
+            << src_layout;
+        src_dim_size *= src_factor;
+      }
+      dst[dst_major_pos] = src_dim_size;
+      if (dst_minor_pos >= 0) {
+        CHECK_GT(dst_factor, 0);
+        if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) {
+          CHECK_LE(dst_factor, const_src_dim_size[0])
+              << "Converting " << Array<IndexExpr>(src)
+              << " from " << src_layout
+              << " to " << dst_layout
+              << ": cannot split dimension size of "
+              << src_dim_size << " by " << dst_factor;
+        }
+        dst[dst_major_pos] /= dst_factor;
+        dst[dst_minor_pos] = dst_factor;
+      }
+    }
+  }
+  return dst;
+}
+
+inline std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  std::vector<IndexExpr> ret(src.size());
+  for (size_t i = 0; i < src.size(); ++i) {
+    ret[i] = src[i];
+  }
+  return ConvertLayout(ret, src_layout, dst_layout);
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_NN_LAYOUT_H_
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 58fcc18ad43e..169ef35474e2 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -69,8 +69,8 @@ Type ConcreteBroadcast(const TensorType& t1,
       rev_sh2++;
     }
 
-    Array<ShapeExpr> larger;
-    Array<ShapeExpr> smaller;
+    Array<IndexExpr> larger;
+    Array<IndexExpr> smaller;
 
     for (int i = 0; i < (full_len - suffix_len); i++) {
       smaller.push_back(make_const(tvm::Int(64), 1));
@@ -93,7 +93,7 @@ Type ConcreteBroadcast(const TensorType& t1,
 
     CHECK_EQ(larger.size(), smaller.size());
 
-    Array<ShapeExpr> out_shape;
+    Array<IndexExpr> out_shape;
     for (size_t i = 0; i < smaller.size(); i++) {
       auto left = smaller[i].as<tvm::ir::IntImm>();
       auto right = larger[i].as<tvm::ir::IntImm>();
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index f76da793c503..63ce834be7cf 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -1,8 +1,9 @@
 /*!
  *  Copyright (c) 2018 by Contributors
  * \file src/tvm/relay/pass/alpha_eq.cc
- * \brief Compute the set of variables not bound in the expression.
+ * \brief The structral equivalence comparison.
  */
+#include <tvm/ir_pass.h>
 #include <tvm/relay/expr_functor.h>
 #include "./type_visitor.h"
 #include "tvm/relay/pass.h"
@@ -19,9 +20,23 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
   TypeAlphaEq() : eq_map(), equal(true) {}
 
   void DataTypeEqual(const DataType& dt1, const DataType& dt2) {
-    equal = equal && dt1 == dt2;
+    if (dt1 != dt2) {
+      equal = false;
+    }
+  }
+
+  void ShapeEqual(const Array<IndexExpr>& s1, const Array<IndexExpr>& s2) {
+    if (s1.size() != s2.size()) {
+      equal = false;
+      return;
+    }
+    for (size_t i = 0; i < s1.size(); ++i) {
+      if (!tvm::ir::Equal(s1[i], s2[i])) {
+        equal = false;
+        return;
+      }
+    }
   }
-  void ShapeEqual(Array<ShapeExpr> s1, Array<ShapeExpr> s2) {}
 
   void VisitType_(const TensorTypeNode *tt1, const Type& t2) final {
     if (const TensorTypeNode *tt2 = t2.as<TensorTypeNode>()) {
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index c1ea090e9db9..1e2100fa902e 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -354,8 +354,8 @@ Expr TypeInferencer::Infer(Expr expr) {
   return Resolver(type_map_, &solver_).VisitExpr(expr);
 }
 
-Expr InferType(const Environment& env, const Expr& e) {
-  return TypeInferencer(env).Infer(e);
+Expr InferType(const Environment& env, const Expr& expr) {
+  return TypeInferencer(env).Infer(expr);
 }
 
 Expr InferType(const Environment& env,
@@ -370,11 +370,9 @@ Expr InferType(const Environment& env,
   return func_ret;
 }
 
-TVM_REGISTER_API("relay._ir_pass.check_expr")
+TVM_REGISTER_API("relay._ir_pass.infer_type")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    Environment env = args[0];
-    Expr e = args[1];
-    *ret = InferType(env, e);
+    *ret = InferType(args[0], args[1]);
   });
 
 }  // namespace relay
diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index 6e382a69a988..f21f6a67acf8 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -18,8 +18,13 @@ class TypeSolver::Reporter : public TypeReporterNode {
     solver_->Unify(dst, src);
   }
 
-  void AssertEQ(const ShapeExpr& lhs, const ShapeExpr& rhs) final {
-    // TODO(tqchen)
+  bool AssertEQ(const IndexExpr& lhs, const IndexExpr& rhs) final {
+    // early warning constant case.
+    IndexExpr diff = lhs - rhs;
+    if (const int64_t* pdiff = as_const_int(diff)) {
+      return pdiff[0] == 0;
+    }
+    return true;
   }
 
  private:
diff --git a/tests/python/relay/test_debug_printer.py b/tests/python/relay/test_debug_printer.py
index 867d9bb3791f..2ea0b7575ff8 100644
--- a/tests/python/relay/test_debug_printer.py
+++ b/tests/python/relay/test_debug_printer.py
@@ -8,7 +8,6 @@
 def show(e):
     r = debug_print(ib.env, e)
     assert r is not None
-    # print(r) # uncomment this line to debug
 
 
 def test_constant():
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
new file mode 100644
index 000000000000..d5dd64d76555
--- /dev/null
+++ b/tests/python/relay/test_op_level2.py
@@ -0,0 +1,62 @@
+import tvm
+from tvm import relay
+
+
+def test_conv2d_infer_type():
+    # symbolic in batch dimension
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    w = ib.param("w", relay.ty.IncompleteType())
+
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.conv2d(x.var, w.var,
+                               kernel_size=(3, 3),
+                               padding=(1, 1),
+                               channels=2))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type()
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, 2, 224, 224), "float32")
+    assert ftype.arg_types[1] == relay.ty.TensorType(
+        (2, 10, 3, 3), "float32")
+
+    # infer by shape of w, mixed precision
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
+    w = ib.param("w", relay.ty.TensorType((2, 10, 3, 3), "int8"))
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.conv2d(x.var, w.var, out_dtype="int32"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type()
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, 2, 222, 222), "int32")
+
+    # Infer with a different layout
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = 4, 32, 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
+    w = ib.param("w", relay.ty.IncompleteType())
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.conv2d(x.var, w.var,
+                               kernel_size=(3, 3),
+                               padding=(1, 1),
+                               channels=16,
+                               data_layout="NCHW4n4c",
+                               weight_layout="OIHW4o4i",
+                               out_dtype="int32"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type()
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, 4, 224, 224, 4, 4), "int32")
+    assert ftype.arg_types[1] == relay.ty.TensorType(
+        (4, 8, 3, 3, 4, 4), "int8")
+
+
+
+if __name__ == "__main__":
+    test_conv2d_infer_type()
diff --git a/tests/python/relay/test_pass_alpha_eq.py b/tests/python/relay/test_pass_alpha_eq.py
new file mode 100644
index 000000000000..40140ea486a1
--- /dev/null
+++ b/tests/python/relay/test_pass_alpha_eq.py
@@ -0,0 +1,17 @@
+import tvm
+from tvm import relay
+
+def test_type_alpha_eq():
+    t1 = relay.ty.TensorType((3, 4), "float32")
+    t2 = relay.ty.TensorType((3, 4), "float32")
+    t3 = relay.ty.TensorType((3, 4, 5), "float32")
+    assert t1 == t2
+    assert t1 != t3
+
+    t1 = relay.ty.TensorType((), "float32")
+    t2 = relay.ty.TensorType((), "float32")
+    assert t1 == t2
+
+
+if __name__ == "__main__":
+    test_type_alpha_eq()
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 18cf4b940c4f..5b8375580424 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -3,7 +3,7 @@
 """
 import tvm
 import numpy as np
-from tvm.relay.ir_pass import check_expr
+from tvm.relay.ir_pass import infer_type
 from tvm.relay.ir_builder import IRBuilder, func_type
 from tvm.relay.ir_builder import scalar_type, convert, tensor_type
 from tvm.relay.env import Environment
@@ -11,8 +11,11 @@
 from tvm.relay.expr import Function
 
 def assert_has_type(expr, typ, env=Environment({})):
-    checked_expr = check_expr(env, expr)
-    assert checked_expr.checked_type() == typ
+    checked_expr = infer_type(env, expr)
+    checked_type = checked_expr.checked_type()
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
 
 
 def assert_decl_has_type(env, name, typ):
@@ -47,6 +50,7 @@ def test_add_op():
         }
     """
     b = IRBuilder()
+
     x = b.param('x', tensor_type(5, 5, 5))
     y = b.param('y', tensor_type(5, 5, 5))
     with b.function(x, y) as func:
@@ -71,8 +75,9 @@ def test_add_broadcast_op():
         b.ret(add(x.var, y.var))
     b.ret(func)
     prog, env = b.get()
-    ttype = tensor_type(5, 5, 5)
-    expected_ty = func_type([ttype, ttype], ttype)
+
+    expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
+                            tensor_type(5, 10, 4))
     assert_has_type(func.to_func(), expected_ty)
 
 def test_dual_op():
@@ -89,7 +94,9 @@ def test_dual_op():
         t1 = b.let('t1', log(x))
         t2 = b.let('t2', add(t1, x))
         b.ret(t2)
-    assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
+
+    assert_has_type(func.to_func(),
+                    func_type([tensor_type(10, 10)], tensor_type(10, 10)))
 
 
 def test_decl():
@@ -152,12 +159,12 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     assert_decl_has_type(ib.env, try_concat2, fn_ty)
 
 if __name__ == "__main__":
-    test_recursion()
+    test_dual_op()
 
+    test_recursion()
     test_monomorphic_let()
     test_single_op()
     test_add_op()
     test_add_broadcast_op()
-    test_dual_op()
     test_decl()
     test_concat()
diff --git a/topi/include/topi/detail/constant_utils.h b/topi/include/topi/detail/constant_utils.h
index 343334562349..7ff137418c48 100644
--- a/topi/include/topi/detail/constant_utils.h
+++ b/topi/include/topi/detail/constant_utils.h
@@ -59,6 +59,7 @@ inline int64_t GetConstInt(Expr expr) {
  */
 inline std::vector<int> GetConstIntValues(Array<Expr> exprs, const std::string& var_name) {
   std::vector<int> result;
+  if (!exprs.defined()) return result;
   for (auto expr : exprs) {
     CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers";
     result.push_back(GetConstInt(expr));
@@ -77,6 +78,7 @@ inline std::vector<int> GetConstIntValues(Array<Expr> exprs, const std::string&
  */
 inline std::vector<int64_t> GetConstInt64Values(Array<Expr> exprs, const std::string& var_name) {
   std::vector<int64_t> result;
+  if (!exprs.defined()) return result;
   for (auto expr : exprs) {
     CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers";
     result.push_back(GetConstInt(expr));

From 23e988f37fd3ddaa498be222ac90d91bfaab6813 Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Thu, 4 Oct 2018 10:29:22 +0530
Subject: [PATCH 161/529] Correction in documentation (#1810)

---
 tests/python/contrib/test_cblas.py   | 2 +-
 tests/python/contrib/test_cublas.py  | 2 +-
 tests/python/contrib/test_mps.py     | 4 ++--
 tests/python/contrib/test_nnpack.py  | 8 ++++----
 tests/python/contrib/test_random.py  | 6 +++---
 tests/python/contrib/test_rocblas.py | 2 +-
 topi/include/topi/nn.h               | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py
index f02000ea7e7b..6c9f24711896 100644
--- a/tests/python/contrib/test_cblas.py
+++ b/tests/python/contrib/test_cblas.py
@@ -18,7 +18,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
index c488c8c680e1..40fc20aec049 100644
--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -16,7 +16,7 @@ def verify(target="cuda"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.gpu(0)
         f = tvm.build(s, [A, B, C], target)
diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py
index 25437605525b..6187d01b2ab8 100644
--- a/tests/python/contrib/test_mps.py
+++ b/tests/python/contrib/test_mps.py
@@ -33,7 +33,7 @@ def test_matmul():
 
     def verify(A, B, D, s, target="metal"):
         if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.metal(0)
         f = tvm.build(s, [A, B, D], "metal")
@@ -64,7 +64,7 @@ def test_conv2d():
 
     def verify(A, B, C, target="llvm"):
         if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.metal(0)
         f = tvm.build(s1, [A, B, C], "metal")
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index 2c07afe1ffac..af5c8e5dfa5c 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -19,7 +19,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_output", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
@@ -49,7 +49,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
@@ -128,7 +128,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [data, kernel, bias, output], target)
@@ -173,7 +173,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [data, kernel, bias, output], target)
diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py
index a74273a0ccba..6f846836043e 100644
--- a/tests/python/contrib/test_random.py
+++ b/tests/python/contrib/test_random.py
@@ -13,7 +13,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.randint", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
@@ -37,7 +37,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.uniform", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
@@ -61,7 +61,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.normal", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py
index 46350f4d6625..38b911f106c5 100644
--- a/tests/python/contrib/test_rocblas.py
+++ b/tests/python/contrib/test_rocblas.py
@@ -16,7 +16,7 @@ def verify(target="rocm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.rocm(0)
         f = tvm.build(s, [A, B, C], target)
diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index dbbfecbcc28d..5fc05162f09b 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -64,7 +64,7 @@ inline tvm::Tensor relu(const tvm::Tensor& t,
 * \param name The name of the operation
 * \param tag The tag to mark the operation
 *
-* \return A Tensor whose op member is the relu operation
+* \return A Tensor whose op member is the leaky relu operation
 */
 inline tvm::Tensor leaky_relu(const tvm::Tensor& t,
                               double alpha = 0.1,
@@ -90,7 +90,7 @@ inline tvm::Tensor leaky_relu(const tvm::Tensor& t,
  * \param name The name of the operation
  * \param tag The tag to mark the operation
  *
- * \return A Tensor whose op member is the relu operation
+ * \return A Tensor whose op member is the parametric relu operation
  */
 inline tvm::Tensor prelu(const tvm::Tensor &x,
                          const tvm::Tensor &slope,

From 986e1067ec3f5f8b62070330dcfbb72620dd609f Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Thu, 4 Oct 2018 20:16:57 +0300
Subject: [PATCH 162/529] [TVM] Fix negating undefined in DetectLinearEquation
 (#1816)

---
 src/arithmetic/detect_linear_equation.cc                   | 3 ++-
 tests/python/unittest/test_arith_detect_linear_equation.py | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 4e6d8caf3772..6f4d3cfb53bb 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -111,8 +111,9 @@ class LinearEqDetector
     return ComputeExpr<Add>(a, b);
   }
   Expr SubCombine(Expr a, Expr b) {
-    if (!a.defined()) return -b;
+    // Check b first in case they are both undefined
     if (!b.defined()) return a;
+    if (!a.defined()) return -b;
     return ComputeExpr<Sub>(a, b);
   }
   Expr MulCombine(Expr a, Expr b) {
diff --git a/tests/python/unittest/test_arith_detect_linear_equation.py b/tests/python/unittest/test_arith_detect_linear_equation.py
index 9d875c910d1c..2b0f327b65b2 100644
--- a/tests/python/unittest/test_arith_detect_linear_equation.py
+++ b/tests/python/unittest/test_arith_detect_linear_equation.py
@@ -38,6 +38,10 @@ def test_multivariate():
     assert(m[2].value == 2)
     assert(m[len(m)-1].value == 2)
 
+    m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [v[2]])
+    assert(m[0].value == 0)
+    assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0)
+
 if __name__ == "__main__":
     test_basic()
     test_multivariate()

From f0b99d788ad1cc66321e5851255883c00ecfbcce Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 4 Oct 2018 22:49:54 +0530
Subject: [PATCH 163/529] [DEBUG]Support a debug framework for TVM Runtime
 (#1378)

---
 CMakeLists.txt                                |   3 +
 Jenkinsfile                                   |   4 +
 docs/dev/debugger.rst                         | 155 +++++
 python/tvm/contrib/debugger/__init__.py       |   0
 python/tvm/contrib/debugger/debug_result.py   | 189 +++++++
 python/tvm/contrib/debugger/debug_runtime.py  | 191 +++++++
 python/tvm/contrib/graph_runtime.py           |  34 +-
 src/api/api_base.cc                           |  34 ++
 .../graph/debug/graph_runtime_debug.cc        | 149 +++++
 src/runtime/graph/graph_runtime.cc            | 532 ++++--------------
 src/runtime/graph/graph_runtime.h             | 372 ++++++++++++
 .../unittest/test_runtime_graph_debug.py      |  76 +++
 12 files changed, 1305 insertions(+), 434 deletions(-)
 create mode 100644 docs/dev/debugger.rst
 create mode 100644 python/tvm/contrib/debugger/__init__.py
 create mode 100644 python/tvm/contrib/debugger/debug_result.py
 create mode 100644 python/tvm/contrib/debugger/debug_runtime.py
 create mode 100644 src/runtime/graph/debug/graph_runtime_debug.cc
 create mode 100644 tests/python/unittest/test_runtime_graph_debug.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6669dc3ce42..a47fe1f8b889 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,6 +159,9 @@ if(USE_GRAPH_RUNTIME)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
 
   if(USE_GRAPH_RUNTIME_DEBUG)
+    message(STATUS "Build with Graph runtime debug support...")
+    file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
+    list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
     set_source_files_properties(${RUNTIME_GRAPH_SRCS}
       PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
   endif(USE_GRAPH_RUNTIME_DEBUG)
diff --git a/Jenkinsfile b/Jenkinsfile
index 2ecf3c59f8aa..e12ff3558ed1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -97,6 +97,7 @@ stage('Build') {
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
            echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_BLAS openblas\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
@@ -111,6 +112,7 @@ stage('Build') {
            echo set\\(USE_OPENCL ON\\) >> config.cmake
            echo set\\(USE_ROCM ON\\) >> config.cmake
            echo set\\(USE_VULKAN ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER clang-6.0\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
            """
@@ -127,6 +129,7 @@ stage('Build') {
            cd build
            cp ../cmake/config.cmake .
            echo set\\(USE_SORT ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
@@ -150,6 +153,7 @@ stage('Build') {
            cp ../cmake/config.cmake .
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_RPC ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_LLVM llvm-config-5.0\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst
new file mode 100644
index 000000000000..a2a850a2dde4
--- /dev/null
+++ b/docs/dev/debugger.rst
@@ -0,0 +1,155 @@
+=================
+**Debugger**
+=================
+
+TVM Debugger is an interface for debugging TVM's computation graph execution. It helps to provide access to graph structures and tensor values at the TVM runtime.
+
+*******************************************
+**Debug Exchange Format**
+*******************************************
+
+**1. Computational Graph**
+==========================
+The optimized graph build by nnvm in json
+serialized format is dumped as it is. This contains the whole
+information about the graph. The UX can either use this graph directly
+or transform this graph to the format UX can understand.
+
+The Graph JSON format is explained below
+
+1. ``nodes``
+Nodes are either placeholders or computational nodes in NNVM graph. The nodes are stored
+as a list. A node contains the below information
+
+-     ``op`` - operation type, ``null`` means it is a placeholder/variable/input node and``tvm_op`` means this node can be executed
+-     ``name`` - Name of the node
+-     ``inputs`` - Position of the inputs for this operation, Inputs is a list of tuples with (nodeid, index, version). (Optional)
+-     ``attrs`` - Attributes of the node which contains the following information
+
+    -     ``flatten_data`` - Whether this data need to be flattened before execution
+    -     ``func_name`` - Fused function name, corresponds to the symbol in the lib generated by NNVM compilation process.
+    -     ``num_inputs`` - Number of inputs for this node
+    -     ``num_outputs`` - Number of outputs this node produces
+
+2. ``arg_nodes``
+arg_nodes is a list of indices of nodes which is placeholder/variable/input or constant/param to the graph.
+
+3. ``heads``
+heads is a list of entries as the output of the graph.
+
+4. ``node_row_ptr``
+node\_row\_ptr stores the history of forward path, so you can skip constructing the entire graph in inference tasks.
+
+5. ``attrs``
+attrs can contain version numbers or similar helpful information.
+
+- ``storage_id`` - Memory slot id for each node in the storage layout.
+- ``dtype`` - Datatype of each node (enum value).
+- ``dltype`` - Datatype of each node in order.
+- ``shape`` - Shape of each node k order.
+- ``device_index`` - Device assignment for each entry in the graph.
+
+Example of dumped graph:
+
+::
+
+    {
+      "nodes": [                                    # List of nodes
+        {
+          "op": "null",                             # operation type = null, this is a placeholder/variable/input or constant/param node
+          "name": "x",                              # Name of the argument node
+          "inputs": []                              # inputs for this node, its none since this is an argument node
+        },
+        {
+          "op": "tvm_op",                           # operation type = tvm_op, this node can be executed
+          "name": "relu0",                          # Name of the node
+          "attrs": {                                # Attributes of the node
+            "flatten_data": "0",                    # Whether this data need to be flattened
+            "func_name": "fuse_l2_normalize_relu",  # Fused function name, corresponds to the symbol in the lib generated by NNVM compilation process
+            "num_inputs": "1",                      # Number of inputs for this node
+            "num_outputs": "1"                      # Number of outputs this node produces
+          },
+          "inputs": [[0, 0, 0]]                     # Position of the inputs for this operation
+        }
+      ],
+      "arg_nodes": [0],                             # Which all nodes in this are argument nodes
+      "node_row_ptr": [0, 1, 2],                    # Row indices for faster depth first search
+      "heads": [[1, 0, 0]],                         # Position of the output nodes for this operation
+      "attrs": {                                    # Attributes for the graph
+        "storage_id": ["list_int", [1, 0]],         # memory slot id for each node in the storage layout
+        "dtype": ["list_int", [0, 0]],              # Datatype of each node (enum value)
+        "dltype": ["list_str", [                    # Datatype of each node in order
+            "float32",
+            "float32"]],
+        "shape": ["list_shape", [                   # Shape of each node k order
+            [1, 3, 20, 20],
+            [1, 3, 20, 20]]],
+        "device_index": ["list_int", [1, 1]],       # Device assignment for each node in order
+      }
+    }
+
+**2. Tensor dumping**
+=====================
+
+The tensor received after execution is in ``tvm.ndarray`` type. All the tensors will
+be saved as binary bytes in serialized format.  The result binary bytes can be loaded by the
+API "load_params".
+
+Example of loading the parameters
+   ::
+    with open(path_params, "rb") as fi:
+        loaded_params = bytearray(fi.read())
+
+    module.load_params(loaded_params)
+
+***************************************
+How to use Debugger?
+***************************************
+
+1. In ``config.cmake`` set the ``USE_GRAPH_RUNTIME_DEBUG`` flag to ``ON``
+
+   ::
+
+       # Whether enable additional graph debug functions
+       set(USE_GRAPH_RUNTIME_DEBUG ON)
+
+2. Do 'make' tvm, so that it will make the ``libtvm_runtime.so``
+
+3. In frontend script file instead of
+   ``from tvm.contrib import graph_runtime`` import the
+   ``debug_runtime``
+   ``from tvm.contrib.debugger import debug_runtime as graph_runtime``
+
+::
+
+    from tvm.contrib.debugger import debug_runtime as graph_runtime
+    m = graph_runtime.create(graph, lib, ctx, dump_root="/tmp/tvmdbg")
+    # set inputs
+    m.set_input('data', tvm.nd.array(data.astype(dtype)))
+    m.set_input(**params)
+    # execute
+    m.run()
+    tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+
+The outputs are dumped to a temporary folder in ``/tmp`` folder or the
+folder specified while creating the runtime.
+
+***************************************
+Sample Output
+***************************************
+
+The below is the output of running  ``tvm/nnvm/tutorials/from_onnnx.py`` with debugger.
+
+::
+
+    Node Name               Ops                                                                  Time(us)   Time(%)  Start Time       End Time         Shape                Inputs  Outputs
+    ---------               ---                                                                  --------   -------  ----------       --------         -----                ------  -------
+    1_NCHW1c                fuse___layout_transform___4                                          56.52      0.02     15:24:44.177475  15:24:44.177534  (1, 1, 224, 224)     1       1
+    _contrib_conv2d_nchwc0  fuse__contrib_conv2d_NCHWc                                           12436.11   3.4      15:24:44.177549  15:24:44.189993  (1, 1, 224, 224, 1)  2       1
+    relu0_NCHW8c            fuse___layout_transform___broadcast_add_relu___layout_transform__    4375.43    1.2      15:24:44.190027  15:24:44.194410  (8, 1, 5, 5, 1, 8)   2       1
+    _contrib_conv2d_nchwc1  fuse__contrib_conv2d_NCHWc_1                                         213108.6   58.28    15:24:44.194440  15:24:44.407558  (1, 8, 224, 224, 8)  2       1
+    relu1_NCHW8c            fuse___layout_transform___broadcast_add_relu___layout_transform__    2265.57    0.62     15:24:44.407600  15:24:44.409874  (64, 1, 1)           2       1
+    _contrib_conv2d_nchwc2  fuse__contrib_conv2d_NCHWc_2                                         104623.15  28.61    15:24:44.409905  15:24:44.514535  (1, 8, 224, 224, 8)  2       1
+    relu2_NCHW2c            fuse___layout_transform___broadcast_add_relu___layout_transform___1  2004.77    0.55     15:24:44.514567  15:24:44.516582  (8, 8, 3, 3, 8, 8)   2       1
+    _contrib_conv2d_nchwc3  fuse__contrib_conv2d_NCHWc_3                                         25218.4    6.9      15:24:44.516628  15:24:44.541856  (1, 8, 224, 224, 8)  2       1
+    reshape1                fuse___layout_transform___broadcast_add_reshape_transpose_reshape    1554.25    0.43     15:24:44.541893  15:24:44.543452  (64, 1, 1)           2       1
diff --git a/python/tvm/contrib/debugger/__init__.py b/python/tvm/contrib/debugger/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
new file mode 100644
index 000000000000..88874d138302
--- /dev/null
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -0,0 +1,189 @@
+"""Graph debug results dumping class."""
+import os
+import json
+import tvm
+
+GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
+
+class DebugResult(object):
+    """Graph debug data module.
+
+    Data dump module manage all the debug data formatting.
+    Output data and input graphs are formatted and dumped to file.
+    Frontend read these data and graph for visualization.
+
+    Parameters
+    ----------
+    graph_json : str
+        The graph to be deployed in json format output by nnvm graph. Each operator (tvm_op)
+        in the graph will have a one to one mapping with the symbol in libmod which is used
+        to construct a "PackedFunc" .
+
+    dump_path : str
+        Output data path is read/provided from frontend
+    """
+
+    def __init__(self, graph_json, dump_path):
+        self._dump_path = dump_path
+        self._output_tensor_list = []
+        self._time_list = []
+        self._parse_graph(graph_json)
+        # dump the json information
+        self.dump_graph_json(graph_json)
+
+    def _parse_graph(self, graph_json):
+        """Parse and extract the NNVM graph and update the nodes, shapes and dltype.
+
+        Parameters
+        ----------
+        graph_json : str or graph class
+           The graph to be deployed in json format output by nnvm graph.
+        """
+        json_obj = json.loads(graph_json)
+        self._nodes_list = json_obj['nodes']
+        self._shapes_list = json_obj['attrs']['shape']
+        self._dtype_list = json_obj['attrs']['dltype']
+        self._update_graph_json()
+
+    def _update_graph_json(self):
+        """update the nodes_list with name, shape and data type,
+        for temporarily storing the output.
+        """
+
+        nodes_len = len(self._nodes_list)
+        for i in range(nodes_len):
+            node = self._nodes_list[i]
+            input_list = []
+            for input_node in node['inputs']:
+                input_list.append(self._nodes_list[input_node[0]]['name'])
+            node['inputs'] = input_list
+            dtype = str("type: " + self._dtype_list[1][i])
+            if 'attrs' not in node:
+                node['attrs'] = {}
+                node['op'] = "param"
+            else:
+                node['op'] = node['attrs']['func_name']
+            node['attrs'].update({"T": dtype})
+            node['shape'] = self._shapes_list[1][i]
+
+    def _cleanup_tensors(self):
+        """Remove the tensor dump file (graph wont be removed)
+        """
+        for filename in os.listdir(self._dump_path):
+            if os.path.isfile(filename) and not filename.endswith(".json"):
+                os.remove(filename)
+
+    def get_graph_nodes(self):
+        """Return the nodes list
+        """
+        return self._nodes_list
+
+    def get_graph_node_shapes(self):
+        """Return the nodes shapes list
+        """
+        return self._shapes_list
+
+    def get_graph_node_output_num(self, node):
+        """Return the number of outputs of a node
+        """
+        return 1 if node['op'] == 'param' else int(node['attrs']['num_outputs'])
+
+    def get_graph_node_dtypes(self):
+        """Return the nodes dtype list
+        """
+        return self._dtype_list
+
+    def dump_output_tensor(self):
+        """Dump the outputs to a temporary folder, the tensors are in numpy format
+        """
+        #cleanup existing tensors before dumping
+        self._cleanup_tensors()
+        eid = 0
+        order = 0
+        output_tensors = {}
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                order += time[0]
+                key = node['name'] + "_" + str(j) + "__" + str(order)
+                output_tensors[key] = self._output_tensor_list[eid]
+                eid += 1
+
+        with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f:
+            param_f.write(save_tensors(output_tensors))
+
+    def dump_graph_json(self, graph):
+        """Dump json formatted graph.
+
+        Parameters
+        ----------
+        graph : json format
+            json formatted NNVM graph contain list of each node's
+            name, shape and type.
+        """
+        graph_dump_file_name = GRAPH_DUMP_FILE_NAME
+        with open(os.path.join(self._dump_path, graph_dump_file_name), 'w') as outfile:
+            json.dump(graph, outfile, indent=4, sort_keys=False)
+
+    def display_debug_result(self):
+        """Displays the debugger result"
+        """
+        header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Start Time", \
+                    "End Time", "Shape", "Inputs", "Outputs"]
+        lines = ["---------", "---", "--------", "-------", "----------", \
+                    "--------", "-----", "------", "-------"]
+        eid = 0
+        data = []
+        total_time = sum(time[0] for time in self._time_list)
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                op = node['op']
+                if node['op'] == 'param':
+                    continue
+                name = node['name']
+                shape = str(self._output_tensor_list[eid].shape)
+                time_us = round(time[0] * 1000000, 2)
+                time_percent = round(((time[0] / total_time) * 100), 2)
+                inputs = str(node['attrs']['num_inputs'])
+                outputs = str(node['attrs']['num_outputs'])
+                node_data = [name, op, time_us, time_percent, str(time[1]), str(time[2]), \
+                             shape, inputs, outputs]
+                data.append(node_data)
+                eid += 1
+        fmt = ""
+        for i, _ in enumerate(header):
+            max_len = len(header[i])
+            for j, _ in enumerate(data):
+                item_len = len(str(data[j][i]))
+                if item_len > max_len:
+                    max_len = item_len
+            fmt = fmt + "{:<" + str(max_len + 2) + "}"
+        print(fmt.format(*header))
+        print(fmt.format(*lines))
+        for row in data:
+            print(fmt.format(*row))
+
+def save_tensors(params):
+    """Save parameter dictionary to binary bytes.
+
+    The result binary bytes can be loaded by the
+    GraphModule with API "load_params".
+
+    Parameters
+    ----------
+    params : dict of str to NDArray
+        The parameter dictionary.
+
+    Returns
+    -------
+    param_bytes: bytearray
+        Serialized parameters.
+    """
+    _save_tensors = tvm.get_global_func("_save_param_dict")
+
+    args = []
+    for k, v in params.items():
+        args.append(k)
+        args.append(tvm.nd.array(v))
+    return _save_tensors(*args)
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
new file mode 100644
index 000000000000..986a7b167626
--- /dev/null
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -0,0 +1,191 @@
+"""Graph debug runtime executes TVM debug packed functions."""
+
+import os
+import tempfile
+import shutil
+from datetime import datetime
+from tvm._ffi.base import string_types
+from tvm.contrib import graph_runtime
+from tvm._ffi.function import get_global_func
+from . import debug_result
+
+_DUMP_ROOT_PREFIX = "tvmdbg_"
+_DUMP_PATH_PREFIX = "_tvmdbg_"
+
+def create(graph_json_str, libmod, ctx, dump_root=None):
+    """Create a runtime executor module given a graph and module.
+
+    Parameters
+    ----------
+    graph_json_str : str or graph class
+        The graph to be deployed in json format output by nnvm graph.
+        The graph can only contain one operator(tvm_op) that
+        points to the name of PackedFunc in the libmod.
+
+    libmod : tvm.Module
+        The module of the corresponding function.
+
+    ctx : TVMContext
+        The context to deploy the module, can be local or remote.
+
+    dump_root : str
+        To select which folder the outputs should be kept.
+        None will make a temp folder in /tmp/tvmdbg<rand_string> and does the dumping
+    Returns
+    -------
+    graph_module : GraphModuleDebug
+        Debug Runtime graph module that can be used to execute the graph.
+    """
+    if not isinstance(graph_json_str, string_types):
+        try:
+            graph_json_str = graph_json_str._tvm_graph_json()
+        except AttributeError:
+            raise ValueError("Type %s is not supported" % type(graph_json_str))
+    try:
+        fcreate = get_global_func("tvm.graph_runtime_debug.create")
+    except ValueError:
+        raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \
+                         "config.cmake and rebuild TVM to enable debug mode")
+
+    ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
+    if num_rpc_ctx == len(ctx):
+        raise NotSupportedError("Remote graph debugging is not supported.")
+
+    func_obj = fcreate(graph_json_str, libmod, *device_type_id)
+    return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root)
+
+
+class GraphModuleDebug(graph_runtime.GraphModule):
+    """Graph debug runtime module.
+
+    This is a debug wrapper over the TVM runtime.
+    Runtime interfaces are wrapped with debug functionalities.
+    Manage the debug framework to format the debug data and
+    trigger the user interfaces.
+
+    Parameters
+    ----------
+    module : Module
+        The interal tvm module that holds the actual graph functions.
+
+    ctx : TVMContext
+        The context this module is under.
+
+    graph_json_str : str or graph class
+        Content of graph json file in string format
+
+    dump_root : str
+        To select which folder the outputs should be kept.
+        None will make a temp folder in /tmp/tvmdbg<rand_string> and does the dumping
+    """
+    def __init__(self, module, ctx, graph_json_str, dump_root):
+        self._dump_root = dump_root
+        self._dump_path = None
+        self._debug_run = module["debug_run"]
+        self._get_output_by_layer = module["get_output_by_layer"]
+        graph_runtime.GraphModule.__init__(self, module)
+        self._create_debug_env(graph_json_str, ctx)
+
+    def _format_context(self, ctx):
+        return str(ctx[0]).upper().replace("(", ":").replace(")", "")
+
+    def _ensure_dir(self, directory):
+        """Create a directory if not exists
+
+        Parameters
+        ----------
+
+        directory : str
+            File path to create
+        """
+        if not os.path.exists(directory):
+            os.makedirs(directory, 0o700)
+
+    def _get_dump_path(self, ctx):
+        """Make the graph and tensor dump folder and return the path.
+
+        Parameters
+        ----------
+        ctx : TVMContext
+            The context this module is under.
+
+        Returns
+        -------
+        path : str
+            Directory path where the graph and node outputs will be stored.
+        """
+        # save to file
+        folder_name = _DUMP_PATH_PREFIX + "ctx_"
+        folder_name = folder_name + ctx.replace(":", "_")
+        path = os.path.join(self._dump_root, folder_name)
+        self._ensure_dir(path)
+        return path
+
+    def _remove_dump_root(self):
+        if os.path.isdir(self._dump_root):
+            shutil.rmtree(self._dump_root)
+
+    def _create_debug_env(self, graph_json, ctx):
+        """Create UI wrapper framework to handle multiple UI frontends for tvmdbg
+
+        Parameters
+        ----------
+        graph_json : json format
+            json formatted NNVM graph contain list of each node's name, shape and type.
+
+        nodes_list : list
+            List of all the nodes presented in the graph
+
+        ctx : TVMContext
+            The context this module is under.
+        """
+        # make the dump folder if not given
+        if not self._dump_root:
+            self._dump_root = tempfile.mktemp(prefix=_DUMP_ROOT_PREFIX)
+
+        # format the context
+        ctx = self._format_context(ctx)
+
+        # updates the dumping directories
+        self._dump_path = self._get_dump_path(ctx)
+
+        # init the debug dumping environment
+        self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path)
+
+    def _run_debug(self):
+        """Execute the node spcified with index will be executed.
+        Each debug output will be copied to the buffer
+        Time consumed for each execuion will be set as debug output.
+
+        """
+
+        for i, node in enumerate(self.debug_datum.get_graph_nodes()):
+            start_time = datetime.now().time()
+            time_stamp = self._debug_run(i)
+            end_time = datetime.now().time()
+            self.debug_datum._time_list.append([time_stamp, start_time, end_time])
+            num_outputs = self.debug_datum.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                out_tensor = self._get_output_by_layer(i, j)
+                self.debug_datum._output_tensor_list.append(out_tensor)
+    def run(self, **input_dict):
+        """Run forward execution of the graph with debug
+
+        Parameters
+        ----------
+        input_dict : dict of str to NDArray
+            List of input values to be feed to
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+
+        # Step 1. Execute the graph
+        self._run_debug()
+        # Step 2. Dump the output tensors to the dump folder
+        self.debug_datum.dump_output_tensor()
+        # Step 3. Display the collected information
+        self.debug_datum.display_debug_result()
+
+    def exit(self):
+        """Exits the dump folder and all its contents"""
+        self._remove_dump_root()
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index f0e83eec0bb8..383711477bb7 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -31,6 +31,31 @@ def create(graph_json_str, libmod, ctx):
             graph_json_str = graph_json_str._tvm_graph_json()
         except AttributeError:
             raise ValueError("Type %s is not supported" % type(graph_json_str))
+
+    ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx)
+
+    if num_rpc_ctx == len(ctx):
+        hmod = rpc_base._ModuleHandle(libmod)
+        fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
+        return GraphModule(fcreate(graph_json_str, hmod, *device_type_id))
+
+    fcreate = get_global_func("tvm.graph_runtime.create")
+    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
+
+def get_device_ctx(libmod, ctx):
+    """Parse and validate all the device context(s).
+    Parameters
+    ----------
+    libmod : tvm.Module
+        The module of the corresponding function
+    ctx : TVMContext or list of TVMContext
+    Returns
+    -------
+    ctx : list of TVMContext
+    num_rpc_ctx : Number of rpc contexts
+    device_type_id : List of device type and device id
+    """
+
     if isinstance(ctx, TVMContext):
         ctx = [ctx]
     elif not isinstance(ctx, (list, tuple)):
@@ -59,14 +84,7 @@ def create(graph_json_str, libmod, ctx):
 
     if 0 < num_rpc_ctx < len(ctx):
         raise ValueError("Either all or none of the contexts should be rpc.")
-
-    if num_rpc_ctx == len(ctx):
-        hmod = rpc_base._ModuleHandle(libmod)
-        fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        return GraphModule(fcreate(graph_json_str, hmod, *device_type_id))
-
-    fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
+    return ctx, num_rpc_ctx, device_type_id
 
 
 class GraphModule(object):
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index a385d170070a..47895c61e2c0 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -3,6 +3,7 @@
  *  Implementation of basic API functions
  * \file api_base.cc
  */
+#include <dmlc/memory_io.h>
 #include <tvm/expr.h>
 #include <tvm/tensor.h>
 #include <tvm/api_registry.h>
@@ -33,4 +34,37 @@ TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
   });
+TVM_REGISTER_API("_save_param_dict")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    CHECK_EQ(args.size() % 2, 0u);
+    constexpr uint64_t TVMNDArrayListMagic = 0xF7E58D4F05049CB7;
+    size_t num_params = args.size() / 2;
+    std::vector<std::string> names;
+    names.reserve(num_params);
+    std::vector<DLTensor*> arrays;
+    arrays.reserve(num_params);
+    for (size_t i = 0; i < num_params * 2; i += 2) {
+      names.emplace_back(args[i].operator std::string());
+      arrays.emplace_back(args[i + 1].operator DLTensor*());
+    }
+    std::string bytes;
+    dmlc::MemoryStringStream strm(&bytes);
+    dmlc::Stream* fo = &strm;
+    uint64_t header = TVMNDArrayListMagic, reserved = 0;
+    fo->Write(header);
+    fo->Write(reserved);
+    fo->Write(names);
+    {
+      uint64_t sz = static_cast<uint64_t>(arrays.size());
+      fo->Write(sz);
+      for (size_t i = 0; i < sz; ++i) {
+        tvm::runtime::SaveDLTensor(fo, arrays[i]);
+      }
+    }
+    TVMByteArray arr;
+    arr.data = bytes.c_str();
+    arr.size = bytes.length();
+    *rv = arr;
+  });
+
 }  // namespace tvm
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
new file mode 100644
index 000000000000..98127424f7b8
--- /dev/null
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -0,0 +1,149 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file graph_runtime_debug.cc
+ */
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/ndarray.h>
+#include <chrono>
+#include "../graph_runtime.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Graph runtime with debug .
+ *
+ *  This is the extension of GraphRuntime class used for debugging
+ *  TVM runtime PackedFunc API.
+ */
+class GraphRuntimeDebug : public GraphRuntime {
+ public:
+    /*!
+     * \brief Run each operation and get the output.
+     * \param index The index of op which needs to be run.
+     */
+  double DebugRun(size_t index) {
+    CHECK(index < op_execs().size());
+    TVMContext ctx = data_entry()[GetEntryId(index, 0)].operator->()->ctx;
+    auto tbegin = std::chrono::high_resolution_clock::now();
+    if (op_execs()[index]) {
+      op_execs()[index]();
+    }
+    TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+    auto tend = std::chrono::high_resolution_clock::now();
+    double time = std::chrono::duration_cast<std::chrono::duration<double> >(
+        tend - tbegin).count();
+    return time;
+  }
+
+  /*!
+   * \brief Run each operation and get the output.
+   * \param index The index of op which needs to be returned.
+   * \param eid The Entry id of the op.
+   */
+  NDArray GetOutputByLayer(int index, int eid) {
+    return data_entry()[GetEntryId(index, eid)];
+  }
+
+  /*!
+   * \brief GetFunction Get the function based on input.
+   * \param name The function which needs to be invoked.
+   * \param sptr_to_self Packed function pointer.
+   */
+  PackedFunc GetFunction(const std::string& name,
+                         const std::shared_ptr<ModuleNode>& sptr_to_self);
+
+  /*!
+   * \brief Get the node index given the name of node.
+   * \param name The name of the node.
+   * \return The index of node.
+   */
+  int GetNodeIndex(const std::string& name) const {
+    for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) {
+      if (GetNodeName(nid) == name) {
+        return static_cast<int>(nid);
+      }
+    }
+    LOG(FATAL) << "cannot find " << name << " among nodex";
+    return -1;
+}
+
+/*!
+ * \brief Copy index-th node to data_out.
+ *
+ * This method will do a partial run of the the graph
+ * from begining upto the index-th node and return output of index-th node.
+ * This is costly operation and suggest to use only for debug porpose.
+ *
+ * \param index: The  index of the node.
+ * \param data_out the node data.
+ */
+void DebugGetNodeOutput(int index, DLTensor* data_out) {
+  CHECK_LT(static_cast<size_t>(index), op_execs().size());
+  uint32_t eid = index;
+
+  for (size_t i = 0; i < op_execs().size(); ++i) {
+    if (op_execs()[i]) op_execs()[i]();
+    if (static_cast<int>(i) == index) break;
+  }
+
+  data_entry()[eid].CopyTo(data_out);
+}
+};
+
+
+/*!
+ * \brief GetFunction Get the function based on input.
+ * \param name The function which needs to be invoked.
+ * \param sptr_to_self Packed function pointer.
+ */
+PackedFunc GraphRuntimeDebug::GetFunction(
+    const std::string& name,
+    const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  // return member functions during query.
+  if (name == "debug_run") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->DebugRun(args[0]);
+      });
+  } else if (name == "get_output_by_layer") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->GetOutputByLayer(args[0], args[1]);
+      });
+  } else if (name == "debug_get_output") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        if (args[0].type_code() == kStr) {
+          this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
+        } else {
+          this->DebugGetNodeOutput(args[0], args[1]);
+        }
+      });
+  } else {
+    return GraphRuntime::GetFunction(name, sptr_to_self);
+  }
+}
+
+/*!
+ * \brief GraphRuntimeDebugCreate Get the function based on input.
+ * \param sym_json The graph symbol in json format.
+ * \param m Compiled module which will be loaded.
+ * \param ctxs All devices contexts.
+ */
+  Module GraphRuntimeDebugCreate(const std::string& sym_json,
+                                 const tvm::runtime::Module& m,
+                                 const std::vector<TVMContext>& ctxs) {
+  std::shared_ptr<GraphRuntimeDebug> exec = std::make_shared<GraphRuntimeDebug>();
+  exec->Init(sym_json, m, ctxs);
+  return Module(exec);
+}
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4)
+        << "The expected number of arguments for graph_runtime.create is "
+           "at least 4, but it has "
+        << args.num_args;
+    *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args));
+  });
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index a48047fe369c..c4562d1c50e2 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -4,10 +4,6 @@
  */
 #include "graph_runtime.h"
 
-#include <dlpack/dlpack.h>
-#include <dmlc/json.h>
-#include <dmlc/memory_io.h>
-#include <tvm/runtime/device_api.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
@@ -17,431 +13,125 @@
 #include <functional>
 #include <numeric>
 #include <vector>
+#include <string>
 
 namespace tvm {
 namespace runtime {
 
-/*! \brief Macro to do C API call. */
-#define TVM_CCALL(func)                                            \
-  {                                                                \
-    int ret = (func);                                              \
-    CHECK_EQ(ret, 0)                                               \
-        << TVMGetLastError();                                      \
-  }
-
 /*!
- * \brief Tiny graph runtime.
- *
- *  This runtime can be acccesibly in various language via
- *  TVM runtime PackedFunc API.
+ * \brief Run all the operations one by one.
  */
-class GraphRuntime : public ModuleNode {
- public:
-  /*!
-   * \brief Get member function to front-end.
-   * \param name The name of the function.
-   * \param sptr_to_self The pointer to the module node.
-   * \return The corresponding member function.
-   */
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
-
-  /*!
-   * \return The type key of the executor.
-   */
-  const char* type_key() const final {
-    return "GraphRuntime";
-  }
-  void Run() {
-    // setup the array and requirements.
-    for (size_t i = 0; i < op_execs_.size(); ++i) {
-      if (op_execs_[i]) op_execs_[i]();
-    }
+void GraphRuntime::Run() {
+  // setup the array and requirements.
+  for (size_t i = 0; i < op_execs_.size(); ++i) {
+    if (op_execs_[i]) op_execs_[i]();
   }
-  /*!
-   * \brief Initialize the graph executor with graph and context.
-   * \param graph_json The execution graph.
-   * \param module The module containing the compiled functions for the host
-   * processor.
-   * \param ctxs The context of the host and devices where graph nodes will be
-   * executed on.
-   */
-  void Init(const std::string& graph_json, const tvm::runtime::Module& module,
-            const std::vector<TVMContext>& ctxs) {
+}
+/*!
+ * \brief Initialize the graph executor with graph and context.
+ * \param graph_json The execution graph.
+ * \param module The module containing the compiled functions for the host
+ * processor.
+ * \param ctxs The context of the host and devices where graph nodes will be
+ * executed on.
+ */
+void GraphRuntime::Init(const std::string& graph_json,
+                        tvm::runtime::Module module,
+                        const std::vector<TVMContext>& ctxs) {
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
-    std::istringstream is(graph_json);
+  std::istringstream is(graph_json);
 #else
-    std::string is = graph_json;
+  std::string is = graph_json;
 #endif
-    dmlc::JSONReader reader(&is);
-    this->Load(&reader);
-    module_ = module;
-    ctxs_ = ctxs;
-    this->SetupStorage();
-    this->SetupOpExecs();
-  }
-
-  /*!
-   * \brief Get the input index given the name of input.
-   * \param name The name of the input.
-   * \return The index of input.
-   */
-  int GetInputIndex(const std::string& name) {
-    for (size_t i = 0; i< input_nodes_.size(); ++i) {
-      uint32_t nid = input_nodes_[i];
-      if (nodes_[nid].name == name) {
-        return static_cast<int>(i);
-      }
-    }
-    LOG(WARNING) << "Warning: cannot find \"" << name << "\" among input";
-    return -1;
-  }
-  /*!
-   * \brief Set index-th input to the graph.
-   * \param index The input index.
-   * \param data_in The input data.
-   */
-  void SetInput(int index, DLTensor* data_in) {
-    CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
-    uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    data_entry_[eid].CopyFrom(data_in);
-  }
-  /*!
-   * \brief Get the number of outputs
-   *
-   * \return The number of outputs from graph.
-   */
-  int NumOutputs() const {
-    return outputs_.size();
-  }
-  /*!
-   * \brief Return NDArray for given input index.
-   * \param index The input index.
-   *
-   * \return NDArray corresponding to given input node index.
-   */
-  NDArray GetInput(int index) {
-    CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
-    uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    return data_entry_[eid];
-  }
-  /*!
-   * \brief Return NDArray for given output index.
-   * \param index The output index.
-   *
-   * \return NDArray corresponding to given output node index.
-   */
-  NDArray GetOutput(int index) {
-    CHECK_LT(static_cast<size_t>(index), outputs_.size());
-    uint32_t eid = this->entry_id(outputs_[index]);
-    return data_entry_[eid];
-  }
-  /*!
-   * \brief Copy index-th output to data_out.
-   * \param index The output index.
-   * \param data_out The output data.
-   */
-  void CopyOutputTo(int index, DLTensor* data_out) {
-    CHECK_LT(static_cast<size_t>(index), outputs_.size());
-    uint32_t eid = this->entry_id(outputs_[index]);
-
-    // Check the shapes to avoid receiving in different dimension but same size.
-    const NDArray& data = data_entry_[eid];
-    CHECK_EQ(data->ndim, data_out->ndim);
-    for (int32_t j = 0; j < data->ndim; ++j) {
-      CHECK_EQ(data->shape[j], data_out->shape[j]);
+  dmlc::JSONReader reader(&is);
+  this->Load(&reader);
+  module_ = module;
+  ctxs_ = ctxs;
+  this->SetupStorage();
+  this->SetupOpExecs();
+}
+/*!
+ * \brief Get the input index given the name of input.
+ * \param name The name of the input.
+ * \return The index of input.
+ */
+int GraphRuntime::GetInputIndex(const std::string& name) {
+  for (size_t i = 0; i< input_nodes_.size(); ++i) {
+    uint32_t nid = input_nodes_[i];
+    if (nodes_[nid].name == name) {
+      return static_cast<int>(i);
     }
-
-    data_entry_[eid].CopyTo(data_out);
   }
-#ifdef TVM_GRAPH_RUNTIME_DEBUG
-  /*!
-   * \brief Get the node index given the name of node.
-   * \param name The name of the node.
-   * \return The index of node.
-   */
-  int GetNodeIndex(const std::string& name) {
-    for (uint32_t nid = 0; nid< nodes_.size(); ++nid) {
-      if (nodes_[nid].name == name) {
-        return static_cast<int>(nid);
-      }
-    }
-    LOG(FATAL) << "cannot find " << name << " among nodex";
-    return -1;
+  LOG(WARNING) << "Warning: cannot find \"" << name << "\" among input";
+  return -1;
+}
+/*!
+ * \brief set index-th input to the graph.
+ * \param index The input index.
+ * \param data_in The input data.
+ */
+void GraphRuntime::SetInput(int index, DLTensor* data_in) {
+  CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
+  uint32_t eid = this->entry_id(input_nodes_[index], 0);
+  data_entry_[eid].CopyFrom(data_in);
+}
+/*!
+ * \brief Get the number of outputs
+ *
+ * \return The number of outputs from graph.
+ */
+int GraphRuntime::NumOutputs() const {
+  return outputs_.size();
+}
+/*!
+ * \brief Return NDArray for given input index.
+ * \param index The input index.
+ *
+ * \return NDArray corresponding to given input node index.
+ */
+NDArray GraphRuntime::GetInput(int index) const {
+  CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
+  uint32_t eid = this->entry_id(input_nodes_[index], 0);
+  return data_entry_[eid];
+}
+/*!
+ * \brief Return NDArray for given output index.
+ * \param index The output index.
+ *
+ * \return NDArray corresponding to given output node index.
+ */
+NDArray GraphRuntime::GetOutput(int index) const {
+  CHECK_LT(static_cast<size_t>(index), outputs_.size());
+  uint32_t eid = this->entry_id(outputs_[index]);
+  return data_entry_[eid];
+}
+/*!
+ * \brief Copy index-th output to data_out.
+ * \param index The output index.
+ * \param data_out the output data.
+ */
+void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) {
+  CHECK_LT(static_cast<size_t>(index), outputs_.size());
+  uint32_t eid = this->entry_id(outputs_[index]);
+
+  // Check the shapes to avoid receiving in different dimension but same size.
+  const NDArray& data = data_entry_[eid];
+  CHECK_EQ(data->ndim, data_out->ndim);
+  for (int32_t j = 0; j < data->ndim; ++j) {
+    CHECK_EQ(data->shape[j], data_out->shape[j]);
   }
 
-  /*!
-   * \brief Copy index-th node to data_out.
-   *
-   * This method will do a partial run of the the graph
-   * from begining upto the index-th node and return output of index-th node.
-   * This is costly operation and suggest to use only for debug porpose.
-   *
-   * \param index The index of the node.
-   * \param data_out The node data.
-   */
-  void DebugGetNodeOutput(int index, DLTensor* data_out) {
-    CHECK_LT(static_cast<size_t>(index), nodes_.size());
-    uint32_t eid = index;
-
-    for (size_t i = 0; i < op_execs_.size(); ++i) {
-      if (op_execs_[i]) op_execs_[i]();
-      if (static_cast<int>(i) == index) break;
-    }
-
-    data_entry_[eid].CopyTo(data_out);
-  }
-#endif
-  /*!
-   * \brief Load parameters from binary stream.
-   * \param strm The input stream.
-   */
-  void LoadParams(dmlc::Stream* strm);
-  /*!
-   * \brief Load parameters from parameter blob.
-   * \param param_blob A binary blob of parameter.
-   */
-  void LoadParams(const std::string& param_blob) {
-    dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
-    this->LoadParams(&strm);
-  }
+  data_entry_[eid].CopyTo(data_out);
+}
 
- private:
-  // Memory pool entry.
-  struct PoolEntry {
-    size_t size;
-    int device_type;
-    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
-  };
-  // Node entry
-  struct NodeEntry {
-    uint32_t node_id;
-    uint32_t index;
-    uint32_t version;
-    // JSON Loader
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginArray();
-      CHECK(reader->NextArrayItem()) << "invalid json format";
-      reader->Read(&node_id);
-      CHECK(reader->NextArrayItem()) << "invalid json format";
-      reader->Read(&index);
-      if (reader->NextArrayItem()) {
-        reader->Read(&version);
-        CHECK(!reader->NextArrayItem()) << "invalid json format";
-      } else {
-        version = 0;
-      }
-    }
-  };
-  // Node
-  struct Node {
-    // operator type in string
-    std::string op_type;
-    // name of the op
-    std::string name;
-    // parameters
-    TVMOpParam param;
-    // inputs
-    std::vector<NodeEntry> inputs;
-    // control deps
-    std::vector<uint32_t> control_deps;
-    // JSON Loader
-    void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
-      int bitmask = 0;
-      std::string key, value;
-      reader->BeginObject();
-      while (reader->NextObjectItem(&key)) {
-        reader->Read(&value);
-        if (key == "func_name") {
-          param->func_name = value;
-          bitmask |= 1;
-        } else if (key == "num_inputs") {
-          param->num_inputs = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 2;
-        } else if (key == "num_outputs") {
-          param->num_outputs = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 4;
-        } else if (key == "flatten_data") {
-          param->flatten_data = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 8;
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
-    }
-    // JSON Loader
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      int bitmask = 0;
-      std::string key;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "op") {
-          reader->Read(&op_type);
-          bitmask |= 1;
-        } else if (key == "name") {
-          reader->Read(&name);
-          bitmask |= 2;
-        } else if (key == "inputs") {
-          reader->Read(&inputs);
-          bitmask |= 4;
-        } else if (key == "attr" || key == "attrs") {
-          this->LoadAttrs(reader, &param);
-        } else if (key == "control_deps") {
-          reader->Read(&control_deps);
-        } else {
-          LOG(FATAL) << "do not support key " << key;
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
-    }
-  };
-  struct GraphAttr {
-    size_t storage_num_not_alloctaed{0};
-    std::vector<int> storage_id;
-    std::vector<int> device_index;
-    std::vector<std::string> dltype;
-    std::vector<std::vector<int64_t> > shape;
-    // The graph attribute fields.
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      int bitmask = 0;
-      std::string key, type;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "dltype") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_str");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&dltype);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 1;
-        } else if (key == "storage_id") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_int");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&storage_id);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 2;
-        } else if (key == "shape") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_shape");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&shape);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 4;
-        } else if (key == "device_index") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_int");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&device_index);
-          CHECK(!reader->NextArrayItem());
-        } else {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          if (type == "list_int") {
-            CHECK(reader->NextArrayItem());
-            std::vector<int> temp;
-            reader->Read(&temp);
-          } else if (type == "size_t") {
-            CHECK(reader->NextArrayItem());
-            size_t temp;
-            reader->Read(&temp);
-          } else {
-            LOG(FATAL) << "cannot skip graph attr " << key;
-          }
-          CHECK(!reader->NextArrayItem());
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
-    }
-  };
-  // The graph attribute fields.
-  void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      int bitmask = 0;
-      std::string key;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "nodes") {
-          reader->Read(&nodes_);
-          bitmask |= 1;
-        } else if (key == "arg_nodes") {
-          reader->Read(&input_nodes_);
-          bitmask |= 2;
-        } else if (key == "node_row_ptr") {
-          reader->Read(&node_row_ptr_);
-          bitmask |= 4;
-        } else if (key == "heads") {
-          reader->Read(&outputs_);
-          bitmask |= 8;
-        } else if (key == "attrs") {
-          reader->Read(&attrs_);
-          bitmask |= 16;
-        } else {
-          LOG(FATAL) << "key " << key << " is not supported";
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
-  }
-  /*! \brief Setup the temporal storage */
-  void SetupStorage();
-  /*! \brief Setup the executors. */
-  void SetupOpExecs();
-  /*!
-   * \brief Create a executtion function given input.
-   * \param attrs The node attributes.
-   * \param args The arguments to the functor, including inputs and outputs.
-   * \param num_inputs Number of inputs.
-   * \param dev_type The device type of the tvm_op.
-   * \return The created executor.
-   */
-  std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
-                                    const std::vector<DLTensor>& args,
-                                    size_t num_inputs);
-  // Get node entry index.
-  uint32_t entry_id(uint32_t nid, uint32_t index) const {
-    return node_row_ptr_[nid] + index;
-  }
-  // Get node entry index.
-  uint32_t entry_id(const NodeEntry& e) const {
-    return entry_id(e.node_id, e.index);
-  }
-  // Number of node entries.
-  uint32_t num_node_entries() const {
-    return node_row_ptr_.back();
-  }
-  // Number of nodes.
-  uint32_t num_nodes() const {
-    return static_cast<uint32_t>(nodes_.size());
-  }
-  /*! \brief The graph nodes. */
-  std::vector<Node> nodes_;
-  /*! \brief The argument nodes. */
-  std::vector<uint32_t> input_nodes_;
-  /*! \brief Used for quick entry indexing. */
-  std::vector<uint32_t> node_row_ptr_;
-  /*! \brief Output entries. */
-  std::vector<NodeEntry> outputs_;
-  /*! \brief Additional graph attributes. */
-  GraphAttr attrs_;
-  /*! \brief The code module that contains both host and device code. */
-  tvm::runtime::Module module_;
-  /*! \brief Execution context of all devices including the host. */
-  std::vector<TVMContext> ctxs_;
-  /*! \brief Common storage pool for all devices. */
-  std::vector<NDArray> storage_pool_;
-  /*! \brief Data entry of each node. */
-  std::vector<NDArray> data_entry_;
-  /*! \brief Operator on each node. */
-  std::vector<std::function<void()> > op_execs_;
-};
+/*!
+ * \brief Load parameters from parameter blob.
+ * \param param_blob A binary blob of parameter.
+ */
+void GraphRuntime::LoadParams(const std::string& param_blob) {
+  dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
+  this->LoadParams(&strm);
+}
 
 void GraphRuntime::LoadParams(dmlc::Stream* strm) {
   uint64_t header, reserved;
@@ -540,9 +230,9 @@ void GraphRuntime::SetupStorage() {
 }
 
 void GraphRuntime::SetupOpExecs() {
-  op_execs_.resize(this->num_nodes());
+  op_execs_.resize(this->GetNumOfNodes());
   // setup the array and requirements.
-  for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) {
+  for (uint32_t nid = 0; nid < this->GetNumOfNodes(); ++nid) {
     const auto& inode = nodes_[nid];
     if (inode.op_type == "null") continue;
     std::vector<DLTensor> args;
@@ -653,16 +343,6 @@ PackedFunc GraphRuntime::GetFunction(
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         *rv = this->NumOutputs();
       });
-#ifdef TVM_GRAPH_RUNTIME_DEBUG
-  } else if (name == "debug_get_output") {
-    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        if (args[0].type_code() == kStr) {
-          this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
-        } else {
-          this->DebugGetNodeOutput(args[0], args[1]);
-        }
-      });
-#endif
   } else if (name == "run") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         this->Run();
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index 7ebcf7d30b33..919e7ee9afe8 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -8,11 +8,26 @@
 #ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
 #define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
 
+#include <dlpack/dlpack.h>
+#include <dmlc/memory_io.h>
+#include <dmlc/json.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <vector>
 #include <string>
 
 namespace tvm {
 namespace runtime {
 
+/*! \brief macro to do C API call */
+#define TVM_CCALL(func)                                            \
+  {                                                                \
+    int ret = (func);                                              \
+    CHECK_EQ(ret, 0)                                               \
+        << TVMGetLastError();                                      \
+  }
+
 /*! \brief Magic number for NDArray list file  */
 constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
 
@@ -24,6 +39,363 @@ struct TVMOpParam {
   uint32_t flatten_data;
 };
 
+/*!
+ * \brief Tiny graph runtime.
+ *
+ *  This runtime can be acccesibly in various language via
+ *  TVM runtime PackedFunc API.
+ */
+class GraphRuntime : public ModuleNode {
+ public:
+  /*!
+   * \brief Get member function to front-end
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function.
+   */
+  virtual PackedFunc GetFunction(const std::string& name,
+                                 const std::shared_ptr<ModuleNode>& sptr_to_self);
+
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const final {
+    return "GraphRuntime";
+  }
+  void Run();
+
+  /*!
+   * \brief Initialize the graph executor with graph and context.
+   * \param graph_json The execution graph.
+   * \param module The module containing the compiled functions for the host
+   *  processor.
+   * \param ctxs The context of the host and devices where graph nodes will be
+   *  executed on.
+   */
+
+  void Init(const std::string& graph_json,
+            tvm::runtime::Module module,
+            const std::vector<TVMContext>& ctxs);
+
+  /*!
+   * \brief Get the input index given the name of input.
+   * \param name The name of the input.
+   * \return The index of input.
+   */
+  int GetInputIndex(const std::string& name);
+
+  /*!
+   * \brief set index-th input to the graph.
+   * \param index The input index.
+   * \param data_in The input data.
+   */
+  void SetInput(int index, DLTensor* data_in);
+  /*!
+   * \brief Get the number of outputs
+   *
+   * \return The number of outputs from graph.
+   */
+  int NumOutputs() const;
+  /*!
+   * \brief Return NDArray for given input index.
+   * \param index The input index.
+   *
+   * \return NDArray corresponding to given input node index.
+   */
+  NDArray GetInput(int index) const;
+  /*!
+   * \brief Return NDArray for given output index.
+   * \param index The output index.
+   *
+   * \return NDArray corresponding to given output node index.
+   */
+  NDArray GetOutput(int index) const;
+  /*!
+   * \brief Copy index-th output to data_out.
+   * \param index The output index.
+   * \param data_out the output data.
+   */
+  void CopyOutputTo(int index, DLTensor* data_out);
+  /*!
+   * \brief Load parameters from binary stream
+   * \param strm The input stream.
+   */
+  void LoadParams(dmlc::Stream* strm);
+  /*!
+   * \brief Load parameters from parameter blob.
+   * \param param_blob A binary blob of parameter.
+   */
+  void LoadParams(const std::string& param_blob);
+
+  /*!
+   * \brief Get the tensor vector pointer.
+   */
+  std::vector<NDArray>& data_entry() {
+      return data_entry_;
+  }
+
+  /*!
+   * \brief Get the execution function pointer.
+   */
+  std::vector<std::function<void()> >& op_execs() {
+        return op_execs_;
+  }
+
+  /*!
+   * \brief Get node entry index.
+   * \param nid Node id.
+   * \param index Index of the nodes.
+   */
+  uint32_t GetEntryId(uint32_t nid, uint32_t index) const {
+    return node_row_ptr_[nid] + index;
+  }
+
+ /*!
+  * \brief Get total number of nodes.
+  * \return Total number of nodes.
+  */
+  uint32_t GetNumOfNodes() const {
+    return static_cast<uint32_t>(nodes_.size());
+  }
+
+  std::string GetNodeName(uint32_t nid) const {
+    return nodes_[nid].name;
+  }
+
+
+ private:
+  // Memory pool entry.
+  struct PoolEntry {
+    size_t size;
+    int device_type;
+    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+  };
+  // Node entry
+  struct NodeEntry {
+    uint32_t node_id;
+    uint32_t index;
+    uint32_t version;
+    // JSON Loader
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginArray();
+      CHECK(reader->NextArrayItem()) << "invalid json format";
+      reader->Read(&node_id);
+      CHECK(reader->NextArrayItem()) << "invalid json format";
+      reader->Read(&index);
+      if (reader->NextArrayItem()) {
+        reader->Read(&version);
+        CHECK(!reader->NextArrayItem()) << "invalid json format";
+      } else {
+        version = 0;
+      }
+    }
+  };
+  // Node
+  struct Node {
+    // operator type in string
+    std::string op_type;
+    // name of the op
+    std::string name;
+    // parameters
+    TVMOpParam param;
+    // inputs
+    std::vector<NodeEntry> inputs;
+    // control deps
+    std::vector<uint32_t> control_deps;
+    // JSON Loader
+    void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
+      int bitmask = 0;
+      std::string key, value;
+      reader->BeginObject();
+      while (reader->NextObjectItem(&key)) {
+        reader->Read(&value);
+        if (key == "func_name") {
+          param->func_name = value;
+          bitmask |= 1;
+        } else if (key == "num_inputs") {
+          param->num_inputs = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 2;
+        } else if (key == "num_outputs") {
+          param->num_outputs = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 4;
+        } else if (key == "flatten_data") {
+          param->flatten_data = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 8;
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
+    }
+    // JSON Loader
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "op") {
+          reader->Read(&op_type);
+          bitmask |= 1;
+        } else if (key == "name") {
+          reader->Read(&name);
+          bitmask |= 2;
+        } else if (key == "inputs") {
+          reader->Read(&inputs);
+          bitmask |= 4;
+        } else if (key == "attr" || key == "attrs") {
+          this->LoadAttrs(reader, &param);
+        } else if (key == "control_deps") {
+          reader->Read(&control_deps);
+        } else {
+          LOG(FATAL) << "do not support key " << key;
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
+    }
+  };
+  struct GraphAttr {
+    size_t storage_num_not_alloctaed{0};
+    std::vector<int> storage_id;
+    std::vector<int> device_index;
+    std::vector<std::string> dltype;
+    std::vector<std::vector<int64_t> > shape;
+    // The graph attribute fields.
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key, type;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "dltype") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_str");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&dltype);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 1;
+        } else if (key == "storage_id") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&storage_id);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 2;
+        } else if (key == "shape") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_shape");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&shape);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 4;
+        } else if (key == "device_index") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&device_index);
+          CHECK(!reader->NextArrayItem());
+        } else {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          if (type == "list_int") {
+            CHECK(reader->NextArrayItem());
+            std::vector<int> temp;
+            reader->Read(&temp);
+          } else if (type == "size_t") {
+            CHECK(reader->NextArrayItem());
+            size_t temp;
+            reader->Read(&temp);
+          } else {
+            LOG(FATAL) << "cannot skip graph attr " << key;
+          }
+          CHECK(!reader->NextArrayItem());
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
+    }
+  };
+  // The graph attribute fields.
+  void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "nodes") {
+          reader->Read(&nodes_);
+          bitmask |= 1;
+        } else if (key == "arg_nodes") {
+          reader->Read(&input_nodes_);
+          bitmask |= 2;
+        } else if (key == "node_row_ptr") {
+          reader->Read(&node_row_ptr_);
+          bitmask |= 4;
+        } else if (key == "heads") {
+          reader->Read(&outputs_);
+          bitmask |= 8;
+        } else if (key == "attrs") {
+          reader->Read(&attrs_);
+          bitmask |= 16;
+        } else {
+          LOG(FATAL) << "key " << key << " is not supported";
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
+  }
+  /*! \brief Setup the temporal storage */
+  void SetupStorage();
+  /*! \brief Setup the executors. */
+  void SetupOpExecs();
+  /*!
+   * \brief Create a executtion function given input.
+   * \param attrs The node attributes.
+   * \param args The arguments to the functor, including inputs and outputs.
+   * \param num_inputs Number of inputs.
+   * \param dev_type The device type of the tvm_op.
+   * \return The created executor.
+   */
+  std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
+                                    const std::vector<DLTensor>& args,
+                                    size_t num_inputs);
+  // Get node entry index.
+  uint32_t entry_id(uint32_t nid, uint32_t index) const {
+    return node_row_ptr_[nid] + index;
+  }
+  // Get node entry index.
+  uint32_t entry_id(const NodeEntry& e) const {
+    return entry_id(e.node_id, e.index);
+  }
+  // Number of node entries.
+  uint32_t num_node_entries() const {
+    return node_row_ptr_.back();
+  }
+  /*! \brief The graph nodes. */
+  std::vector<Node> nodes_;
+  /*! \brief The argument nodes. */
+  std::vector<uint32_t> input_nodes_;
+  /*! \brief Used for quick entry indexing. */
+  std::vector<uint32_t> node_row_ptr_;
+  /*! \brief Output entries. */
+  std::vector<NodeEntry> outputs_;
+  /*! \brief Additional graph attributes. */
+  GraphAttr attrs_;
+  /*! \brief The code module that contains both host and device code. */
+  tvm::runtime::Module module_;
+  /*! \brief Execution context of all devices including the host. */
+  std::vector<TVMContext> ctxs_;
+  /*! \brief Common storage pool for all devices. */
+  std::vector<NDArray> storage_pool_;
+  /*! \brief Data entry of each node. */
+  std::vector<NDArray> data_entry_;
+  /*! \brief Operator on each node. */
+  std::vector<std::function<void()> > op_execs_;
+};
+
+std::vector<TVMContext> GetAllContext(const TVMArgs& args);
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
new file mode 100644
index 000000000000..ab6b729974df
--- /dev/null
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -0,0 +1,76 @@
+import os
+import tvm
+import numpy as np
+import json
+from tvm.contrib.debugger import debug_runtime as graph_runtime
+
+def test_graph_simple():
+    n = 4
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    s = tvm.create_schedule(B.op)
+
+    node0 = {"op": "null", "name": "x", "inputs": []}
+    node1 = {"op": "tvm_op", "name": "add",
+             "inputs": [[0, 0, 0]],
+             "attrs": {"func_name": "myadd",
+                       "flatten_data": "1",
+                       "num_inputs" : "1",
+                    "num_outputs" : "1"}}
+    nodes = [node0, node1]
+    arg_nodes = [0]
+    node_row_ptr = [0, 1, 2]
+    outputs = [[1, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "shape" : ["list_shape", [shape, shape]],
+        "dltype" : ["list_str", ["float32", "float32"]],
+        "storage_id" : ["list_int", [0, 1]],
+    }
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": outputs,
+             "attrs": attrs}
+    graph = json.dumps(graph)
+
+    def check_verify():
+        if not tvm.module.enabled("llvm"):
+            print("Skip because llvm is not enabled")
+            return
+        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
+        try:
+            mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
+        except ValueError:
+            return
+
+        a = np.random.uniform(size=(n,)).astype(A.dtype)
+        mod.set_input(x=a)
+
+        #verify dumproot created
+        directory = mod._dump_path
+        assert(os.path.exists(directory))
+
+        #verify graph is there
+        GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
+        assert(len(os.listdir(directory)) == 1)
+
+        #verify the file name is proper
+        assert(os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME)))
+
+        mod.run()
+        #Verify the tensors are dumped
+        assert(len(os.listdir(directory)) > 1)
+
+        #verify the output is correct
+        out = mod.get_output(0, tvm.nd.empty((n,)))
+        np.testing.assert_equal(out.asnumpy(), a + 1)
+
+        mod.exit()
+        #verify dump root delete after cleanup
+        assert(not os.path.exists(directory))
+
+    check_verify()
+
+if __name__ == "__main__":
+    test_graph_simple()

From c06cc8cee85aa7b10b7bbc81a8d8daa199d58414 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 4 Oct 2018 12:01:04 -0700
Subject: [PATCH 164/529] [RUNTIME] Fix debug runtime i386 build (#1818)

---
 src/runtime/graph/debug/graph_runtime_debug.cc | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index 98127424f7b8..7faee4420f47 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -19,10 +19,11 @@ namespace runtime {
  */
 class GraphRuntimeDebug : public GraphRuntime {
  public:
-    /*!
-     * \brief Run each operation and get the output.
-     * \param index The index of op which needs to be run.
-     */
+  /*!
+   * \brief Run each operation and get the output.
+   * \param index The index of op which needs to be run.
+   * \return the elapsed time.
+   */
   double DebugRun(size_t index) {
     CHECK(index < op_execs().size());
     TVMContext ctx = data_entry()[GetEntryId(index, 0)].operator->()->ctx;
@@ -104,7 +105,7 @@ PackedFunc GraphRuntimeDebug::GetFunction(
   // return member functions during query.
   if (name == "debug_run") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        *rv = this->DebugRun(args[0]);
+        *rv = this->DebugRun(static_cast<size_t>(args[0].operator int64_t()));
       });
   } else if (name == "get_output_by_layer") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
@@ -129,9 +130,9 @@ PackedFunc GraphRuntimeDebug::GetFunction(
  * \param m Compiled module which will be loaded.
  * \param ctxs All devices contexts.
  */
-  Module GraphRuntimeDebugCreate(const std::string& sym_json,
-                                 const tvm::runtime::Module& m,
-                                 const std::vector<TVMContext>& ctxs) {
+Module GraphRuntimeDebugCreate(const std::string& sym_json,
+                               const tvm::runtime::Module& m,
+                               const std::vector<TVMContext>& ctxs) {
   std::shared_ptr<GraphRuntimeDebug> exec = std::make_shared<GraphRuntimeDebug>();
   exec->Init(sym_json, m, ctxs);
   return Module(exec);

From adf0e03c5d4b47e00801e07877e4bcdc8d987782 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 4 Oct 2018 18:42:04 -0700
Subject: [PATCH 165/529] [RELAY][DOCS] Core Operator docs (#1821)

---
 docs/langref/index.rst                 |  1 +
 docs/langref/relay_op.rst              | 58 +++++++++++++++++
 include/tvm/attrs.h                    |  4 ++
 python/tvm/relay/__init__.py           |  5 +-
 python/tvm/relay/nn.py                 |  4 ++
 python/tvm/relay/op/nn.py              | 54 ----------------
 python/tvm/relay/op/nn/__init__.py     |  4 ++
 python/tvm/relay/op/nn/_make.py        |  4 ++
 python/tvm/relay/op/nn/nn.py           | 88 ++++++++++++++++++++++++++
 python/tvm/relay/op/tensor.py          | 23 +++++--
 src/relay/op/nn/convolution.cc         |  6 +-
 tutorials/dev/low_level_custom_pass.py | 11 ++--
 12 files changed, 190 insertions(+), 72 deletions(-)
 create mode 100644 docs/langref/relay_op.rst
 create mode 100644 python/tvm/relay/nn.py
 delete mode 100644 python/tvm/relay/op/nn.py
 create mode 100644 python/tvm/relay/op/nn/__init__.py
 create mode 100644 python/tvm/relay/op/nn/_make.py
 create mode 100644 python/tvm/relay/op/nn/nn.py

diff --git a/docs/langref/index.rst b/docs/langref/index.rst
index 65f78d1d278b..22ca00f7faa5 100644
--- a/docs/langref/index.rst
+++ b/docs/langref/index.rst
@@ -6,4 +6,5 @@ embedded languages in TVM stack.
 .. toctree::
    :maxdepth: 2
 
+   relay_op
    hybrid_script
diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
new file mode 100644
index 000000000000..1b99e0f4d9f2
--- /dev/null
+++ b/docs/langref/relay_op.rst
@@ -0,0 +1,58 @@
+Relay Core Tensor Operators
+===========================
+
+This page contains the list of core tensor operator primitives pre-defined in tvm.relay.
+The core tensor operator primitives covers typical workloads in deep learning.
+They can represent workloads in front-end frameworks, and provide basic building blocks for optimization.
+Since deep learning is a fast evolving field and it is that possible to have operators that are not in here.
+
+
+.. note::
+
+   This document will directly list the function signature of
+   these operators in the python frontend.
+
+
+Overview of Operators
+---------------------
+**Level 1: Basic Operators**
+
+This level enables fully connected multi-layer perceptron.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.log
+   tvm.relay.sqrt
+   tvm.relay.exp
+   tvm.relay.add
+
+**Level 2: Convolutions**
+
+This level enables typical convnet models.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.nn.conv2d
+
+
+**Level 3: Additional Math And Transform Operators**
+
+**Level 4: Broadcast and Reductions**
+
+
+**Level 5: Vision/Image Operators**
+
+
+Level 1 Definitions
+-------------------
+.. autofunction:: tvm.relay.log
+.. autofunction:: tvm.relay.sqrt
+.. autofunction:: tvm.relay.exp
+.. autofunction:: tvm.relay.add
+
+
+Level 2 Definitions
+-------------------
+.. autofunction:: tvm.relay.nn.conv2d
diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 9e56b45932dc..095e05b3f95f 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -329,6 +329,10 @@ inline void SetValue<std::string>(std::string* ptr, const TVMArgValue& val) {
   }
 }
 template<>
+inline void SetValue(Type* ptr, const TVMArgValue& val) {
+  *ptr = val.operator Type();
+}
+template<>
 inline void SetValue<double>(double* ptr, const TVMArgValue& val) {
   if (val.type_code() == kDLFloat || val.type_code() == kDLInt) {
     *ptr = val.operator double();
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 7add619c203c..1d9141fc9ab1 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -6,10 +6,11 @@
 from . import env
 from . import ir_pass
 from . import ir_builder
-# Operators
+
+# Root operators
 from .op import Op
 from .op.tensor import *
-from .op import nn
+from . import nn
 
 # Span
 Span = base.Span
diff --git a/python/tvm/relay/nn.py b/python/tvm/relay/nn.py
new file mode 100644
index 000000000000..6f45aea8b544
--- /dev/null
+++ b/python/tvm/relay/nn.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Neural network related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.nn import *
diff --git a/python/tvm/relay/op/nn.py b/python/tvm/relay/op/nn.py
deleted file mode 100644
index 9d1714a82c67..000000000000
--- a/python/tvm/relay/op/nn.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Neural network operations."""
-from __future__ import absolute_import as _abs
-from . import _make
-
-
-def conv2d(data,
-           weight,
-           strides=(1, 1),
-           padding=(0, 0),
-           dilation=(1, 1),
-           groups=1,
-           channels=None,
-           kernel_size=None,
-           data_layout="NCHW",
-           weight_layout="OIHW",
-           out_layout="",
-           out_dtype=""):
-    """Two dimensional convolution operator.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The input data to the operator.
-
-    weight : relay.Expr
-        The weight expressions.
-
-    strides : tuple of int, optional
-        The strides of convoltution.
-
-    padding : tuple of int, optional
-        The padding of convolution on both sides of inputs.
-
-    dilation : tuple of int, optional
-        Specifies the dilation rate to be used for dilated convolution.
-
-    groups : int, optional
-        Number of groups for grouped convolution.
-
-    data_layout : str, optional
-        Layout of the input.
-
-    weight_layout : str, optional
-        Layout of the weight.
-
-    out_layout : str, optional
-        Layout of the output.
-
-    out_dtype : str, optional
-        Specifies the output data type for mixed precision conv2d.
-    """
-    return _make.conv2d(data, weight, strides, padding, dilation,
-                        groups, channels, kernel_size, data_layout,
-                        weight_layout, out_layout, out_dtype)
diff --git a/python/tvm/relay/op/nn/__init__.py b/python/tvm/relay/op/nn/__init__.py
new file mode 100644
index 000000000000..d1818e71882c
--- /dev/null
+++ b/python/tvm/relay/op/nn/__init__.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import
+"""Neural network related operators."""
+from __future__ import absolute_import as _abs
+from .nn import *
diff --git a/python/tvm/relay/op/nn/_make.py b/python/tvm/relay/op/nn/_make.py
new file mode 100644
index 000000000000..c4922ea8ab04
--- /dev/null
+++ b/python/tvm/relay/op/nn/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.nn._make", __name__)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
new file mode 100644
index 000000000000..f2d60d48eaad
--- /dev/null
+++ b/python/tvm/relay/op/nn/nn.py
@@ -0,0 +1,88 @@
+"""Neural network operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+
+def conv2d(data,
+           weight,
+           strides=(1, 1),
+           padding=(0, 0),
+           dilation=(1, 1),
+           groups=1,
+           channels=None,
+           kernel_size=None,
+           data_layout="NCHW",
+           weight_layout="OIHW",
+           out_layout="",
+           out_dtype=""):
+    r"""2D convolution.
+
+    This operator takes the weight as the convolution kernel
+    and convolves it with data to produce an output.
+
+
+    In the default case, where the data_layout is `NCHW`
+    and weight_layout is `OIHW`, conv2d takes in
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    and a weight Tensor with shape `(channels, in_channels, kernel_size[0], kernel_size[1])`
+    to produce an output Tensor with the following rule:
+
+    .. math::
+
+        \mbox{out}[b, c, y, x] = \sum_{dy, dx, k}
+           \mbox{data}[b, k, \mbox{strides}[0] * y  + dy, \mbox{strides}[1] * x + dx] *
+           \mbox{weight}[c, k, dy, dx]
+
+    Padding and dilation are applied to data and weight respectively before the computation.
+    This operator accepts data layout specification.
+    Semantically, the operator will convert the layout to the canonical layout
+    (`NCHW` for data and `OIHW` for weight), perform the computation,
+    then convert to the out_layout.
+
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    weight : relay.Expr
+        The weight expressions.
+
+    strides : tuple of int, optional
+        The strides of convoltution.
+
+    padding : tuple of int, optional
+        The padding of convolution on both sides of inputs before convolution.
+
+    dilation : tuple of int, optional
+        Specifies the dilation rate to be used for dilated convolution.
+
+    groups : int, optional
+        Number of groups for grouped convolution.
+
+    channels : int, optional
+        Number of output channels of this convolution.
+
+    kernel_size : tuple of int, optional
+        The spatial of the convolution kernel.
+
+    data_layout : str, optional
+        Layout of the input.
+
+    weight_layout : str, optional
+        Layout of the weight.
+
+    out_layout : str, optional
+        Layout of the output, by default, out_layout is the same as data_layout
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.conv2d(data, weight, strides, padding, dilation,
+                        groups, channels, kernel_size, data_layout,
+                        weight_layout, out_layout, out_dtype)
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index fa54d8b53dd8..61addd06c553 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -12,9 +12,8 @@
 # - Not put too much burden on FFI to support complicated features
 #   like default value and keyword arguments
 
-
 def log(data):
-    """Take log of data.
+    """Compute elementwise log of data.
 
     Parameters
     ----------
@@ -30,7 +29,7 @@ def log(data):
 
 
 def exp(data):
-    """Take exp of data.
+    """Compute elementwise exp of data.
 
     Parameters
     ----------
@@ -46,7 +45,7 @@ def exp(data):
 
 
 def sqrt(data):
-    """Take sqrt of data.
+    """Compute elementwise sqrt of data.
 
     Parameters
     ----------
@@ -62,7 +61,7 @@ def sqrt(data):
 
 
 def add(lhs, rhs):
-    """Elementwise addition.
+    """Addition with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -75,12 +74,20 @@ def add(lhs, rhs):
     -------
     result : relay.Expr
         The computed result.
+
+    Examples
+    --------
+    .. code:: python
+
+      x = relay.Var("a") # shape is [2, 3]
+      y = relay.Var("b") # shape is [2, 1]
+      z = relay.add(x, y)  # result shape is [2, 3]
     """
     return _make.add(lhs, rhs)
 
 
 def subtract(lhs, rhs):
-    """Elementwise subtraction.
+    """Elementwise subtraction with broadcasting.
 
     Parameters
     ----------
@@ -94,11 +101,13 @@ def subtract(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.add(lhs, rhs)
+    return _make.subtract(lhs, rhs)
+
 
 def equal(lhs, rhs):
     return _make.equal(lhs, rhs)
 
+
 def concat(*args):
     """Concatenate the input tensors along the zero axis.
 
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index f61a34151408..920fc68d51e8 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -124,18 +124,18 @@ Expr MakeConv2D(Expr data,
   attrs->weight_layout = std::move(weight_layout);
   attrs->out_layout = std::move(out_layout);
   attrs->out_dtype = std::move(out_dtype);
-  static const Op& op = Op::Get("conv2d");
+  static const Op& op = Op::Get("nn.conv2d");
   return CallNode::make(op, {data, weight}, Attrs(attrs), {});
 }
 
 
-TVM_REGISTER_API("relay.op._make.conv2d")
+TVM_REGISTER_API("relay.op.nn._make.conv2d")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
     runtime::detail::unpack_call<Expr, 12>(MakeConv2D, args, rv);
   });
 
 
-RELAY_REGISTER_OP("conv2d")
+RELAY_REGISTER_OP("nn.conv2d")
 .describe(R"code(2D convolution layer (e.g. spatial convolution over images).
 
 This layer creates a convolution kernel that is convolved
diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py
index 68e569477440..f2f38207931f 100644
--- a/tutorials/dev/low_level_custom_pass.py
+++ b/tutorials/dev/low_level_custom_pass.py
@@ -8,8 +8,8 @@
 to adapt TVM to their own specialized hardware. This tutorial helps users write
 a customized pass in TVM.
 
- Prerequisites
---------------
+Prerequisites
+-------------
 
 Before reading this tutorial, we assume readers have already known these topics well:
 
@@ -31,7 +31,7 @@
 
 ######################################################################
 # We first write a very simple vector add and build it with the default schedule. Then, we use
-# our customized lowering pass to manipulate the IR directly instead of using schedule premitives.  
+# our customized lowering pass to manipulate the IR directly instead of using schedule premitives.
 #
 
 n = tvm.const(128)
@@ -83,7 +83,7 @@ def find_width8(op):
 # post-order callback. If you want to keep the origin IR node, just return None. If you want to
 # change the current node to some node, use TVM IR maker interface to build it and return
 # this value.
-# 
+#
 # .. note::
 #
 #     If the pre-order function is called and returns a value which is not None, the post-order
@@ -113,7 +113,7 @@ def vectorize(stmt):
     # The last list arugment indicates what kinds of nodes will be transformed.
     # Thus, in this case only `For` nodes will call `vectorize8`
     stmt = tvm.ir_pass.IRTransform(stmt, None, vectorize8, ['For'])
-    
+
     return stmt
 
 #####################################################################
@@ -154,4 +154,3 @@ def vectorize(stmt):
 # - Wrap up two above to write an IR-transformation function.
 # - Use ``tvm.build_config`` to put this function to TVM lowering pass
 #
-

From 3966f0971243b9a641d4a0404f1754a503899cb6 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Thu, 4 Oct 2018 21:44:05 -0700
Subject: [PATCH 166/529] Add SGX to docker (#1822)

---
 docker/Dockerfile.ci_cpu             |  4 ++++
 docker/install/install_tvm_cpu.sh    |  2 ++
 docker/install/ubuntu_install_sgx.sh | 28 ++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+)
 create mode 100644 docker/install/ubuntu_install_sgx.sh

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index a0313a566b11..a58c4c6461e4 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -22,3 +22,7 @@ RUN bash /install/ubuntu_install_redis.sh
 # Golang environment
 COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
 RUN bash /install/ubuntu_install_golang.sh
+
+# SGX deps
+COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
+RUN bash /install/ubuntu_install_sgx.sh
diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh
index 51593e66506e..461ad244d37c 100644
--- a/docker/install/install_tvm_cpu.sh
+++ b/docker/install/install_tvm_cpu.sh
@@ -6,6 +6,8 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
 echo set\(USE_BLAS openblas\) >> config.cmake
+echo set\(USE_SGX /opt/sgxsdk\) >> config.cmake
+echo set\(RUST_SGX_SDK /opt/rust-sgx-sdk\) >> config.cmake
 mkdir -p build
 cd build
 cmake ..
diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh
new file mode 100644
index 000000000000..7ed7ede87927
--- /dev/null
+++ b/docker/install/ubuntu_install_sgx.sh
@@ -0,0 +1,28 @@
+apt-get update -qq && apt-get install -qq \
+    # for SGX SDK
+    build-essential git cmake \
+    wget python pkg-config software-properties-common \
+    autoconf automake libtool ocaml ocamlbuild \
+    # for SGX PSW
+    libssl-dev libcurl4-openssl-dev
+
+git clone https://github.com/intel/linux-sgx.git
+cd linux-sgx
+git checkout sgx_2.2
+curl 'https://gist.github.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb' | git am
+./download_prebuilt.sh
+make -j sdk && make -j sdk_install_pkg
+./linux/installer/bin/sgx_linux_x64_sdk_2.2.100.45311.bin --prefix /opt
+cd -
+
+git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk
+cd /opt/rust-sgx-sdk
+git checkout bdd75ca05f66d1f5df637182ec335970f769b03a
+cd -
+
+curl -sSo rustup.sh 'https://sh.rustup.rs'
+# rustc nightly-2018-08-25 is the version supported by the above version of rust-sgx-sdk
+bash rustup.sh -y --no-modify-path --default-toolchain nightly-2018-08-25
+rustup component add rust-src
+cargo install rustfmt-nightly --force
+cargo install xargo

From 4ddadd6fa3094bea3c18e79aaaffcb68d5109387 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 4 Oct 2018 22:54:31 -0700
Subject: [PATCH 167/529] [DOCKER] Fix CI script (#1826)

---
 docker/Dockerfile.ci_cpu              |  7 +++++++
 docker/Dockerfile.demo_opencl         |  6 +++---
 docker/install/ubuntu_install_rust.sh |  9 +++++++++
 docker/install/ubuntu_install_sgx.sh  | 15 +++------------
 4 files changed, 22 insertions(+), 15 deletions(-)
 create mode 100644 docker/install/ubuntu_install_rust.sh

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index a58c4c6461e4..60d811344b07 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -23,6 +23,13 @@ RUN bash /install/ubuntu_install_redis.sh
 COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
 RUN bash /install/ubuntu_install_golang.sh
 
+# Rust env
+COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
+RUN bash /install/ubuntu_install_rust.sh
+
 # SGX deps
 COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
 RUN bash /install/ubuntu_install_sgx.sh
+
+
+ENV PATH $PATH:/root/.cargo/bin
diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl
index 6d54325050ae..460b901bf08f 100644
--- a/docker/Dockerfile.demo_opencl
+++ b/docker/Dockerfile.demo_opencl
@@ -13,14 +13,14 @@ RUN apt-get update
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
 
 RUN echo "Installing utility libraries"
-RUN apt-get install -y apt-utils
+RUN apt-get install -y apt-utils sudo
 RUN apt-get install -y cmake g++ llvm
 RUN apt-get install -y git
 # make wget unzip libtinfo-dev libz-dev libcurl4-openssl-dev
 RUN apt-get install -y libopenblas-dev
 
 # RUN echo "Installing gtest"
-# RUN apt-get install -y libgtest-dev 
+# RUN apt-get install -y libgtest-dev
 # RUN cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib
 
 RUN echo "Installing Python"
@@ -35,7 +35,7 @@ RUN apt-get install -y libviennacl-dev mesa-opencl-icd ocl-icd-opencl-dev clinfo
 RUN apt-get install -y libclblas-dev libclfft-dev libclsparse-dev
 
 RUN echo "Installing OpenGL libraries"
-RUN apt-get install -y libcogl-dev libegl1 libgles1 libglfw3-dev 
+RUN apt-get install -y libcogl-dev libegl1 libgles1 libglfw3-dev
 # libglew-dev
 
 RUN echo "Upgrading dependencies"
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
new file mode 100644
index 000000000000..1d17b66164c9
--- /dev/null
+++ b/docker/install/ubuntu_install_rust.sh
@@ -0,0 +1,9 @@
+apt-get update && apt-get install -y --no-install-recommends --force-yes curl
+
+curl -sSo rustup.sh 'https://sh.rustup.rs'
+# rustc nightly-2018-08-25 is the version supported by the above version of rust-sgx-sdk
+bash rustup.sh -y --no-modify-path --default-toolchain nightly-2018-08-25
+. $HOME/.cargo/env
+rustup component add rust-src
+cargo install rustfmt-nightly --force
+cargo install xargo
diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh
index 7ed7ede87927..917fd4b55954 100644
--- a/docker/install/ubuntu_install_sgx.sh
+++ b/docker/install/ubuntu_install_sgx.sh
@@ -1,10 +1,8 @@
-apt-get update -qq && apt-get install -qq \
-    # for SGX SDK
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
     build-essential git cmake \
     wget python pkg-config software-properties-common \
-    autoconf automake libtool ocaml ocamlbuild \
-    # for SGX PSW
-    libssl-dev libcurl4-openssl-dev
+    autoconf automake libtool ocaml \
+    libssl-dev libcurl4-openssl-dev curl
 
 git clone https://github.com/intel/linux-sgx.git
 cd linux-sgx
@@ -19,10 +17,3 @@ git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk
 cd /opt/rust-sgx-sdk
 git checkout bdd75ca05f66d1f5df637182ec335970f769b03a
 cd -
-
-curl -sSo rustup.sh 'https://sh.rustup.rs'
-# rustc nightly-2018-08-25 is the version supported by the above version of rust-sgx-sdk
-bash rustup.sh -y --no-modify-path --default-toolchain nightly-2018-08-25
-rustup component add rust-src
-cargo install rustfmt-nightly --force
-cargo install xargo

From bef797ade05b860790c500ae746e565692ec39ed Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Fri, 5 Oct 2018 10:32:57 -0700
Subject: [PATCH 168/529] Fix dmlc-core path in nnvm Makefile (#1829)

---
 nnvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/Makefile b/nnvm/Makefile
index 4ebd9ac95b70..8392aadc3f2d 100644
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -18,7 +18,7 @@ CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/3rdparty/dlpack/include -I$(TVMPATH)
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
 else
-  CFLAGS += -I$(ROOTDIR)/../dmlc-core/include
+  CFLAGS += -I$(TVMPATH)/3rdparty/dmlc-core/include
 endif
 
 ifneq ($(ADD_CFLAGS), NONE)

From 53c0232cd224875a67393d0d8facc12927b6d183 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 5 Oct 2018 14:16:47 -0400
Subject: [PATCH 169/529] [RELAY][OP] expand_dims (#1819)

---
 docs/langref/relay_op.rst            |  1 +
 include/tvm/relay/attrs/transform.h  | 35 ++++++++++++
 python/tvm/relay/__init__.py         |  1 +
 python/tvm/relay/op/__init__.py      |  1 +
 python/tvm/relay/op/transform.py     | 28 ++++++++++
 src/relay/op/tensor/transform.cc     | 81 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level1.py | 20 +++++++
 topi/include/topi/transform.h        | 10 +++-
 8 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 include/tvm/relay/attrs/transform.h
 create mode 100644 python/tvm/relay/op/transform.py
 create mode 100644 src/relay/op/tensor/transform.cc
 create mode 100644 tests/python/relay/test_op_level1.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 1b99e0f4d9f2..818491c7a79a 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -26,6 +26,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.sqrt
    tvm.relay.exp
    tvm.relay.add
+   tvm.relay.expand_dims
 
 **Level 2: Convolutions**
 
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
new file mode 100644
index 000000000000..b14e8f22722e
--- /dev/null
+++ b/include/tvm/relay/attrs/transform.h
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/transform.h
+ * \brief Transform operators.
+ */
+#ifndef TVM_RELAY_ATTRS_TRANSFORM_H_
+#define TVM_RELAY_ATTRS_TRANSFORM_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes used in expand_dims operators */
+struct ExpandDimsAttrs : public tvm::AttrsNode<ExpandDimsAttrs> {
+  int axis;
+  int num_newaxis;
+
+  TVM_DECLARE_ATTRS(ExpandDimsAttrs, "relay.attrs.ExpandDimsAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis at which the input array is expanded."
+                  "Should lie in range `[-data.ndim - 1, data.ndim]`."
+                  "If `axis < 0`, it is the first axis inserted;"
+                  "If `axis >= 0`, it is the last axis inserted in Python's negative indexing.");
+    TVM_ATTR_FIELD(num_newaxis)
+        .describe("Number of axises to be inserted. Should be >= 0.")
+        .set_lower_bound(0)
+        .set_default(1);
+  }
+};  // struct ExpandDimsAttrs
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 1d9141fc9ab1..c1299636eed2 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -11,6 +11,7 @@
 from .op import Op
 from .op.tensor import *
 from . import nn
+from .op.transform import *
 
 # Span
 Span = base.Span
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 4e6314001394..444dc74a31cb 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -6,6 +6,7 @@
 # Operators
 from .tensor import *
 from . import nn
+from .transform import *
 
 
 # operator registry
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
new file mode 100644
index 000000000000..21f61735e58a
--- /dev/null
+++ b/python/tvm/relay/op/transform.py
@@ -0,0 +1,28 @@
+"""Transform operators."""
+
+from . import _make
+
+
+def expand_dims(data, axis, num_newaxis=1):
+    """Insert `num_newaxis` axises at the position given by `axis`.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axis : int
+        The axis at which the input array is expanded.
+        Should lie in range `[-data.ndim - 1, data.ndim]`.
+        If `axis < 0`, it is the first axis inserted;
+        If `axis >= 0`, it is the last axis inserted in Python's negative indexing.
+
+    num_newaxis : int
+        Number of axises to be inserted. Should be >= 0.
+
+    Returns
+    -------
+    result : relay.Expr
+        The reshaped result.
+    """
+    return _make.expand_dims(data, axis, num_newaxis)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
new file mode 100644
index 000000000000..61db1f90ae39
--- /dev/null
+++ b/src/relay/op/tensor/transform.cc
@@ -0,0 +1,81 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file transform.cc
+ * \brief Transform operators.
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/transform.h>
+#include <vector>
+
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
+
+bool ExpandDimsRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // `types` contains: [data, output]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const ExpandDimsAttrs* param = attrs.as<ExpandDimsAttrs>();
+  const int ndim = static_cast<int>(data->shape.size());
+  const int axis = param->axis;
+  const int num_newaxis = param->num_newaxis;
+  CHECK(num_newaxis >= 0)
+    << "expand_dims only accepts `num_newaxis >= 0`"
+    << ", but got num_newaxis = " << num_newaxis;
+  CHECK(-ndim - 1 <= axis && axis <= ndim)
+    << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]"
+    << ", but got axis = " << axis
+    << ", and data.ndim = " << ndim;
+  const int pivot = axis < 0 ? ndim + axis + 1 : axis;
+  std::vector<IndexExpr> oshape;
+  oshape.reserve(ndim + num_newaxis);
+  for (int i = 0; i < pivot; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  for (int i = 0; i < num_newaxis; ++i) {
+    oshape.emplace_back(1);
+  }
+  for (int i = pivot; i < ndim; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeExpandDims(Expr data,
+                    int axis,
+                    int num_newaxis) {
+  auto attrs = make_node<ExpandDimsAttrs>();
+  attrs->axis = axis;
+  attrs->num_newaxis = num_newaxis;
+  static const Op& op = Op::Get("expand_dims");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.expand_dims")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeExpandDims, args, rv);
+});
+
+RELAY_REGISTER_OP("expand_dims")
+.describe(R"code(Insert `num_newaxis` axises at the position given by `axis`
+
+- **data**: The input data to the operator.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("ExpandDims", ExpandDimsRel);
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
new file mode 100644
index 000000000000..c1c8b03c1c23
--- /dev/null
+++ b/tests/python/relay/test_op_level1.py
@@ -0,0 +1,20 @@
+import tvm
+from tvm import relay
+
+
+def test_expand_dims_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    # let's mimic a batch of sequences
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.expand_dims(x, axis=2))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type()
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t, 1, 100), "float32")
+
+
+if __name__ == "__main__":
+    test_expand_dims_infer_type()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 6dbdbe5574f4..e4e646453cca 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -37,10 +37,18 @@ inline Tensor expand_dims(const Tensor& x,
                           int num_newaxis = 1,
                           std::string name = "tensor",
                           std::string tag = kBroadcast) {
+  int ndim = static_cast<int>(x->shape.size());
   if (axis < 0) {
     // Calculate offset from last dimension
-    axis = static_cast<int>(x->shape.size()) + axis + 1;
+    axis = ndim + axis + 1;
   }
+  CHECK(-ndim - 1 <= axis && axis <= ndim)
+    << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]"
+    << ", but got axis = " << axis
+    << ", and data.ndim = " << ndim;
+  CHECK(num_newaxis >= 0)
+    << "expand_dims only accepts `num_newaxis >= 0`"
+    << ", but got num_newaxis = " << num_newaxis;
 
   Array<Expr> new_shape;
   for (size_t i = 0; i < static_cast<size_t>(axis); ++i) {

From 17d7284a56995c77918964cfe26337efc33b8b03 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 5 Oct 2018 15:57:24 -0400
Subject: [PATCH 170/529] [RELAY][OP] comparison (#1824)

---
 docs/langref/relay_op.rst            |   9 +++
 python/tvm/relay/op/tensor.py        | 105 +++++++++++++++++++++++++++
 src/relay/op/tensor/elemwise.cc      |  33 +++++----
 tests/python/relay/test_op_level4.py |  24 ++++++
 4 files changed, 158 insertions(+), 13 deletions(-)
 create mode 100644 tests/python/relay/test_op_level4.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 818491c7a79a..acc93bc5f4a6 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -42,6 +42,15 @@ This level enables typical convnet models.
 
 **Level 4: Broadcast and Reductions**
 
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.equal
+   tvm.relay.not_equal
+   tvm.relay.greater
+   tvm.relay.greater_equal
+   tvm.relay.less
+   tvm.relay.less_equal
 
 **Level 5: Vision/Image Operators**
 
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 61addd06c553..b425ff8f7537 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -104,10 +104,115 @@ def subtract(lhs, rhs):
     return _make.subtract(lhs, rhs)
 
 
+
 def equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs == rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
     return _make.equal(lhs, rhs)
 
 
+def not_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs != rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.not_equal(lhs, rhs)
+
+
+def less(lhs, rhs):
+    """Broadcasted elementwise test for (lhs < rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.less(lhs, rhs)
+
+
+def less_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs <= rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.less_equal(lhs, rhs)
+
+
+def greater(lhs, rhs):
+    """Broadcasted elementwise test for (lhs > rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.greater(lhs, rhs)
+
+
+def greater_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs >= rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.greater_equal(lhs, rhs)
+
+
 def concat(*args):
     """Concatenate the input tensors along the zero axis.
 
diff --git a/src/relay/op/tensor/elemwise.cc b/src/relay/op/tensor/elemwise.cc
index 8c1823114f44..738f0f0f1f89 100644
--- a/src/relay/op/tensor/elemwise.cc
+++ b/src/relay/op/tensor/elemwise.cc
@@ -106,19 +106,26 @@ RELAY_REGISTER_OP("subtract")
   // input2: Tensor[dtype, s2]
   // output: Tensor[dtype, broadcast(s1, s2)]
 
-// Equality Comparison
-TVM_REGISTER_API("relay.op._make.equal")
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
-      static const Op& op = Op::Get("equal");
-    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
-  });
-
-RELAY_REGISTER_OP("equal")
-  .set_num_inputs(2)
-  .add_argument("lhs", "Tensor", "The left hand side tensor.")
-  .add_argument("rhs", "Tensor", "The right hand side tensor.")
-  .set_support_level(1)
-  .add_type_rel("BroadcastComp", BroadcastCompRel);
+// Comparisons
+#define RELAY_REGISTER_CMP_OP(OpName, SupportLevel)                 \
+  TVM_REGISTER_API("relay.op._make." OpName)                        \
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
+      static const Op& op = Op::Get(OpName);                        \
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});             \
+  });                                                               \
+  RELAY_REGISTER_OP(OpName)                                         \
+    .set_num_inputs(2)                                              \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
+    .set_support_level(SupportLevel)                                \
+    .add_type_rel("BroadcastComp", BroadcastCompRel);
+
+RELAY_REGISTER_CMP_OP("equal", 4);
+RELAY_REGISTER_CMP_OP("not_equal", 4);
+RELAY_REGISTER_CMP_OP("less", 4);
+RELAY_REGISTER_CMP_OP("less_equal", 4);
+RELAY_REGISTER_CMP_OP("greater", 4);
+RELAY_REGISTER_CMP_OP("greater_equal", 4);
 
 // Concat
 TVM_REGISTER_API("relay.op._make.concat")
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
new file mode 100644
index 000000000000..726bc7623a2a
--- /dev/null
+++ b/tests/python/relay/test_op_level4.py
@@ -0,0 +1,24 @@
+import tvm
+from tvm import relay
+
+
+def test_cmp_type():
+    for op in (relay.greater,
+               relay.greater_equal,
+               relay.less,
+               relay.less_equal,
+               relay.equal,
+               relay.not_equal):
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((10, 4), "float32"))
+        y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
+        with ib.function(x, y) as func:
+            ib.ret(op(x.var, y.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type()
+        assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
+
+
+if __name__ == "__main__":
+    test_cmp_type()

From 589f244581cb3cf6657d9cbdf7f7354604dc8374 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 5 Oct 2018 13:50:22 -0700
Subject: [PATCH 171/529] [OP] right_shift (#1832)

---
 python/tvm/relay/op/tensor.py        |  18 ++++
 src/relay/op/tensor/binary.cc        |  60 +++++++++++
 src/relay/op/tensor/elemwise.cc      | 144 ---------------------------
 src/relay/op/tensor/unary.cc         |  82 +++++++++++++++
 tests/python/relay/test_op_level4.py |  14 +++
 5 files changed, 174 insertions(+), 144 deletions(-)
 create mode 100644 src/relay/op/tensor/binary.cc
 delete mode 100644 src/relay/op/tensor/elemwise.cc
 create mode 100644 src/relay/op/tensor/unary.cc

diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index b425ff8f7537..c8c42c1a6ca4 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -213,6 +213,24 @@ def greater_equal(lhs, rhs):
     return _make.greater_equal(lhs, rhs)
 
 
+def right_shift(lhs, rhs):
+    """Right shift with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.right_shift(lhs, rhs)
+
+
 def concat(*args):
     """Concatenate the input tensors along the zero axis.
 
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
new file mode 100644
index 000000000000..4c0fa657bac4
--- /dev/null
+++ b/src/relay/op/tensor/binary.cc
@@ -0,0 +1,60 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file binary.cc
+ * \brief binary broadcast operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+#define RELAY_REGISTER_BINARY_OP(OpName)                               \
+  TVM_REGISTER_API("relay.op._make." OpName)                           \
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {           \
+      static const Op& op = Op::Get(OpName);                           \
+      return CallNode::make(op, {lhs, rhs}, Attrs(), {});              \
+    });                                                                \
+  RELAY_REGISTER_OP(OpName)                                            \
+  .set_num_inputs(2)                                                   \
+  .add_argument("lhs", "Tensor", "The left hand side tensor.")         \
+  .add_argument("rhs", "Tensor", "The right hand side tensor.")        \
+  .add_type_rel("Broadcast", BroadcastRel)
+
+// Addition
+RELAY_REGISTER_BINARY_OP("add")
+.describe("Elementwise add with with broadcasting")
+.set_support_level(1);
+
+RELAY_REGISTER_BINARY_OP("subtract")
+.describe("Elementwise substract with broadcasting")
+.set_support_level(1);
+
+RELAY_REGISTER_BINARY_OP("right_shift")
+.describe("Elementwise right shift with broadcasting")
+.set_support_level(4);
+
+// Comparisons
+#define RELAY_REGISTER_CMP_OP(OpName, SupportLevel)                 \
+  TVM_REGISTER_API("relay.op._make." OpName)                        \
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
+      static const Op& op = Op::Get(OpName);                        \
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});             \
+  });                                                               \
+  RELAY_REGISTER_OP(OpName)                                         \
+    .set_num_inputs(2)                                              \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
+    .set_support_level(SupportLevel)                                \
+    .add_type_rel("BroadcastComp", BroadcastCompRel);
+
+RELAY_REGISTER_CMP_OP("equal", 4);
+RELAY_REGISTER_CMP_OP("not_equal", 4);
+RELAY_REGISTER_CMP_OP("less", 4);
+RELAY_REGISTER_CMP_OP("less_equal", 4);
+RELAY_REGISTER_CMP_OP("greater", 4);
+RELAY_REGISTER_CMP_OP("greater_equal", 4);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/elemwise.cc b/src/relay/op/tensor/elemwise.cc
deleted file mode 100644
index 738f0f0f1f89..000000000000
--- a/src/relay/op/tensor/elemwise.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file elemwise.cc
- * \brief Elementwise operators.
- */
-#include <tvm/relay/expr.h>
-#include <tvm/relay/op.h>
-#include "../type_relations.h"
-
-namespace tvm {
-namespace relay {
-
-// Quick helper macro
-// - Expose a positional make function to construct the node.
-// - Register op to the registry.
-//
-// We make the decision to always only expose positional argument.
-// We will do rewrapping in the frontend to support language
-// sugars such as keyword arguments and default value.
-//
-#define RELAY_REGISTER_UNARY_OP(OpName)               \
-  TVM_REGISTER_API("relay.op._make." OpName)          \
-  .set_body_typed<Expr(Expr)>([](Expr data) {         \
-      static const Op& op = Op::Get(OpName);          \
-    return CallNode::make(op, {data}, Attrs(), {});   \
-    });                                               \
-  RELAY_REGISTER_OP(OpName)                           \
-  .set_num_inputs(1)                                  \
-  .add_argument("data", "Tensor", "The input tensor.")
-
-
-RELAY_REGISTER_UNARY_OP("log")
-.describe(R"code(Returns the log input array, computed element-wise.
-
-.. math::
-   log(x)
-
-)code" TVM_ADD_FILELINE)
-.set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
-
-// data : Tensor[shape, dtype]
-// result: Tensor[shape, dtype]
-
-
-RELAY_REGISTER_UNARY_OP("exp")
-.describe(R"code(Returns the exp input array, computed element-wise.
-
-.. math::
-   \exp(x)
-
-)code" TVM_ADD_FILELINE)
-.set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
-
-
-RELAY_REGISTER_UNARY_OP("sqrt")
-.describe(R"code(Returns the sqrt input array, computed element-wise.
-
-.. math::
-   sqrt(x)
-
-)code" TVM_ADD_FILELINE)
-.set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
-
-// Addition
-TVM_REGISTER_API("relay.op._make.add")
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
-      static const Op& op = Op::Get("add");
-    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
-  });
-
-RELAY_REGISTER_OP("add")
-  .set_num_inputs(2)
-  .add_argument("lhs", "Tensor", "The left hand side tensor.")
-  .add_argument("rhs", "Tensor", "The right hand side tensor.")
-  .set_support_level(1)
-  .add_type_rel("Broadcast", BroadcastRel);
-
-  // def broadcast(s1, s2):
-  // ...
-  //
-  // input1: Tensor[dtype, s1]
-  // input2: Tensor[dtype, s2]
-  // output: Tensor[dtype, broadcast(s1, s2)]
-
-// Addition
-TVM_REGISTER_API("relay.op._make.subtract")
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
-      static const Op& op = Op::Get("subtract");
-    return CallNode::make(op, {lhs, rhs}, Attrs(), {});
-  });
-
-RELAY_REGISTER_OP("subtract")
-  .set_num_inputs(2)
-  .add_argument("lhs", "Tensor", "The left hand side tensor.")
-  .add_argument("rhs", "Tensor", "The right hand side tensor.")
-  .set_support_level(1)
-  .add_type_rel("Broadcast", BroadcastRel);
-
-  // def broadcast(s1, s2):
-  // ...
-  //
-  // input1: Tensor[dtype, s1]
-  // input2: Tensor[dtype, s2]
-  // output: Tensor[dtype, broadcast(s1, s2)]
-
-// Comparisons
-#define RELAY_REGISTER_CMP_OP(OpName, SupportLevel)                 \
-  TVM_REGISTER_API("relay.op._make." OpName)                        \
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
-      static const Op& op = Op::Get(OpName);                        \
-    return CallNode::make(op, {lhs, rhs}, Attrs(), {});             \
-  });                                                               \
-  RELAY_REGISTER_OP(OpName)                                         \
-    .set_num_inputs(2)                                              \
-    .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
-    .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
-    .set_support_level(SupportLevel)                                \
-    .add_type_rel("BroadcastComp", BroadcastCompRel);
-
-RELAY_REGISTER_CMP_OP("equal", 4);
-RELAY_REGISTER_CMP_OP("not_equal", 4);
-RELAY_REGISTER_CMP_OP("less", 4);
-RELAY_REGISTER_CMP_OP("less_equal", 4);
-RELAY_REGISTER_CMP_OP("greater", 4);
-RELAY_REGISTER_CMP_OP("greater_equal", 4);
-
-// Concat
-TVM_REGISTER_API("relay.op._make.concat")
-  .set_body_typed<Expr(Expr)>([](Expr tuple) {
-      static const Op& op = Op::Get("concat");
-    return CallNode::make(op, { tuple }, Attrs(), {});
-  });
-
-RELAY_REGISTER_OP("concat")
-  .set_num_inputs(1)
-  .add_argument("tuple", "Tuple", "The tupled tensor arguments.")
-  .set_support_level(1)
-  .add_type_rel("Concat", ConcatRel);
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
new file mode 100644
index 000000000000..798d4aa791ad
--- /dev/null
+++ b/src/relay/op/tensor/unary.cc
@@ -0,0 +1,82 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file unary.cc
+ * \brief Unary operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+// Quick helper macro
+// - Expose a positional make function to construct the node.
+// - Register op to the registry.
+//
+// We make the decision to always only expose positional argument.
+// We will do rewrapping in the frontend to support language
+// sugars such as keyword arguments and default value.
+//
+#define RELAY_REGISTER_UNARY_OP(OpName)               \
+  TVM_REGISTER_API("relay.op._make." OpName)          \
+  .set_body_typed<Expr(Expr)>([](Expr data) {         \
+      static const Op& op = Op::Get(OpName);          \
+    return CallNode::make(op, {data}, Attrs(), {});   \
+    });                                               \
+  RELAY_REGISTER_OP(OpName)                           \
+  .set_num_inputs(1)                                  \
+  .add_argument("data", "Tensor", "The input tensor.")
+
+
+RELAY_REGISTER_UNARY_OP("log")
+.describe(R"code(Returns the log input array, computed element-wise.
+
+.. math::
+   log(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+// data : Tensor[shape, dtype]
+// result: Tensor[shape, dtype]
+
+
+RELAY_REGISTER_UNARY_OP("exp")
+.describe(R"code(Returns the exp input array, computed element-wise.
+
+.. math::
+   \exp(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+
+RELAY_REGISTER_UNARY_OP("sqrt")
+.describe(R"code(Returns the sqrt input array, computed element-wise.
+
+.. math::
+   sqrt(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+
+// Concat
+TVM_REGISTER_API("relay.op._make.concat")
+  .set_body_typed<Expr(Expr)>([](Expr tuple) {
+      static const Op& op = Op::Get("concat");
+    return CallNode::make(op, { tuple }, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("concat")
+.set_num_inputs(1)
+.add_argument("tuple", "Tuple", "The tupled tensor arguments.")
+.set_support_level(1)
+.add_type_rel("Concat", ConcatRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 726bc7623a2a..5009994871f7 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -20,5 +20,19 @@ def test_cmp_type():
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
 
 
+def test_binary_broadcast():
+    for op in [relay.right_shift]:
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((10, 4), "int32"))
+        y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
+        with ib.function(x, y) as func:
+            ib.ret(op(x.var, y.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type()
+        assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
+
+
 if __name__ == "__main__":
     test_cmp_type()
+    test_binary_broadcast()

From 49dcdb06b4db45eab00d7c68bbba98ef8fcc4add Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Fri, 5 Oct 2018 13:53:17 -0700
Subject: [PATCH 172/529] [Relay] Add Let list, a helper datastructure to relay
 (#1827)

---
 src/relay/pass/let_list.h | 126 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 src/relay/pass/let_list.h

diff --git a/src/relay/pass/let_list.h b/src/relay/pass/let_list.h
new file mode 100644
index 000000000000..d13358fe0e30
--- /dev/null
+++ b/src/relay/pass/let_list.h
@@ -0,0 +1,126 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file let_list.h
+ * \brief LetList record let binding and insert let expression implicitly.
+ *  using it, one can treat AST as value instead of expression,
+ *  and pass them around freely without fear of AST explosion (or effect duplication).
+ *  for example, if one write 'b = a + a; c = b + b; d = c + c', the AST will contain 8 'a'.
+ *  if one instead write 'b = ll.Push(a + a); c = ll.Push(b + b); d = ll.Get(c + c);',
+ *  the AST will contain 2 'a', as b and c are now variables.
+ */
+#ifndef TVM_RELAY_PASS_LET_LIST_H_
+#define TVM_RELAY_PASS_LET_LIST_H_
+
+#include <tvm/relay/expr.h>
+#include <utility>
+#include <vector>
+#include <tuple>
+#include "tvm/relay/type.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief LetList allow you to transform expression into variables, so you can copy them around.
+ *  one can insert into the LetList by calling Push, and wrap an expression with bindings with Get.
+ *  additionally, there is the 'With' function, which automatically call Get.
+ */
+class LetList {
+ public:
+  /*! \brief insert a binding.
+   *
+   *  \param pv the var of the binding.
+   *
+   *  \param ty the type of the binding.
+   *
+   *  \param expr the value of the binding.
+   *
+   *  \return a Var that hold the inserted expr.
+   */
+  Var Push(const Var& pv, const Type& ty, const Expr& expr) {
+    std::tuple<Var, Type, Expr> tuple(pv, ty, expr);
+    lets_.push_back(tuple);
+    return pv;
+  }
+
+  /*! \brief insert a binding.
+   *
+   *  \param ty the type of the binding.
+   *
+   *  \param expr the value of the binding.
+   *
+   *  \return a Var that hold the inserted expr.
+   */
+  Var Push(const Type& ty, const Expr& expr) {
+    return Push(VarNode::make("x"), ty, expr);
+  }
+
+  /*! \brief insert a binding.
+   *
+   *  \param pv the var of the binding.
+   *
+   *  \param expr the value of the binding.
+   *
+   *  \return a Var that hold the inserted expr.
+   */
+  Var Push(const Var& pv, const Expr& expr) {
+    return Push(pv, IncompleteTypeNode::make(TypeParamNode::kType), expr);
+  }
+
+  /*! \brief insert a binding.
+   *
+   *  \param expr the value of the binding.
+   *
+   *  \return a Var that hold the inserted expr.
+   */
+  Var Push(const Expr& expr) {
+    return Push(IncompleteTypeNode::make(TypeParamNode::kType), expr);
+  }
+
+  /*! \brief wrap an expr around the LetList.
+   *
+   *  \param body the Expression to be wrapped around.
+   *
+   *  \return the wrapped expr.
+   */
+  Expr Get(const Expr& body) const {
+    Expr ret = body;
+    for (auto rit = lets_.rbegin(); rit != lets_.rend(); ++rit) {
+      ret = LetNode::make(std::get<0>(*rit), std::get<2>(*rit), ret, std::get<1>(*rit));
+    }
+    return ret;
+  }
+
+  /*! \brief generate an LetList and wrap the result automatically.
+   *
+   *  \param f a function that generate the unwrapped Expr.
+   *
+   *  \code
+   *  // Example code that generate `16 * a` using 4 plus instead of 15 plus.
+   *  Expr mult_sixteen(const Var& a) {
+   *    Op plus = Op::Get("plus");
+   *    // Automatically call Get with LetList::With
+   *    return LetList::With([&](LetList* ll) {
+   *      // Turn a call to plus into a variable to avoid duplication of code
+   *      Var b = ll->Push(CallNode::make(plus, {a, a}));
+   *      Var c = ll->Push(CallNode::make(plus, {b, b}));
+   *      Var d = ll->Push(CallNode::make(plus, {c, c}));
+   *      return CallNode::make(plus, {d, d});
+   *    });
+   *  }
+   *  \endcode
+   *
+   *  \return the wrapped Expr.
+   */
+  template<typename F>
+  static Expr With(F&& f) {
+    LetList ll;
+    return ll.Get(f(&ll));
+  }
+
+ private:
+  std::vector<std::tuple<Var, Type, Expr> > lets_;
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_LET_LIST_H_

From dff36f74f79a96efb9de01eff06dc8bb7b2042a2 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 5 Oct 2018 13:58:02 -0700
Subject: [PATCH 173/529] [DOCS] Add docs of logical snd right shift (#1834)

---
 docs/langref/relay_op.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index acc93bc5f4a6..bcaece0bf0a1 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -45,6 +45,7 @@ This level enables typical convnet models.
 .. autosummary::
    :nosignatures:
 
+   tvm.relay.right_shift
    tvm.relay.equal
    tvm.relay.not_equal
    tvm.relay.greater
@@ -66,3 +67,14 @@ Level 1 Definitions
 Level 2 Definitions
 -------------------
 .. autofunction:: tvm.relay.nn.conv2d
+
+
+Level 4 Definitions
+-------------------
+.. autofunction:: tvm.relay.right_shift
+.. autofunction:: tvm.relay.equal
+.. autofunction:: tvm.relay.not_equal
+.. autofunction:: tvm.relay.greater
+.. autofunction:: tvm.relay.greater_equal
+.. autofunction:: tvm.relay.less
+.. autofunction:: tvm.relay.less_equal

From b65229a142eb5fc5c8a86ec3b96383522205c575 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Fri, 5 Oct 2018 14:35:02 -0700
Subject: [PATCH 174/529] [Relay][Doc] Correct bad formatting and typos in
 Relay operator addition doc (#1833)

---
 docs/dev/relay_add_op.rst | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst
index 6bd6e67700bd..751e938fe012 100644
--- a/docs/dev/relay_add_op.rst
+++ b/docs/dev/relay_add_op.rst
@@ -6,13 +6,14 @@ operators need to be registered in Relay in order to ensure
 that they will be integrated into Relay's type system.
 
 Registering an operator requires three steps:
+
 - Using the ``RELAY_REGISTER_OP`` macro in C++ to
 register the operator's arity and type information
 - Defining a C++ function to produce a call node for the
 operator and registering a Python API hook for the function
 - Wrapping the above Python API hook in a neater interface
 
-The file ``src/relay/op/tensor/elemwise.cc`` provides
+The file ``src/relay/op/tensor/binary.cc`` provides
 examples of the first two steps, while
 ``python/tvm/relay/op/tensor.py`` gives examples of the
 last.
@@ -35,7 +36,7 @@ output type.
 
 For example, see ``src/relay/op/type_relations.h`` and their
 implementations. E.g., ``BroadcastRel`` takes two input types and an
-output type, checks that they are all tensor types with the same underlyin
+output type, checks that they are all tensor types with the same underlying
 data type, and finally ensures that the shape of the output type is the
 broadcast of the input types' shapes.
 
@@ -44,6 +45,7 @@ if the existing ones do not capture the behavior of the desired operator.
 
 The ``RELAY_REGISTER_OP`` macro in C++ allows a developer
 to specify the following information about an operator in Relay:
+
 - Arity (number of arguments)
 - Names and descriptions for positional arguments
 - Support level (1 indicating an internal intrinsic, higher numbers
@@ -51,7 +53,7 @@ indicating operators that are not as integral to the framework or are
 supported externally)
 - A type relation for the operator
 
-The below example is from ``elemwise.cc`` and uses a broadcasting
+The below example is from ``binary.cc`` and uses a broadcasting
 add for tensors:
 
 .. code:: c
@@ -141,6 +143,7 @@ before producing the call node:
 
 Summary
 -------
+
 - A TVM operator can be registered in Relay using a relation to express
 the appropriate type information.
 - Using an operator in Relay requires a function to produce a

From 9279f49672a641b90a12a075cf6c9fa331fc313d Mon Sep 17 00:00:00 2001
From: reminisce <wujun.nju@gmail.com>
Date: Fri, 5 Oct 2018 19:00:53 -0700
Subject: [PATCH 175/529] Fix saveload json bug (#1831)

---
 nnvm/src/pass/saveload_json.cc                  |  3 +++
 .../python/unittest/test_pass_saveload_json.py  | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 nnvm/tests/python/unittest/test_pass_saveload_json.py

diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc
index 195d49bfb9b4..f1acb972158d 100644
--- a/nnvm/src/pass/saveload_json.cc
+++ b/nnvm/src/pass/saveload_json.cc
@@ -218,6 +218,9 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
     // rebuild attribute parser
     if (!no_parse && n.node->op() != nullptr && n.node->op()->attr_parser != nullptr) {
       n.node->op()->attr_parser(&(n.node->attrs));
+    } else if (!no_parse && n.node->is_variable()) {
+      n.node->attrs.parsed =
+        Symbol::CreateVariable(n.node->attrs.name).outputs[0].node->attrs.parsed;
     }
     for (const JSONGraph &subgraph : n.subgraphs) {
       // The "no_parse" option here, is to be compatible with
diff --git a/nnvm/tests/python/unittest/test_pass_saveload_json.py b/nnvm/tests/python/unittest/test_pass_saveload_json.py
new file mode 100644
index 000000000000..7b5f5ea6867a
--- /dev/null
+++ b/nnvm/tests/python/unittest/test_pass_saveload_json.py
@@ -0,0 +1,17 @@
+import nnvm
+from tvm.contrib import util
+
+
+def test_variable_node_parsed():
+    sym = nnvm.sym.Variable('data')
+    tempdir = util.tempdir()
+    json_filename = 'test_nnvm_symbol.json'
+    with open(tempdir.relpath(json_filename), 'w') as fo:
+        fo.write(nnvm.graph.create(sym).json())
+    sym_str = open(tempdir.relpath(json_filename), 'r').read()
+    sym = nnvm.graph.load_json(sym_str).symbol()
+    sym = nnvm.sym.relu(sym)
+
+
+if __name__ == '__main__':
+    test_variable_node_parsed()

From 39703e3c48e89d5234e05dccd3de5a920cb08ca7 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Fri, 5 Oct 2018 19:07:35 -0700
Subject: [PATCH 176/529] [Relay] More type alpha equality test coverage
 (#1823)

---
 python/tvm/relay/__init__.py             |   1 +
 src/relay/pass/alpha_eq.cc               |  43 +++++-
 tests/python/relay/test_ir_nodes.py      |   4 +-
 tests/python/relay/test_pass_alpha_eq.py | 167 ++++++++++++++++++++++-
 4 files changed, 204 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index c1299636eed2..318a4d45d66d 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -25,6 +25,7 @@
 TypeConstraint = ty.TypeConstraint
 FuncType = ty.FuncType
 TypeRelation = ty.TypeRelation
+IncompleteType = ty.IncompleteType
 
 # Expr
 Constant = expr.Constant
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 63ce834be7cf..39f55af6fe70 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -88,11 +88,23 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
 
   void VisitType_(const FuncTypeNode *op, const Type& t2) final {
     if (const FuncTypeNode *ta2 = t2.as<FuncTypeNode>()) {
-      if (op->arg_types.size() != ta2->arg_types.size()) {
+      if (op->arg_types.size() != ta2->arg_types.size()
+          || op->type_params.size() != ta2->type_params.size()
+          || op->type_constraints.size() != ta2->type_constraints.size()) {
         equal = false;
         return;
       }
 
+      // must visit params first so they are appropriate entered
+      // into equality map
+      for (size_t i = 0; i < op->type_params.size(); i++) {
+        eq_map.Set(op->type_params[i], ta2->type_params[i]);
+        this->VisitType(op->type_params[i], ta2->type_params[i]);
+        if (!equal) {
+          return;
+        }
+      }
+
       for (size_t i = 0; i < op->arg_types.size(); i++) {
         this->VisitType(op->arg_types[i], ta2->arg_types[i]);
         if (!equal) {
@@ -101,6 +113,16 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
       }
 
       this->VisitType(op->ret_type, ta2->ret_type);
+      if (!equal) {
+        return;
+      }
+
+      for (size_t i = 0; i < op->type_constraints.size(); i++) {
+        this->VisitType(op->type_constraints[i], ta2->type_constraints[i]);
+        if (!equal) {
+          return;
+        }
+      }
     } else {
       equal = false;
     }
@@ -108,7 +130,24 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
 
   void VisitType_(const TypeRelationNode *tr1, const Type& t2) final {
     if (const TypeRelationNode *tr2 = t2.as<TypeRelationNode>()) {
-      equal = tr1 == tr2;
+      if (tr1->func != tr2->func
+          || tr1->num_inputs != tr2->num_inputs
+          || tr1->attrs != tr2->attrs) {
+        equal = false;
+        return;
+      }
+
+      if (tr1->args.size() != tr2->args.size()) {
+        equal = false;
+        return;
+      }
+
+      for (size_t i = 0; i < tr1->args.size(); i++) {
+        this->VisitType(tr1->args[i], tr2->args[i]);
+        if (!equal) {
+          return;
+        }
+      }
     } else {
       equal = false;
     }
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index 4505710c06cc..a94e035e2fef 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -65,8 +65,8 @@ def test_type_relation():
     args = tvm.convert([tf, tt, tp])
 
     num_inputs = 2
-    func = None
-    attrs = None
+    func = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    attrs = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
 
     tr = relay.TypeRelation(func, args, num_inputs, attrs)
     assert tr.args == args
diff --git a/tests/python/relay/test_pass_alpha_eq.py b/tests/python/relay/test_pass_alpha_eq.py
index 40140ea486a1..d925b54d47d2 100644
--- a/tests/python/relay/test_pass_alpha_eq.py
+++ b/tests/python/relay/test_pass_alpha_eq.py
@@ -1,17 +1,170 @@
 import tvm
 from tvm import relay
 
-def test_type_alpha_eq():
-    t1 = relay.ty.TensorType((3, 4), "float32")
-    t2 = relay.ty.TensorType((3, 4), "float32")
-    t3 = relay.ty.TensorType((3, 4, 5), "float32")
+
+def test_tensor_type_alpha_eq():
+    t1 = relay.TensorType((3, 4), "float32")
+    t2 = relay.TensorType((3, 4), "float32")
+    t3 = relay.TensorType((3, 4, 5), "float32")
     assert t1 == t2
     assert t1 != t3
 
-    t1 = relay.ty.TensorType((), "float32")
-    t2 = relay.ty.TensorType((), "float32")
+    t1 = relay.TensorType((), "float32")
+    t2 = relay.TensorType((), "float32")
     assert t1 == t2
 
 
+def test_incomplete_type_alpha_eq():
+    t1 = relay.IncompleteType(relay.Kind.Shape)
+    t2 = relay.IncompleteType(relay.Kind.Type)
+    t3 = relay.IncompleteType(relay.Kind.Type)
+
+    # only equal when there is pointer equality
+    assert t2 == t2
+    assert t1 == t1
+    assert t1 != t2
+    assert t2 != t3
+
+
+def test_type_param_alpha_eq():
+    t1 = relay.TypeParam("v1", relay.Kind.Type)
+    t2 = relay.TypeParam("v2", relay.Kind.Shape)
+    t3 = relay.TypeParam("v3", relay.Kind.Type)
+
+    # only pointer equality and eq_map allow equal params
+    assert t1 == t1
+    assert t2 == t2
+    assert t1 != t2 # different kind
+    assert t1 != t3 # not in eq_map
+
+    # function types are the only way to put type params
+    # in eq map
+    ft1 = relay.FuncType(tvm.convert([]), t1, tvm.convert([t1]), tvm.convert([]))
+    ft2 = relay.FuncType(tvm.convert([]), t3, tvm.convert([t3]), tvm.convert([]))
+    # actually an invalid type because t2 is wrong kind
+    ft3 = relay.FuncType(tvm.convert([]), t2, tvm.convert([t2]), tvm.convert([]))
+
+    assert ft1 == ft2
+    assert ft1 != ft3 # kinds still do not match
+
+
+def test_func_type_alpha_eq():
+    t1 = relay.TensorType((1, 2), "float32")
+    t2 = relay.TensorType((1, 2, 3), "float32")
+
+    tp1 = relay.TypeParam("v1", relay.Kind.Type)
+    tp2 = relay.TypeParam("v2", relay.Kind.Type)
+    tp3 = relay.TypeParam("v3", relay.Kind.Shape)
+    tp4 = relay.TypeParam("v3", relay.Kind.Shape)
+
+    broadcast = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    identity = tvm.get_env_func("tvm.relay.type_relation.Identity")
+
+    tr1 = relay.TypeRelation(broadcast, tvm.convert([tp1, tp3]), 1, None)
+    tr2 = relay.TypeRelation(broadcast, tvm.convert([tp2, tp4]), 1, None)
+    tr3 = relay.TypeRelation(identity, tvm.convert([tp1, tp3]), 1, None)
+
+    ft = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    translate_vars = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp2, tp4]),
+                         tvm.convert([tr2]))
+    assert ft == translate_vars
+
+    different_args = relay.FuncType(tvm.convert([t1]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != different_args
+
+    different_order = relay.FuncType(tvm.convert([t2, t1]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != different_order
+
+    no_rel = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([]))
+    assert ft != no_rel
+
+    more_vars = relay.FuncType(tvm.convert([t1, t2]), tp2,
+                         tvm.convert([tp1, tp2, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != more_vars
+
+    all_the_vars = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp2, tp3, tp4]),
+                         tvm.convert([tr1, tr2]))
+    assert ft != all_the_vars
+
+    different_rel = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                                   tvm.convert([tp1, tp3]),
+                                   tvm.convert([tr3]))
+    assert ft != different_rel
+
+    more_rels = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                                   tvm.convert([tp1, tp3]),
+                                   tvm.convert([tr1, tr3]))
+    assert ft != more_rels
+
+
+def test_tuple_type_alpha_eq():
+    t1 = relay.TensorType((1, 2, 3), "float32")
+    t2 = relay.TensorType((1, 2, 3, 4), "float32")
+    tp1 = relay.TypeParam("v1", relay.Kind.Type)
+    tp2 = relay.TypeParam("v2", relay.Kind.Type)
+
+    tup1 = relay.TupleType(tvm.convert([t1, t2, tp1]))
+    tup2 = relay.TupleType(tvm.convert([t1, t2, tp1]))
+    tup3 = relay.TupleType(tvm.convert([t2, t1, tp1]))
+    tup4 = relay.TupleType(tvm.convert([t1, t2, tp2]))
+
+    # as long as types are alpha-equal and in same order,
+    # tuples should be alpha-equal
+    assert tup1 == tup2
+    assert tup1 != tup3
+    assert tup1 != tup4
+
+
+def test_type_relation_alpha_eq():
+    t1 = relay.TensorType((1, 2), "float32")
+    t2 = relay.TensorType((1, 2, 3), "float32")
+    t3 = relay.TensorType((1, 2, 3, 4), "float32")
+
+    # functions are compared only by pointer equality so
+    # we need to be sure to use the same pointers
+    broadcast = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    identity = tvm.get_env_func("tvm.relay.type_relation.Identity")
+
+    # attrs are also compared only by pointer equality
+    attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+
+    tr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
+    same = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
+    diff_func = relay.TypeRelation(identity, tvm.convert([t1, t2]), 1, attr1)
+    diff_order = relay.TypeRelation(broadcast, tvm.convert([t2, t1]), 1, attr1)
+    diff_args = relay.TypeRelation(broadcast, tvm.convert([t2, t3]), 1, attr1)
+    diff_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr2)
+
+    bigger = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 2, attr1)
+    diff_num_inputs = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 1, attr2)
+
+    # func, number of args, input count, and order should be the same
+    assert tr == same
+    assert tr != diff_func
+    assert tr != diff_order
+    assert tr != diff_args
+    assert tr != diff_attr
+    assert tr != bigger
+
+    assert bigger != diff_num_inputs
+
+
 if __name__ == "__main__":
-    test_type_alpha_eq()
+    test_tensor_type_alpha_eq()
+    test_incomplete_type_alpha_eq()
+    test_type_param_alpha_eq()
+    test_func_type_alpha_eq()
+    test_tuple_type_alpha_eq()
+    test_type_relation_alpha_eq()

From c3132567c002adbb1d7cd82d41c6231a34640824 Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Fri, 5 Oct 2018 21:30:43 -0700
Subject: [PATCH 177/529] [LANG] Generalize compute to tensor region (#1476)

---
 3rdparty/dmlc-core                            |   2 +-
 include/tvm/expr.h                            |   2 +
 include/tvm/operation.h                       |  72 +++-
 include/tvm/tensor_intrin.h                   |  53 +++
 python/tvm/api.py                             |  39 +-
 python/tvm/tensor.py                          |  12 +
 python/tvm/tensor_intrin.py                   |  28 +-
 src/api/api_lang.cc                           |  20 +
 src/lang/tensor.cc                            |  44 ++-
 src/op/compute_op.cc                          |  35 ++
 src/op/compute_op.h                           |  17 +-
 src/op/tensor_compute_op.cc                   | 361 ++++++++++++++++++
 src/op/tensorize.cc                           |  45 ---
 src/pass/arg_binder.cc                        |   4 +-
 src/schedule/schedule_dataflow_rewrite.cc     | 252 +++++++++---
 tests/python/unittest/test_lang_tensor.py     |  74 ++++
 .../unittest/test_schedule_schedule_ops.py    | 130 +++++++
 17 files changed, 1059 insertions(+), 131 deletions(-)
 create mode 100644 src/op/tensor_compute_op.cc

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 4f0564ec7694..946a54012d0c 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 4f0564ec769477c66d480dd966088f172050c874
+Subproject commit 946a54012d0c390675ab5b46cd990838d4183d6f
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 050ab4c334e2..e41f5f28d35b 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -108,6 +108,8 @@ class Range : public HalideIR::IR::Range {
   TVM_DLL static Range make_by_min_extent(Expr min, Expr extent);
 };
 
+using Region = Array<Range>;
+
 /*!
  * \brief Type of iteration variable.
  *  Each IterVar have a specific type.
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index c11242c0a55d..1a1d28ab71bb 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -49,7 +49,7 @@ class OperationNode : public FunctionBaseNode {
   }
   /*!
    * \return The list of iteration variable at root
-   * \note root_iter_vars dedides the shape of the outputs.
+   * \note root_iter_vars decides the shape of the outputs.
    */
   virtual Array<IterVar> root_iter_vars() const = 0;
   /*!
@@ -239,6 +239,74 @@ class TVM_DLL ComputeOpNode : public OperationNode {
   TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, OperationNode);
 };
 
+/*!
+ * \brief A TenorCompute op that compute a tensor with an tensor intrinsic.
+ */
+class TensorComputeOpNode : public OperationNode {
+ public:
+  /*! \brief IterVar on each axis */
+  Array<IterVar> axis;
+  /*! \brief IterVar on each reduction axis, if the intrin will use the reduce axis */
+  Array<IterVar> reduce_axis;
+  /*! \brief number of axes that can be scheduled */
+  int schedulable_ndim;
+  /*! \brief TensorIntrin used to compute */
+  TensorIntrin intrin;
+  /*! \brief input tensors of intrin */
+  Array<Tensor> inputs;
+  /*! \brief region of input tensors */
+  Array<Region> input_regions;
+  /*! \brief constructor */
+  TensorComputeOpNode() {}
+  // override functions
+  int num_outputs() const final;
+  Array<IterVar> root_iter_vars() const final;
+  Type output_dtype(size_t i) const final;
+  Array<Expr> output_shape(size_t i) const final;
+  Array<Tensor> InputTensors() const final;
+  Operation ReplaceInputs(
+      const Operation& self,
+      const std::unordered_map<Tensor, Tensor>& rmap) const final;
+  void PropBoundToInputs(
+      const Operation& self,
+      const std::unordered_map<const Variable*, IntSet>& dom_map,
+      std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
+  void GatherBound(
+      const Operation& self,
+      const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+      std::unordered_map<IterVar, Range>* out_dom_map) const final;
+  Stmt BuildRealize(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& realize_map,
+      const Stmt& body) const final;
+  Stmt BuildProvide(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& dom_map,
+      bool debug_keep_trivial_loop) const final;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("tag", &tag);
+    v->Visit("axis", &axis);
+    v->Visit("reduce_axis", &reduce_axis);
+    v->Visit("schedulable_ndim", &schedulable_ndim);
+    v->Visit("intrin", &intrin);
+    v->Visit("inputs", &inputs);
+    v->Visit("input_regions", &input_regions);
+  }
+  static Operation make(std::string name,
+                        std::string tag,
+                        Array<IterVar> axis,
+                        Array<IterVar> reduce_axis,
+                        int schedulable_ndim,
+                        TensorIntrin intrin,
+                        Array<Tensor> tensors,
+                        Array<Region> regions);
+
+  static constexpr const char* _type_key = "TensorComputeOp";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, OperationNode);
+};
+
 /*!
  * \brief Symbolic scan.
  */
@@ -326,7 +394,7 @@ class ExternOpNode : public OperationNode {
  public:
   /*! \brief The input tensors */
   Array<Tensor> inputs;
-  /*! \brief Symbolic placeholder representationinputs */
+  /*! \brief Symbolic placeholder representation of inputs */
   Array<Buffer> input_placeholders;
   /*! \brief Symbolic placeholder representation of outputs */
   Array<Buffer> output_placeholders;
diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index 944498d1e615..fbee4bccc0bf 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -89,5 +89,58 @@ class TensorIntrinNode : public Node {
 inline const TensorIntrinNode* TensorIntrin::operator->() const {
   return static_cast<const TensorIntrinNode*>(node_.get());
 }
+
+
+// Internal node container of tensor intrinsic calling.
+class TensorIntrinCallNode;
+
+/*! \brief Tensor intrinsic calling node. */
+class TensorIntrinCall : public NodeRef {
+ public:
+  TensorIntrinCall() {}
+  explicit TensorIntrinCall(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const TensorIntrinCallNode* operator->() const;
+
+  /*! \brief specify container node */
+  using ContainerType = TensorIntrinCallNode;
+};
+
+class TensorIntrinCallNode : public Node {
+ public:
+  /*! \brief the tensor intrinsic */
+  TensorIntrin intrin;
+  /*! \brief input tensors of the intrinsic */
+  Array<Tensor> tensors;
+  /*! \brief regions of input tensors */
+  Array<Region> regions;
+  /*!
+   * \brief IterVar on each reduction axis, if the
+   * intrin will use the reduce axis
+   */
+  Array<IterVar> reduce_axis;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("intrin", &intrin);
+    v->Visit("tensors", &tensors);
+    v->Visit("regions", &regions);
+    v->Visit("reduce_axis", &reduce_axis);
+  }
+  static TensorIntrinCall make(TensorIntrin intrin,
+                               Array<Tensor> tensors,
+                               Array<Region> regions,
+                               Array<IterVar> reduce_axis);
+
+  static constexpr const char* _type_key = "TensorIntrinCall";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorIntrinCallNode, Node);
+};
+
+inline const TensorIntrinCallNode* TensorIntrinCall::operator->() const {
+  return static_cast<const TensorIntrinCallNode*>(node_.get());
+}
+
 }  // namespace tvm
 #endif  // TVM_TENSOR_INTRIN_H_
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 8cf507de6386..e275c1122c36 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -243,24 +243,43 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
             raise ValueError("nested tag is not allowed for now")
         tag = _tag.TagScope.get_current().tag
     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
+    # for python3
+    shape = tuple([int(s) if isinstance(s, float) else s for s in shape])
     ndim = len(shape)
     code = fcompute.__code__
 
-    if fcompute.__code__.co_argcount == 0:
+    out_ndim = ndim
+    if code.co_argcount == 0:
         arg_names = ["i%d" % i for i in range(ndim)]
     else:
         arg_names = code.co_varnames[:code.co_argcount]
+        out_ndim = code.co_argcount
 
-    if ndim != len(arg_names):
+    if out_ndim != len(arg_names):
         raise ValueError("fcompute do not match dimension, ndim=%d" % ndim)
 
-    dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape)]
+    dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])]
     body = fcompute(*[v.var for v in dim_var])
-    if not isinstance(body, (list, tuple)):
-        body = [body]
-    body = convert(body)
-    op_node = _api_internal._ComputeOp(
-        name, tag, attrs, dim_var, body)
+
+    if isinstance(body, _tensor.TensorIntrinCall):
+        for i, s in enumerate(shape[out_ndim:]):
+            var_name = "ax" + str(i)
+            dim_var.append(_IterVar((0, s), var_name, 4))
+        op_node = _api_internal._TensorComputeOp(name,
+                                                 tag,
+                                                 dim_var,
+                                                 body.reduce_axis,
+                                                 out_ndim,
+                                                 body.intrin,
+                                                 body.tensors,
+                                                 body.regions)
+    else:
+        if not isinstance(body, (list, tuple)):
+            body = [body]
+        body = convert(body)
+        op_node = _api_internal._ComputeOp(
+            name, tag, attrs, dim_var, body)
+
     num = op_node.num_outputs
     outputs = tuple(op_node.output(i) for i in range(num))
     return outputs[0] if num == 1 else outputs
@@ -529,14 +548,14 @@ def decl_buffer(shape,
     dtype = float32 if dtype is None else dtype
     strides = () if strides is None else strides
     if offset_factor != 0 and elem_offset is None:
-        elem_offset = var('%s_elem_offset' % name, shape[0].dtype)
+        shape_dtype = shape[0].dtype if hasattr(shape[0], "dtype") else "int32"
+        elem_offset = var('%s_elem_offset' % name, shape_dtype)
     if data is None:
         data = var(name, "handle")
     return _api_internal._Buffer(
         data, dtype, shape, strides, elem_offset, name, scope,
         data_alignment, offset_factor)
 
-
 def _IterVar(dom, name, iter_type, thread_tag=''):
     """Internal function to create IterVar
 
diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py
index f0d60f514a37..f32b70eb9a12 100644
--- a/python/tvm/tensor.py
+++ b/python/tvm/tensor.py
@@ -30,6 +30,11 @@ def dtype(self):
         """Data content of the tensor."""
         return self.tensor.dtype
 
+@register_node
+class TensorIntrinCall(NodeBase):
+    """Intermediate structure for calling a tensor intrinsic."""
+    pass
+
 
 itervar_cls = None
 
@@ -106,6 +111,7 @@ def name(self):
         return "%s.v%d" % (op.name, self.value_index)
 
 
+
 class Operation(NodeBase):
     """Represent an operation that generate a tensor"""
 
@@ -155,6 +161,12 @@ def reduce_axis(self):
         return self.__getattr__("reduce_axis")
 
 
+@register_node
+class TensorComputeOp(Operation):
+    """Tensor operation."""
+    pass
+
+
 @register_node
 class ScanOp(Operation):
     """Scan operation."""
diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py
index 193124b2f946..f1f26655fe27 100644
--- a/python/tvm/tensor_intrin.py
+++ b/python/tvm/tensor_intrin.py
@@ -6,9 +6,25 @@
 from . import stmt as _stmt
 from . import make as _make
 from . import tensor as _tensor
+from . import schedule as _schedule
 from .build_module import current_build_config
 from ._ffi.node import NodeBase, register_node
 
+
+def _get_region(tslice):
+    region = []
+    for idx in tslice.indices:
+        if isinstance(idx, slice):
+            assert idx.step is None
+            region.append(_api.Range(idx.start, idx.stop))
+        else:
+            if isinstance(idx, _schedule.IterVar):
+                begin = idx.var
+            else:
+                begin = idx
+            region.append(_make.range_by_min_extent(begin, 1))
+    return region
+
 @register_node
 class TensorIntrin(NodeBase):
     """Tensor intrinsic functions for certain computation.
@@ -17,8 +33,16 @@ class TensorIntrin(NodeBase):
     --------
     decl_tensor_intrin: Construct a TensorIntrin
     """
-    pass
-
+    def __call__(self, *args, **kwargs):
+        tensors = [x.tensor for x in args]
+        regions = [_get_region(x) for x in args]
+        reduce_axis = []
+        if "reduce_axis" in kwargs:
+            reduce_axis = kwargs["reduce_axis"]
+            if not isinstance(reduce_axis, (list, tuple)):
+                reduce_axis = [reduce_axis]
+            reduce_axis = _api.convert(reduce_axis)
+        return _api_internal._TensorIntrinCall(self, tensors, regions, reduce_axis)
 
 def decl_tensor_intrin(op,
                        fcompute,
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc
index 8ca49f19baec..75365da5bf50 100644
--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -239,6 +239,14 @@ TVM_REGISTER_API("_TensorIntrin")
                                   args[6]);
   });
 
+TVM_REGISTER_API("_TensorIntrinCall")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = TensorIntrinCallNode::make(args[0],
+                                      args[1],
+                                      args[2],
+                                      args[3]);
+  });
+
 TVM_REGISTER_API("_TensorEqual")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = args[0].operator Tensor() == args[1].operator Tensor();
@@ -278,6 +286,18 @@ TVM_REGISTER_API("_ScanOp")
                             args[7]);
   });
 
+TVM_REGISTER_API("_TensorComputeOp")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = TensorComputeOpNode::make(args[0],
+                                     args[1],
+                                     args[2],
+                                     args[3],
+                                     args[4],
+                                     args[5],
+                                     args[6],
+                                     args[7]);
+  });
+
 TVM_REGISTER_API("_ExternOp")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = ExternOpNode::make(args[0],
diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc
index 4f9c3e9d1782..9b1a58abcee4 100644
--- a/src/lang/tensor.cc
+++ b/src/lang/tensor.cc
@@ -10,6 +10,8 @@
 
 namespace tvm {
 
+// Tensor
+
 Expr Tensor::operator()(Array<Var> indices) const {
   Array<Expr> arr(indices.begin(), indices.end());
   return operator()(arr);
@@ -26,6 +28,15 @@ Expr Tensor::operator()(Array<Expr> indices) const {
   return n;
 }
 
+Tensor Operation::output(size_t i) const {
+  auto node = make_node<TensorNode>();
+  node->op = *this;
+  node->value_index = i;
+  node->dtype = (*this)->output_dtype(i);
+  node->shape = (*this)->output_shape(i);
+  return Tensor(node);
+}
+
 Tensor TensorNode::make(Array<Expr> shape,
                         Type dtype,
                         Operation op,
@@ -46,14 +57,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 
 TVM_REGISTER_NODE_TYPE(TensorNode);
 
-Tensor Operation::output(size_t i) const {
-  auto node = make_node<TensorNode>();
-  node->op = *this;
-  node->value_index = i;
-  node->dtype = (*this)->output_dtype(i);
-  node->shape = (*this)->output_shape(i);
-  return Tensor(node);
-}
+
+// TensorIntrin
 
 TensorIntrin TensorIntrinNode::make(std::string name,
                                     Operation op,
@@ -79,4 +84,27 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
   });
 
 TVM_REGISTER_NODE_TYPE(TensorIntrinNode);
+
+
+// TensorIntrinCall
+
+TensorIntrinCall TensorIntrinCallNode::make(TensorIntrin intrin,
+                                            Array<Tensor> tensors,
+                                            Array<Region> regions,
+                                            Array<IterVar> reduce_axis) {
+  auto n = make_node<TensorIntrinCallNode>();
+  n->intrin = std::move(intrin);
+  n->tensors = std::move(tensors);
+  n->regions = std::move(regions);
+  n->reduce_axis = std::move(reduce_axis);
+  return TensorIntrinCall(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<TensorIntrinCallNode>([](const TensorIntrinCallNode *n, IRPrinter *p) {
+    p->stream << "TensorIntrinCall(intrin=" << n->intrin << ", " << n << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(TensorIntrinCallNode);
+
 }  // namespace tvm
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index daafac21b180..5c972595ff00 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -13,6 +13,7 @@
 #include "compute_op.h"
 #include "op_util.h"
 #include "../schedule/message_passing.h"
+#include "../arithmetic/compute_expr.h"
 
 namespace tvm {
 
@@ -545,4 +546,38 @@ static void VerifyComputeOp(const ComputeOpNode* op) {
   v.Run();
 }
 
+Stmt TransformUpdate(const Stage& stage,
+                     const std::unordered_map<IterVar, Range>& dom_map,
+                     const ComputeLoopNest& n,
+                     Stmt body,
+                     Stmt update) {
+  Array<Expr> conds;
+  std::unordered_set<const Variable*> banned;
+  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
+    IterVar iv = stage->leaf_iter_vars[i];
+    auto iit = stage->iter_var_attrs.find(iv);
+    if (iit != stage->iter_var_attrs.end()) {
+      const IterVarAttr& attr = (*iit).second;
+      if (attr->iter_type == kTensorized) {
+        break;
+      }
+    }
+    if (iv->iter_type == kCommReduce) {
+      auto vit = dom_map.find(iv);
+      CHECK(vit != dom_map.end());
+      const Range& vrange = vit->second;
+      conds.push_back(likely(iv->var > vrange->min));
+      banned.insert(iv->var.get());
+    }
+  }
+  for (const Expr& pred : n.main_predicates) {
+    if (ir::ExprUseVar(pred, banned)) {
+      LOG(FATAL) << "Tensorize update transform failed, the condition "
+                 << pred << " has a conflict with the reset condition";
+    }
+  }
+
+  return IfThenElse::make(arith::ComputeReduce<ir::Or>(conds, const_true(1)),
+                          update, body);
+}
 }  // namespace tvm
diff --git a/src/op/compute_op.h b/src/op/compute_op.h
index 996764c6cdc1..87b0814c1ad9 100644
--- a/src/op/compute_op.h
+++ b/src/op/compute_op.h
@@ -14,7 +14,7 @@
 
 namespace tvm {
 // loop nest structure for general compute
-// This the the loop nest structured used in compute.
+// This the loop nest structured used in compute.
 // Does not include the loop body.
 struct ComputeLoopNest {
   // The common number of loops between init and main
@@ -73,6 +73,21 @@ Stmt MakeTensorize(const ComputeOpNode* self,
                    const Stage& stage,
                    const std::unordered_map<IterVar, Range>& dom_map,
                    bool debug_keep_trivial_loop);
+
+/*!
+ * \brief Transform the update part when there is no init func in tensorizing
+ * \param stage The stage for tensorizing.
+ * \param dom_map The range of each iter var.
+ * \param n The loop nest structured used in compute.
+ * \param body The body func in tensorize intrin
+ * \param update The update func in tensorize intrin
+ * \return Transformed result.
+ */
+Stmt TransformUpdate(const Stage& stage,
+                     const std::unordered_map<IterVar, Range>& dom_map,
+                     const ComputeLoopNest& n,
+                     Stmt body,
+                     Stmt update);
 }  // namespace tvm
 
 #endif  // TVM_OP_COMPUTE_OP_H_
diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc
new file mode 100644
index 000000000000..f9b8188d4685
--- /dev/null
+++ b/src/op/tensor_compute_op.cc
@@ -0,0 +1,361 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \brief Tensor Compute Op.
+ * \file tensor_compute_op.cc
+ */
+#include <tvm/operation.h>
+#include <tvm/arithmetic.h>
+#include <tvm/ir.h>
+#include <tvm/ir_visitor.h>
+#include <tvm/ir_pass.h>
+#include <unordered_set>
+#include "./op_util.h"
+#include "./compute_op.h"
+#include "../arithmetic/compute_expr.h"
+
+namespace tvm {
+using namespace ir;
+// TensorComputeOpNode
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<TensorComputeOpNode>([](const TensorComputeOpNode *op,
+                                      IRPrinter *p) {
+    p->stream << "tensor_compute_op(" << op->name << ", " << op << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(TensorComputeOpNode);
+
+int TensorComputeOpNode::num_outputs() const {
+  return static_cast<int>(this->intrin->buffers.size() - this->inputs.size());
+}
+
+Array<IterVar> TensorComputeOpNode::root_iter_vars() const {
+  Array<IterVar> ret = axis;
+  for (IterVar iv : reduce_axis) {
+    ret.push_back(iv);
+  }
+  return ret;
+}
+
+Type TensorComputeOpNode::output_dtype(size_t i) const {
+  return this->intrin->buffers[this->inputs.size() + i]->dtype;
+}
+
+Array<Expr> TensorComputeOpNode::output_shape(size_t i) const {
+  Array<Expr> shape;
+  for (const auto& ivar : this->axis) {
+    shape.push_back(ivar->dom->extent);
+  }
+  return shape;
+}
+
+
+Operation TensorComputeOpNode::make(std::string name,
+                                    std::string tag,
+                                    Array<IterVar> axis,
+                                    Array<IterVar> reduce_axis,
+                                    int schedulable_ndim,
+                                    TensorIntrin intrin,
+                                    Array<Tensor> tensors,
+                                    Array<Region> regions) {
+  auto n = make_node<TensorComputeOpNode>();
+  n->name = std::move(name);
+  n->tag = std::move(tag);
+  n->axis = std::move(axis);
+  n->reduce_axis = std::move(reduce_axis);
+  n->schedulable_ndim = std::move(schedulable_ndim);
+  n->intrin = std::move(intrin);
+  n->inputs = std::move(tensors);
+  n->input_regions = std::move(regions);
+  return Operation(n);
+}
+
+Array<Tensor> TensorComputeOpNode::InputTensors() const {
+  return inputs;
+}
+
+Operation TensorComputeOpNode::ReplaceInputs(
+    const Operation& self,
+    const std::unordered_map<Tensor, Tensor>& rmap) const {
+  CHECK_EQ(self.operator->(), this);
+  auto n = make_node<TensorComputeOpNode>(*this);
+  auto intrin = make_node<TensorIntrinNode>(*(this->intrin.operator->()));
+  intrin->body = op::ReplaceTensor(this->intrin->body, rmap);
+  if (intrin->reduce_init.defined()) {
+    intrin->reduce_init = op::ReplaceTensor(this->intrin->reduce_init, rmap);
+  }
+  if (intrin->reduce_update.defined()) {
+    intrin->reduce_update = op::ReplaceTensor(this->intrin->reduce_update, rmap);
+  }
+  for (size_t i = 0; i < n->inputs.size(); ++i) {
+    Tensor t = n->inputs[i];
+    if (rmap.count(t)) {
+      n->inputs.Set(i, rmap.at(t));
+    }
+  }
+
+  if (intrin->body.same_as(n->intrin->body) &&
+      intrin->reduce_init.same_as(n->intrin->reduce_init) &&
+      intrin->reduce_update.same_as(n->intrin->reduce_update) &&
+      inputs.same_as(n->inputs)) {
+    return self;
+  } else {
+    n->intrin = TensorIntrin(intrin);
+    return Operation(n);
+  }
+}
+
+void TensorComputeOpNode::PropBoundToInputs(
+    const Operation& self,
+    const std::unordered_map<const Variable*, IntSet>& dom_map,
+    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
+  for (size_t i = 0; i < this->inputs.size(); ++i) {
+    Tensor t = this->inputs[i];
+    Region region = input_regions[i];
+
+    auto it = out_dom_map->find(t);
+    if (it == out_dom_map->end()) continue;
+    TensorDom& dom = it->second;
+    for (size_t j = 0; j < t.ndim(); ++j) {
+      dom.data[j].emplace_back(EvalSet(region[j], dom_map));
+    }
+  }
+}
+
+void TensorComputeOpNode::GatherBound(
+    const Operation& self,
+    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+    std::unordered_map<IterVar, Range>* out_dom_map) const {
+  const TensorDom& tdom = tensor_dom.at(self.output(0));
+  for (size_t i = 0; i < this->axis.size(); ++i) {
+    Range r = arith::Union(tdom.data.at(i)).cover_range(this->axis[i]->dom);
+    CHECK(!out_dom_map->count(this->axis[i]));
+    (*out_dom_map)[this->axis[i]] = r;
+  }
+  for (size_t i = 0; i < this->reduce_axis.size(); ++i) {
+    CHECK(!out_dom_map->count(this->reduce_axis[i]));
+    (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom;
+  }
+}
+
+Stmt TensorComputeOpNode::BuildRealize(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& realize_map,
+    const Stmt& body) const {
+  CHECK_EQ(stage->op.get(), this);
+  HalideIR::Internal::Region bounds;
+  for (IterVar iv : this->axis) {
+    bounds.push_back(realize_map.at(iv));
+  }
+  Stmt realize = body;
+  for (int i = this->num_outputs(); i > 0; --i) {
+    Tensor t = stage->op.output(i-1);
+    realize = ir::Realize::make(t->op, t->value_index,
+      t->dtype, bounds, const_true(), realize);
+    // alignment requirement, only useful for compute
+    for (int i = 0; i < schedulable_ndim; ++i) {
+      auto it = stage->iter_var_attrs.find(this->axis[i]);
+      if (it != stage->iter_var_attrs.end()) {
+        IterVarAttr attr = (*it).second;
+        if (attr->dim_align_factor != 0) {
+          Array<Expr> tuple = {static_cast<int>(i),
+                               attr->dim_align_factor,
+                               attr->dim_align_offset};
+          realize = ir::AttrStmt::make(
+              t, ir::attr::buffer_dim_align,
+              Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic),
+              realize);
+        }
+      }
+    }
+  }
+  return realize;
+}
+
+ComputeLoopNest MakeLoopNest(
+    const TensorComputeOpNode* self,
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) {
+  CHECK_EQ(stage->op.operator->(), self);
+  ComputeLoopNest ret;
+  // make main loop nest
+  ret.main_nest = op::MakeLoopNest(
+      stage, dom_map, 0, false, std::unordered_set<IterVar>(), &ret.main_vmap,
+      debug_keep_trivial_loop);
+  ret.main_predicates = schedule::MakeBoundCheck(
+      stage, dom_map, ret.main_vmap, false,
+      std::unordered_set<IterVar>());
+  for (auto& e : ret.main_predicates) {
+    e = likely(e);
+  }
+  if (stage->store_predicate.defined()) {
+    ret.main_predicates.push_back(stage->store_predicate);
+  }
+  if (self->reduce_axis.size() != 0) {
+    // try to find the location to insert the initialization.
+    // Fuse the initialization and provide loop when possible.
+    std::unordered_map<IterVar, int> update_state;
+    for (IterVar iv : self->reduce_axis) {
+      update_state[iv] = 2;
+    }
+    for (int i = 0; i < self->schedulable_ndim; ++i) {
+      update_state[self->axis[i]] = 1;
+    }
+    // find which iter var is related to reduction and which is related to axis.
+    schedule::PassDownBitMaskOr(stage, &update_state);
+    auto leaf_iter_vars = stage->leaf_iter_vars;
+    // first first loop that is related to reduction.
+    size_t begin_loop = leaf_iter_vars.size();
+    for (size_t i = 0; i < leaf_iter_vars.size(); ++i) {
+      auto iv = leaf_iter_vars[i];
+      int flag = update_state.at(iv);
+      if ((flag & 2) != 0) {
+        begin_loop = i; break;
+      }
+      ret.init_vmap[iv] = ret.main_vmap.at(iv);
+    }
+    ret.num_common_loop = begin_loop;
+    // skip loops that does not relates to axis.
+    std::unordered_set<IterVar> skip_iter;
+    for (auto kv : update_state) {
+      int flag = kv.second;
+      if ((flag & 1) == 0) skip_iter.insert(kv.first);
+    }
+    ret.init_nest = op::MakeLoopNest(
+        stage, dom_map, begin_loop, true,
+        skip_iter, &(ret.init_vmap), debug_keep_trivial_loop);
+    ret.init_predicates = schedule::MakeBoundCheck(
+        stage, dom_map, ret.init_vmap, true, skip_iter);
+    for (auto& e : ret.init_predicates) {
+      e = likely(e);
+    }
+  } else {
+    CHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1);
+    ret.num_common_loop = stage->leaf_iter_vars.size();
+  }
+  // copy elison here.
+  return ret;
+}
+
+
+Stmt TensorComputeOpNode::BuildProvide(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) const {
+  CHECK_EQ(stage->op.operator->(), this);
+
+  // Start bind data.
+  Stmt nop = Evaluate::make(0);
+  std::vector<Stmt> input_bind_nest, output_bind_nest;
+  Array<Tensor> inputs = this->InputTensors();
+
+  // input binding
+  size_t num_inputs = inputs.size();
+  for (size_t i = 0; i < num_inputs; ++i) {
+    Tensor tensor = inputs[i];
+    Region region = this->input_regions[i];
+    Buffer buffer = this->intrin->buffers[i];
+    Array<NodeRef> bind_spec{buffer, tensor};
+
+    Array<Expr> tuple;
+    for (size_t i = 0; i < region.size(); ++i) {
+      tuple.push_back(region[i]->min);
+      tuple.push_back(region[i]->extent);
+    }
+    input_bind_nest.emplace_back(AttrStmt::make(
+        bind_spec, ir::attr::buffer_bind_scope,
+        Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop));
+  }
+
+  // output binding
+  for (int i = 0; i < this->num_outputs(); ++i) {
+    Tensor tensor = stage->op.output(i);
+    Buffer buffer = this->intrin->buffers[num_inputs + i];
+    Array<NodeRef> bind_spec{buffer, tensor};
+
+    Array<Expr> tuple;
+    for (size_t i = 0; i < this->axis.size(); ++i) {
+      auto ivar = this->axis[i];
+      if (i < static_cast<size_t>(this->schedulable_ndim)) {
+        tuple.push_back(ivar->var);
+        tuple.push_back(1);
+      } else {
+        Range dom = ivar->dom;
+        tuple.push_back(dom->min);
+        tuple.push_back(dom->extent);
+      }
+    }
+
+    output_bind_nest.emplace_back(AttrStmt::make(
+        bind_spec, ir::attr::buffer_bind_scope,
+        Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop));
+  }
+
+  // Check variable remap
+  std::unordered_map<const Variable*, Expr> vmap;
+  ir::ArgBinder binder(&vmap);
+
+  size_t tloc = stage->leaf_iter_vars.size();
+  ComputeLoopNest n = MakeLoopNest(this, stage, dom_map, debug_keep_trivial_loop);
+
+  if (this->reduce_axis.size() == 0) {
+    std::vector<std::vector<Stmt> > nest(
+        n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
+    nest.emplace_back(op::MakeIfNest(n.main_predicates));
+    CHECK_EQ(n.init_predicates.size(), 0U);
+    CHECK(this->intrin->body.defined())
+        << "Normal store op for intrin " << this << " is not defined";
+    Stmt body = MergeNest(output_bind_nest, this->intrin->body);
+    body = MergeNest(input_bind_nest, body);
+    body = ir::Substitute(body, vmap);
+    body = MergeNest(binder.asserts(), body);
+    body = op::Substitute(body, n.main_vmap);
+    Stmt ret =  MergeNest(nest, body);
+    return ret;
+  } else {
+    // Need to split reduction
+    CHECK(this->intrin->reduce_update.defined())
+        << "Reduction update op is not defined";
+    // Need init and update steps
+    CHECK_NE(this->reduce_axis.size(), 0U);
+    std::vector<std::vector<Stmt> > common(
+        n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1);
+    std::vector<std::vector<Stmt> > update_nest(
+        n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.begin() + tloc + 1);
+    update_nest.emplace_back(op::MakeIfNest(n.main_predicates));
+
+    if (this->intrin->reduce_init.defined()) {
+      // init nest
+      std::vector<std::vector<Stmt> > init_nest(
+          n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
+      init_nest.emplace_back(op::MakeIfNest(n.init_predicates));
+      Stmt init = MergeNest(output_bind_nest, this->intrin->reduce_init);
+      init = op::Substitute(init, n.init_vmap);
+      init = MergeNest(init_nest, init);
+      // The update
+      Stmt update = MergeNest(output_bind_nest, this->intrin->reduce_update);
+      update = MergeNest(input_bind_nest, update);
+      update = ir::Substitute(update, vmap);
+      update = MergeNest(binder.asserts(), update);
+      update = op::Substitute(update, n.main_vmap);
+      update = MergeNest(update_nest, update);
+      return MergeNest(common, Block::make(init, update));
+    } else {
+      // When init op is not available, use body op for reset in the first iter.
+      CHECK(this->intrin->body.defined())
+          << "Normal body op is not defined";
+      Stmt update = TransformUpdate(stage, dom_map, n,
+                                    this->intrin->body,
+                                    this->intrin->reduce_update);
+      update = MergeNest(output_bind_nest, update);
+      update = MergeNest(input_bind_nest, update);
+      update = ir::Substitute(update, vmap);
+      update = MergeNest(binder.asserts(), update);
+      update = op::Substitute(update, n.main_vmap);
+      update = MergeNest(update_nest, update);
+      return MergeNest(common, update);
+    }
+  }
+}
+
+}  // namespace tvm
diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc
index 6daaedd16de1..a61aac422284 100644
--- a/src/op/tensorize.cc
+++ b/src/op/tensorize.cc
@@ -10,7 +10,6 @@
 #include "op_util.h"
 #include "compute_op.h"
 #include "../schedule/message_passing.h"
-#include "../arithmetic/compute_expr.h"
 
 namespace tvm {
 
@@ -323,50 +322,6 @@ void VerifyTensorizeBody(
   }
 }
 
-/*!
- * \brief Transform the update part when there is no init func in tensorizing
- * \param stage The stage for tensorizing.
- * \param dom_map The range of each iter var.
- * \param n The loop nest structured used in compute. 
- * \param body The body func in tensorize intrin
- * \param update The update func in tensorize intrin
- * \return Transformed result.
- */
-Stmt TransformUpdate(const Stage& stage,
-                     const std::unordered_map<IterVar, Range>& dom_map,
-                     const ComputeLoopNest& n,
-                     Stmt body,
-                     Stmt update) {
-  Array<Expr> conds;
-  std::unordered_set<const Variable*> banned;
-  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
-    IterVar iv = stage->leaf_iter_vars[i];
-    auto iit = stage->iter_var_attrs.find(iv);
-    if (iit != stage->iter_var_attrs.end()) {
-      const IterVarAttr& attr = (*iit).second;
-      if (attr->iter_type == kTensorized) {
-        break;
-      }
-    }
-    if (iv->iter_type == kCommReduce) {
-      auto vit = dom_map.find(iv);
-      CHECK(vit != dom_map.end());
-      const Range& vrange = vit->second;
-      conds.push_back(likely(iv->var > vrange->min));
-      banned.insert(iv->var.get());
-    }
-  }
-  for (const Expr& pred : n.main_predicates) {
-    if (ir::ExprUseVar(pred, banned)) {
-      LOG(FATAL) << "Tensorize update transform failed, the condition "
-                 << pred << " has a conflict with the reset condition";
-    }
-  }
-
-  return IfThenElse::make(arith::ComputeReduce<ir::Or>(conds, const_true(1)),
-                          update, body);
-}
-
 Stmt MakeTensorize(const ComputeOpNode* self,
                    const Stage& stage,
                    const std::unordered_map<IterVar, Range>& dom_map,
diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc
index 0fac313c079b..623886c31b86 100644
--- a/src/pass/arg_binder.cc
+++ b/src/pass/arg_binder.cc
@@ -91,7 +91,9 @@ void ArgBinder::BindBuffer(const Buffer& arg,
   // bind pointer and offset.
   if (is_zero(arg->elem_offset)) {
     CHECK(is_zero(value->elem_offset))
-        << "Trying to bind a Buffer with offset into one without offset";
+        << "Trying to bind a Buffer with offset into one without offset "
+        << " required elem_offset=" << arg->elem_offset
+        << ", provided elem_offset=" << value->elem_offset;
   }
 
   this->Bind(arg->data, value->data, arg_name + ".data");
diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc
index 8591c77bd7cc..ccf7fd617194 100644
--- a/src/schedule/schedule_dataflow_rewrite.cc
+++ b/src/schedule/schedule_dataflow_rewrite.cc
@@ -135,29 +135,29 @@ Tensor Schedule::cache_read(const Tensor& tensor,
   return cache;
 }
 
-// Cache write and relayout the data according to loop pattern
-Array<Tensor> CacheWriteWithReLayout(Schedule sch,
-                              const Array<Tensor>& tensor_array,
-                              const std::string& scope) {
-  size_t tensor_size = tensor_array.size();
-  sch->InvalidateCache();
-  Tensor tensor = tensor_array[0];
-  Stage orig_stage = sch[tensor->op];
-  const ComputeOpNode* compute = orig_stage->op.as<ComputeOpNode>();
-  std::unordered_set<IterVar> red_axis;
-  for (IterVar iv : compute->reduce_axis) {
+template<typename OpType>
+void PrepareAxisMapping(Stage orig_stage,
+                        OpType* op,
+                        std::unordered_set<IterVar>* p_red_axis,
+                        Array<IterVar>* p_new_axis,
+                        std::unordered_map<IterVar, Range>* p_dom_map,
+                        std::unordered_map<const Variable*, Expr>* p_vsub,
+                        std::unordered_map<const Variable*, Expr>* p_vsub2newvar,
+                        std::vector<Expr>* p_predicates) {
+  auto& red_axis = *p_red_axis;
+  auto& new_axis = *p_new_axis;
+  auto& dom_map = *p_dom_map;
+  auto& vsub = *p_vsub;
+  auto& vsub2newvar = *p_vsub2newvar;
+  auto& predicates = *p_predicates;
+
+  for (IterVar iv : op->reduce_axis) {
     red_axis.insert(iv);
   }
-  std::unordered_map<IterVar, Range> dom_map;
-  Array<IterVar> new_axis;
-
-  for (IterVar iv : compute->axis) {
+  for (IterVar iv : op->axis) {
     dom_map[iv] = iv->dom;
   }
   schedule::PassDownDomain(orig_stage, &dom_map, true);
-  std::unordered_map<const Variable*, Expr> vsub;
-  std::unordered_map<const Variable*, Expr> vsub2newvar;
-  std::vector<Expr> predicates;
   {
     // The source->cache
     std::unordered_map<IterVar, Expr> value_map;
@@ -178,17 +178,85 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
     }
     // skip reduction iteration.
     std::unordered_set<IterVar> skip_bound_check;
-    for (IterVar iv : compute->reduce_axis) {
+    for (IterVar iv : op->reduce_axis) {
       skip_bound_check.insert(iv);
     }
     schedule::PassUpIndex(orig_stage, dom_map, &value_map, true);
     predicates = schedule::MakeBoundCheck(
         orig_stage, dom_map, value_map, true, skip_bound_check);
     // The root axis
-    for (IterVar iv : compute->axis) {
-      vsub[iv->var.get()] = value_map.at(iv);
+    for (IterVar iv : op->axis) {
+      if (value_map.count(iv)) {
+        vsub[iv->var.get()] = value_map.at(iv);
+      }  // to handle tensor axis
     }
   }
+}
+
+Array<Tensor> ReplaceOriginalOp(Schedule sch,
+                                Stage orig_stage,
+                                const std::string& scope,
+                                Operation cache_op,
+                                Operation orig_new_op,
+                                size_t tensor_size) {
+  Array<Tensor> cache_tensor_list;
+  for (size_t i = 0; i < tensor_size; i++) {
+    Tensor cache_tensor = cache_op.output(i);
+    cache_tensor_list.push_back(cache_tensor);
+  }
+  // The replace of the dataflow
+  std::unordered_map<Tensor, Tensor> vmap;
+  std::unordered_map<Tensor, Tensor> rvmap;
+  vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
+  rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  for (size_t i = 0; i < tensor_size; i++) {
+    vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
+    rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  }
+  ReplaceDataFlow(sch->stages, &vmap, &rvmap);
+  // mutate orig stage
+  orig_stage->op = orig_new_op;
+  orig_stage->all_iter_vars = orig_stage->op->root_iter_vars();
+  orig_stage->leaf_iter_vars = orig_stage->all_iter_vars;
+  orig_stage->relations = Array<IterVarRelation>();
+  // create schedule for new cached stage.
+  ArrayNode* stages = sch->stages.CopyOnWrite();
+  size_t pos = FindNodeRef(stages, orig_stage);
+  Stage cache_stage = Stage(cache_op);
+  cache_stage.set_scope(scope);
+  CHECK_LT(pos, stages->data.size());
+  stages->data.insert(stages->data.begin() + pos,
+                      cache_stage.node_);
+  sch->stage_map.Set(cache_op, cache_stage);
+  // Update group
+  cache_stage->group = orig_stage->group;
+  if (cache_stage->group.defined()) {
+    ++cache_stage->group->num_child_stages;
+  }
+  return cache_tensor_list;
+}
+
+
+// Cache write and relayout the data according to loop pattern
+Array<Tensor> CacheWriteWithReLayout(Schedule sch,
+                                     const Array<Tensor>& tensor_array,
+                                     const std::string& scope) {
+  size_t tensor_size = tensor_array.size();
+  sch->InvalidateCache();
+  Tensor tensor = tensor_array[0];
+  Stage orig_stage = sch[tensor->op];
+  const ComputeOpNode* compute = orig_stage->op.as<ComputeOpNode>();
+
+  std::unordered_set<IterVar> red_axis;
+  Array<IterVar> new_axis;
+  std::unordered_map<IterVar, Range> dom_map;
+
+  std::unordered_map<const Variable*, Expr> vsub;
+  std::unordered_map<const Variable*, Expr> vsub2newvar;
+  std::vector<Expr> predicates;
+
+  PrepareAxisMapping(orig_stage, compute,
+    &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar, &predicates);
 
   Expr body;
   Array<Expr> body_list;
@@ -198,7 +266,7 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
     body = InjectPredicate(predicates, body);
     body = VarReplacer(vsub2newvar).Mutate(body);
     // Reduce nodes in ONE computeOp must be the same except value_index
-    // This is right only if the oringinal body ensures Reduce nodes are the same
+    // This is right only if the original body ensures Reduce nodes are the same
     if (body->is_type<ir::Reduce>()) {
       const ir::Reduce* reduce_body = body.as<ir::Reduce>();
       if (first_reduce != nullptr) {
@@ -234,48 +302,107 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
   Operation cache_op = ComputeOpNode::make(
       compute->name + "." + scope, compute->tag, compute->attrs,
       new_axis, body_list);
-  Array<Tensor> cache_tensor_list;
+
   Array<Expr> cache_expr_list;
   for (size_t i = 0; i < tensor_size; i++) {
     Tensor cache_tensor = cache_op.output(i);
-    cache_tensor_list.push_back(cache_tensor);
     cache_expr_list.push_back(cache_tensor(args));
   }
   Operation orig_new_op = ComputeOpNode::make(
       compute->name, compute->tag, compute->attrs,
       compute->axis, cache_expr_list);
-  // The replace of the dataflow
-  std::unordered_map<Tensor, Tensor> vmap;
-  std::unordered_map<Tensor, Tensor> rvmap;
-  vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
-  rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
-  for (size_t i = 0; i < tensor_size; i++) {
-    vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
-    rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  return ReplaceOriginalOp(sch, orig_stage, scope,
+    cache_op, orig_new_op, tensor_size);
+}
+
+
+// for tensor compute op
+Array<Tensor> CacheWriteWithReLayoutTensor(Schedule sch,
+                                           const Array<Tensor>& tensor_array,
+                                           const std::string& scope) {
+  size_t tensor_size = tensor_array.size();
+  sch->InvalidateCache();
+  Tensor tensor = tensor_array[0];
+  Stage orig_stage = sch[tensor->op];
+  const TensorComputeOpNode* tensor_op = orig_stage->op.as<TensorComputeOpNode>();
+  CHECK_EQ(tensor_op->num_outputs(), 1)
+      << "cache write only support single output tensor_compute_op";
+
+  std::unordered_set<IterVar> red_axis;
+  Array<IterVar> new_axis;
+  std::unordered_map<IterVar, Range> dom_map;
+
+  std::unordered_map<const Variable*, Expr> vsub;
+  std::unordered_map<const Variable*, Expr> vsub2newvar;
+  std::vector<Expr> predicates;
+
+  PrepareAxisMapping(orig_stage, tensor_op,
+    &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar, &predicates);
+
+
+  for (int i = tensor_op->schedulable_ndim; i < static_cast<int>(tensor_op->axis.size()); ++i) {
+    IterVar iv = tensor_op->axis[i];
+    IterVar new_iv = IterVarNode::make(
+      iv->dom, iv->var.copy_with_suffix(".c"), iv->iter_type);
+    new_axis.push_back(new_iv);
+  }
+  Array<Region> new_regions;
+  for (Region old_region : tensor_op->input_regions) {
+    Region region;
+    for (Range r : old_region) {
+      Expr min = VarReplacer(vsub2newvar).Mutate(r->min);
+      Expr extent = VarReplacer(vsub2newvar).Mutate(r->extent);
+      region.push_back(Range::make_by_min_extent(min, extent));
+    }
+    new_regions.push_back(region);
   }
-  ReplaceDataFlow(sch->stages, &vmap, &rvmap);
-  // mutate orig stage
-  orig_stage->op = orig_new_op;
-  orig_stage->all_iter_vars = orig_stage->op->root_iter_vars();
-  orig_stage->leaf_iter_vars = orig_stage->all_iter_vars;
-  orig_stage->relations = Array<IterVarRelation>();
-  // create schedule for new cached stage.
-  ArrayNode* stages = sch->stages.CopyOnWrite();
-  size_t pos = FindNodeRef(stages, orig_stage);
-  Stage cache_stage = Stage(cache_op);
-  cache_stage.set_scope(scope);
-  CHECK_LT(pos, stages->data.size());
-  stages->data.insert(stages->data.begin() + pos,
-                      cache_stage.node_);
-  sch->stage_map.Set(cache_op, cache_stage);
-  // Update group
-  cache_stage->group = orig_stage->group;
-  if (cache_stage->group.defined()) {
-    ++cache_stage->group->num_child_stages;
+
+  Operation cache_op = TensorComputeOpNode::make(
+      tensor_op->name + "." + scope, tensor_op->tag, new_axis,
+      tensor_op->reduce_axis, tensor_op->schedulable_ndim,
+      tensor_op->intrin, tensor_op->inputs, new_regions);
+
+  // axis will be used in generating compute op
+  Array<IterVar> compute_axis = tensor_op->axis;
+  for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
+    IterVar iv = tensor_op->axis[i];
+    IterVar aiv = IterVarNode::make(iv->dom, iv->var, kDataPar);
+    compute_axis.Set(i, aiv);
   }
-  return cache_tensor_list;
+
+  // The reader args
+  Array<Expr> args;
+  {
+    // cache->compute
+    std::unordered_map<IterVar, Expr> value_map;
+    for (IterVar iv : compute_axis) {
+      value_map[iv] = iv->var;
+    }
+    schedule::PassDownIndex(orig_stage, dom_map, &value_map, true);
+    for (IterVar iv : orig_stage->leaf_iter_vars) {
+      if (red_axis.count(iv)) continue;
+      args.push_back(value_map.at(iv));
+    }
+    // tensorized region axis
+    for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
+      IterVar iv = compute_axis[i];
+      args.push_back(value_map.at(iv));
+    }
+  }
+
+  Array<Expr> cache_expr_list;
+  for (size_t i = 0; i < tensor_size; i++) {
+    Tensor cache_tensor = cache_op.output(i);
+    cache_expr_list.push_back(cache_tensor(args));
+  }
+  Operation orig_new_op = ComputeOpNode::make(
+      tensor_op->name, tensor_op->tag, {},
+      compute_axis, cache_expr_list);
+  return ReplaceOriginalOp(sch, orig_stage, scope,
+    cache_op, orig_new_op, tensor_size);
 }
 
+
 Array<Tensor> Schedule::cache_write(const Array<Tensor>& tensor_array,
                              const std::string& scope) {
   (*this)->InvalidateCache();
@@ -291,23 +418,26 @@ Array<Tensor> Schedule::cache_write(const Array<Tensor>& tensor_array,
     CHECK(orig_stage.same_as(tmp_stage))
         << "Input tensor list must be generated by ONE computeOp";
   }
-
   return CacheWriteWithReLayout(*this, tensor_array, scope);
 }
 
+
 Tensor Schedule::cache_write(const Tensor& tensor,
                              const std::string& scope) {
+  // support original compute and tensor compute both
   (*this)->InvalidateCache();
-  Stage orig_stage = operator[](tensor->op);
-  const ComputeOpNode* compute = tensor->op.as<ComputeOpNode>();
-  CHECK(compute)
-      << "cache write only take ComputeOp as writers";
-  CHECK_EQ(compute->num_outputs(), 1)
-      << "cache write only support single output ComputeOp";
-
-  return (CacheWriteWithReLayout(*this, {tensor}, scope))[0];
+  const char* type_key = tensor->op->type_key();
+  if (!strcmp(type_key, "ComputeOp")) {
+    return (CacheWriteWithReLayout(*this, {tensor}, scope))[0];
+  } else if (!strcmp(type_key, "TensorComputeOp")) {
+    return (CacheWriteWithReLayoutTensor(*this, {tensor}, scope))[0];
+  } else {
+    LOG(FATAL) << "cache write only take ComputeOp or TensorComputeOp as writers";
+    return Tensor();
+  }
 }
 
+
 void RebaseNonZeroMinLoop(const Schedule& sch) {
   std::unordered_map<IterVar, IterVar> rebase_map;
   for (Stage s : sch->stages) {
diff --git a/tests/python/unittest/test_lang_tensor.py b/tests/python/unittest/test_lang_tensor.py
index f562a48e44ae..50492ca41fca 100644
--- a/tests/python/unittest/test_lang_tensor.py
+++ b/tests/python/unittest/test_lang_tensor.py
@@ -85,6 +85,78 @@ def test_tensor_reduce():
     assert(isinstance(C_loaded, tvm.tensor.Tensor))
     assert(str(C_loaded) == str(C))
 
+def test_tensor_compute1():
+    m = 1024
+    factor = 16
+    dtype = 'float32'
+
+    def intrin_vadd(n):
+        x = tvm.placeholder((n,))
+        y = tvm.placeholder((n,))
+        z = tvm.compute(x.shape, lambda i: x[i] + y[i])
+
+        def intrin_func(ins, outs):
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
+            return ib.get()
+
+        with tvm.build_config(offset_factor=n):
+            return tvm.decl_tensor_intrin(z.op, intrin_func)
+
+    vadd = intrin_vadd(factor)
+
+    A = tvm.placeholder((m//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((m//factor, factor), name="B", dtype=dtype)
+    C = tvm.compute((m//factor, factor),
+          lambda i: vadd(A[i, 0:factor], B[i, 0:factor]))
+
+    s = tvm.create_schedule(C.op)
+    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
+    assert isinstance(stmt.body.body, tvm.stmt.Evaluate)
+
+def test_tensor_compute2():
+    M = 2048
+    N = 1024
+    L = 1024
+    factor = 16
+    factor1 = 32
+    factor2 = 32
+    dtype = 'float32'
+
+    def intrin_gemm(m, n, l):
+        k = tvm.reduce_axis((0, l))
+        x = tvm.placeholder((m, l))
+        y = tvm.placeholder((n, l))
+        # in theory, no relation
+        z = tvm.compute((m, n), lambda i, j: tvm.sum(x[i][k] * y[j][k], axis=k))
+
+        def intrin_func(ins, outs):
+            x_ptr = ins[0].access_ptr("r")
+            y_ptr = ins[1].access_ptr("r")
+            z_ptr = outs[0].access_ptr("w")
+            body = tvm.call_packed(
+                "gemv", x_ptr, y_ptr, z_ptr, m, n, l)
+            reset = tvm.call_packed(
+                "fill_zero", z_ptr, m, n)
+            update = tvm.call_packed(
+                "gemv_add", x_ptr, y_ptr, z_ptr, m, n, l)
+            return body, reset, update
+
+        with tvm.build_config(offset_factor=n):
+            return tvm.decl_tensor_intrin(z.op, intrin_func)
+
+    vgemm = intrin_gemm(factor1, factor2, factor)
+
+    A = tvm.placeholder((M//factor1, L//factor, factor1, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((N//factor2, L//factor, factor2, factor), name="B", dtype=dtype)
+    k = tvm.reduce_axis((0, L//factor), name='k')
+    C = tvm.compute((M//factor1, N//factor2, factor1, factor2),
+          lambda i, j: vgemm(A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k))
+
+    s = tvm.create_schedule(C.op)
+    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
+    assert isinstance(stmt.body.body.body.first, tvm.stmt.Evaluate)
+    assert isinstance(stmt.body.body.body.rest.body, tvm.stmt.Evaluate)
 
 def test_tensor_scan():
     m = tvm.var("m")
@@ -221,6 +293,8 @@ def intrin_func(ins, outs):
     test_conv1d()
     test_tensor_slice()
     test_tensor()
+    test_tensor_compute1()
+    test_tensor_compute2()
     test_tensor_reduce()
     test_tensor_scan()
     test_scan_multi_out()
diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py
index 8e6f4090d403..8774514cfa17 100644
--- a/tests/python/unittest/test_schedule_schedule_ops.py
+++ b/tests/python/unittest/test_schedule_schedule_ops.py
@@ -276,6 +276,133 @@ def test_schedule_bound_condition():
    stmt = tvm.ir_pass.Simplify(stmt)
    assert (isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse))
 
+
+def intrin_gemv(m, n):
+    w = tvm.placeholder((m, n), name='w')
+    x = tvm.placeholder((n,), name='x')
+    k = tvm.reduce_axis((0, n), name='k')
+    z = tvm.compute((m,), lambda i:
+                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
+    Wb = tvm.decl_buffer(w.shape, w.dtype,
+                         name="W",
+                         offset_factor=16,
+                         strides=[tvm.var('ldw'), 1])
+    def intrin_func(ins, outs):
+        ww, xx = ins
+        zz = outs[0]
+        ww_ptr = ww.access_ptr("r")
+        xx_ptr = xx.access_ptr("r")
+        zz_ptr = zz.access_ptr("w")
+        body = tvm.call_packed(
+            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
+        reset = tvm.call_packed(
+            "fill_zero", zz_ptr, n)
+        update = tvm.call_packed(
+            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
+        return body, reset, update
+
+    with tvm.build_config(data_alignment=16,
+                          offset_factor=16):
+        return tvm.decl_tensor_intrin(z.op, intrin_func,
+                                      binds={w: Wb})
+
+
+def test_schedule_tensor_compute1():
+    # basic: split, reorder, tile
+    M, N, L = 2048, 1024, 512
+    factor, rfactor = 16, 16
+    A = tvm.placeholder((N//factor, L//rfactor, factor, rfactor), name='A')
+    B = tvm.placeholder((M, L//rfactor, rfactor), name='B')
+    k = tvm.reduce_axis((0, L//rfactor), name='k')
+
+    gemv = intrin_gemv(factor, rfactor)
+    C = tvm.compute((N, M//factor, factor),
+        lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k),
+        name='C')
+
+    s = tvm.create_schedule(C.op)
+    ai, aj, ax = s[C].op.axis
+    aio, aii = s[C].split(ai, 16)
+    s[C].reorder(aio, aj, aii)
+    aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4)
+
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
+def intrin_vadd(n, cache_read=False, cache_write=False):
+    scope_ubuf = 'local'
+    dtype = 'float32'
+    x = tvm.placeholder((n,), dtype=dtype, name='vx')
+    y = tvm.placeholder((n,), dtype=dtype, name='vy')
+    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    s = tvm.create_schedule(z.op)
+
+    def create_buffer(t):
+        return tvm.decl_buffer(t.shape, t.dtype,
+                               name='W'+t.name,
+                               scope=scope_ubuf,
+                               offset_factor=16)
+
+    binds = {}
+    if cache_read:
+        binds[x] = create_buffer(x)
+        binds[y] = create_buffer(y)
+    if cache_write:
+        binds[z] = create_buffer(z)
+
+    def intrin_func(ins, outs):
+        ib = tvm.ir_builder.create()
+        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
+        return ib.get()
+
+    with tvm.build_config(offset_factor=16):
+        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
+
+
+def test_schedule_tensor_compute2():
+    # cache_read, cache_write
+    M = 1024
+    factor = 16
+    dtype = 'float32'
+    scope_ubuf = 'local'
+
+    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+
+    vadd = intrin_vadd(factor, True, True)
+    C = tvm.compute((M//factor, factor),
+        lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C')
+
+    s = tvm.create_schedule(C.op)
+    AL = s.cache_read(A, scope_ubuf, C)
+    BL = s.cache_read(B, scope_ubuf, C)
+    CL = s.cache_write(C, scope_ubuf)
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
+def test_schedule_tensor_compute3():
+    # compute_at
+    M = 1024
+    factor = 16
+    dtype = 'float32'
+    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+    Bi = tvm.compute((M//factor, factor), lambda i, j: B[i, j] + 5, name="Bi")
+
+    vadd = intrin_vadd(factor)
+    C = tvm.compute((M//factor, factor),
+        lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), name='C')
+    s = tvm.create_schedule(C.op)
+    s[Bi].compute_at(s[C], C.op.axis[0])
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
 if __name__ == "__main__":
     test_schedule_middle_cache()
     test_inline_multi_reduce()
@@ -294,3 +421,6 @@ def test_schedule_bound_condition():
     test_schedule2()
     test_schedule_cache()
     test_schedule_bound_condition()
+    test_schedule_tensor_compute1()
+    test_schedule_tensor_compute2()
+    test_schedule_tensor_compute3()

From 65560d89512edf4f410b65aaa85722a09befac10 Mon Sep 17 00:00:00 2001
From: Meghan Cowan <cowanmeg@cs.washington.edu>
Date: Fri, 5 Oct 2018 21:54:47 -0700
Subject: [PATCH 178/529] [RELAY] Add sigmoid relay operator (#1836)

---
 docs/langref/relay_op.rst            |  2 ++
 python/tvm/relay/op/tensor.py        | 16 ++++++++++++++++
 src/relay/op/tensor/unary.cc         | 11 +++++++++++
 tests/python/relay/test_op_level1.py | 16 ++++++++++++++++
 4 files changed, 45 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index bcaece0bf0a1..a4728eaa7db7 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -25,6 +25,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.log
    tvm.relay.sqrt
    tvm.relay.exp
+   tvm.relay.sigmoid
    tvm.relay.add
    tvm.relay.expand_dims
 
@@ -61,6 +62,7 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.log
 .. autofunction:: tvm.relay.sqrt
 .. autofunction:: tvm.relay.exp
+.. autofunction:: tvm.relay.sigmoid
 .. autofunction:: tvm.relay.add
 
 
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index c8c42c1a6ca4..05538c401a4b 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -60,6 +60,22 @@ def sqrt(data):
     return _make.sqrt(data)
 
 
+def sigmoid(data):
+    """Compute elementwise sigmoid of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.sigmoid(data)
+
+
 def add(lhs, rhs):
     """Addition with numpy-style broadcasting.
 
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 798d4aa791ad..77bb08e14257 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -65,6 +65,17 @@ RELAY_REGISTER_UNARY_OP("sqrt")
 .add_type_rel("Identity", IdentityRel);
 
 
+RELAY_REGISTER_UNARY_OP("sigmoid")
+.describe(R"code(Returns the sigmoid input array, computed element-wise.
+
+.. math::
+   sigmoid(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+
 // Concat
 TVM_REGISTER_API("relay.op._make.concat")
   .set_body_typed<Expr(Expr)>([](Expr tuple) {
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index c1c8b03c1c23..9cfca9630561 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -16,5 +16,21 @@ def test_expand_dims_infer_type():
         (n, t, 1, 100), "float32")
 
 
+def test_unary_op():
+    for op in [relay.exp,
+               relay.log,
+               relay.sqrt,
+               relay.sigmoid]:
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((10, 4), "int32"))
+        with ib.function(x) as func:
+            ib.ret(op(x.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type()
+        assert ftype.ret_type == relay.TensorType((10, 4), "int32")
+
+
 if __name__ == "__main__":
     test_expand_dims_infer_type()
+    test_unary_op()

From 2038455b9a95c453912c9db7c1eea98874faa8c1 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <moreau@cs.washington.edu>
Date: Fri, 5 Oct 2018 22:07:01 -0700
Subject: [PATCH 179/529] [RELAY][OP] Left shift operator (#1839)

---
 docs/langref/relay_op.rst            |  2 ++
 python/tvm/relay/op/tensor.py        | 19 +++++++++++++-
 src/relay/op/tensor/binary.cc        | 38 ++++++++++++++++++++--------
 tests/python/relay/test_op_level4.py |  3 ++-
 4 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index a4728eaa7db7..59396e2ba91f 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -47,6 +47,7 @@ This level enables typical convnet models.
    :nosignatures:
 
    tvm.relay.right_shift
+   tvm.relay.left_shift
    tvm.relay.equal
    tvm.relay.not_equal
    tvm.relay.greater
@@ -74,6 +75,7 @@ Level 2 Definitions
 Level 4 Definitions
 -------------------
 .. autofunction:: tvm.relay.right_shift
+.. autofunction:: tvm.relay.left_shift
 .. autofunction:: tvm.relay.equal
 .. autofunction:: tvm.relay.not_equal
 .. autofunction:: tvm.relay.greater
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 05538c401a4b..a82512a4d5a8 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -120,7 +120,6 @@ def subtract(lhs, rhs):
     return _make.subtract(lhs, rhs)
 
 
-
 def equal(lhs, rhs):
     """Broadcasted elementwise test for (lhs == rhs).
 
@@ -247,6 +246,24 @@ def right_shift(lhs, rhs):
     return _make.right_shift(lhs, rhs)
 
 
+def left_shift(lhs, rhs):
+    """Left shift with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.left_shift(lhs, rhs)
+
+
 def concat(*args):
     """Concatenate the input tensors along the zero axis.
 
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 4c0fa657bac4..518b6050f997 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -27,16 +27,23 @@ RELAY_REGISTER_BINARY_OP("add")
 .describe("Elementwise add with with broadcasting")
 .set_support_level(1);
 
+// Subtraction
 RELAY_REGISTER_BINARY_OP("subtract")
 .describe("Elementwise substract with broadcasting")
 .set_support_level(1);
 
+// Right shift
 RELAY_REGISTER_BINARY_OP("right_shift")
 .describe("Elementwise right shift with broadcasting")
 .set_support_level(4);
 
+// Left shift
+RELAY_REGISTER_BINARY_OP("left_shift")
+.describe("Elementwise left shift with broadcasting")
+.set_support_level(4);
+
 // Comparisons
-#define RELAY_REGISTER_CMP_OP(OpName, SupportLevel)                 \
+#define RELAY_REGISTER_CMP_OP(OpName)                               \
   TVM_REGISTER_API("relay.op._make." OpName)                        \
   .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
       static const Op& op = Op::Get(OpName);                        \
@@ -46,15 +53,26 @@ RELAY_REGISTER_BINARY_OP("right_shift")
     .set_num_inputs(2)                                              \
     .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
     .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
-    .set_support_level(SupportLevel)                                \
-    .add_type_rel("BroadcastComp", BroadcastCompRel);
-
-RELAY_REGISTER_CMP_OP("equal", 4);
-RELAY_REGISTER_CMP_OP("not_equal", 4);
-RELAY_REGISTER_CMP_OP("less", 4);
-RELAY_REGISTER_CMP_OP("less_equal", 4);
-RELAY_REGISTER_CMP_OP("greater", 4);
-RELAY_REGISTER_CMP_OP("greater_equal", 4);
+    .add_type_rel("BroadcastComp", BroadcastCompRel)
+
+RELAY_REGISTER_CMP_OP("equal")
+.describe("Elementwise equal compare with broadcasting")
+.set_support_level(4);
+RELAY_REGISTER_CMP_OP("not_equal")
+.describe("Elementwise not equal with broadcasting")
+.set_support_level(4);
+RELAY_REGISTER_CMP_OP("less")
+.describe("Elementwise less than with broadcasting")
+.set_support_level(4);
+RELAY_REGISTER_CMP_OP("less_equal")
+.describe("Elementwise less than or equal compare with broadcasting")
+.set_support_level(4);
+RELAY_REGISTER_CMP_OP("greater")
+.describe("Elementwise greater than compare with broadcasting")
+.set_support_level(4);
+RELAY_REGISTER_CMP_OP("greater_equal")
+.describe("Elementwise greater than or equal compare with broadcasting")
+.set_support_level(4);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 5009994871f7..ec2a17e94e7f 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -21,7 +21,8 @@ def test_cmp_type():
 
 
 def test_binary_broadcast():
-    for op in [relay.right_shift]:
+    for op in (relay.right_shift,
+               relay.left_shift):
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))

From bb87f4d755c5f3be33fa0d9650b017bed6d315ce Mon Sep 17 00:00:00 2001
From: Liang Luo <llmainland@live.cn>
Date: Fri, 5 Oct 2018 22:09:57 -0700
Subject: [PATCH 180/529] maximum relay op V2 (#1838)

---
 docs/langref/relay_op.rst            |  2 ++
 python/tvm/relay/op/tensor.py        | 18 ++++++++++++++++++
 src/relay/op/tensor/binary.cc        |  5 ++++-
 tests/python/relay/test_op_level4.py |  5 +++--
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 59396e2ba91f..9810f7c2c06d 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -54,6 +54,7 @@ This level enables typical convnet models.
    tvm.relay.greater_equal
    tvm.relay.less
    tvm.relay.less_equal
+   tvm.relay.maximum
 
 **Level 5: Vision/Image Operators**
 
@@ -82,3 +83,4 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.greater_equal
 .. autofunction:: tvm.relay.less
 .. autofunction:: tvm.relay.less_equal
+.. autofunction:: tvm.relay.maximum
\ No newline at end of file
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index a82512a4d5a8..fb0ecc618216 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -228,6 +228,24 @@ def greater_equal(lhs, rhs):
     return _make.greater_equal(lhs, rhs)
 
 
+def maximum(lhs, rhs):
+    """Maximum with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.maximum(lhs, rhs)
+
+
 def right_shift(lhs, rhs):
     """Right shift with numpy-style broadcasting.
 
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 518b6050f997..d16c2ea72203 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -37,10 +37,13 @@ RELAY_REGISTER_BINARY_OP("right_shift")
 .describe("Elementwise right shift with broadcasting")
 .set_support_level(4);
 
-// Left shift
 RELAY_REGISTER_BINARY_OP("left_shift")
 .describe("Elementwise left shift with broadcasting")
 .set_support_level(4);
+  
+RELAY_REGISTER_BINARY_OP("maximum")
+.describe("Elementwise maximum of two tensors with broadcasting")
+.set_support_level(4);
 
 // Comparisons
 #define RELAY_REGISTER_CMP_OP(OpName)                               \
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index ec2a17e94e7f..a4b8cebd297d 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -21,8 +21,9 @@ def test_cmp_type():
 
 
 def test_binary_broadcast():
-    for op in (relay.right_shift,
-               relay.left_shift):
+    for op in [relay.right_shift,
+               relay.left_shift,
+               relay.maximum]:
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))

From 1ad64c8fe14f6972717274db5e97095fe79d1228 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 5 Oct 2018 22:16:01 -0700
Subject: [PATCH 181/529] Fix lint error missed during CI outrage (#1846)

---
 src/relay/op/tensor/binary.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index d16c2ea72203..7f8f6884c597 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -40,7 +40,7 @@ RELAY_REGISTER_BINARY_OP("right_shift")
 RELAY_REGISTER_BINARY_OP("left_shift")
 .describe("Elementwise left shift with broadcasting")
 .set_support_level(4);
-  
+
 RELAY_REGISTER_BINARY_OP("maximum")
 .describe("Elementwise maximum of two tensors with broadcasting")
 .set_support_level(4);

From 1d2a012904cbf97ec7d5ca3b72f4caa85f5b9d83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Fri, 5 Oct 2018 22:56:18 -0700
Subject: [PATCH 182/529] [Relay] [Op] zeros_like and ones_like (#1835)

---
 docs/langref/relay_op.rst            |  6 ++++++
 python/tvm/relay/op/tensor.py        | 32 ++++++++++++++++++++++++++++
 src/relay/op/tensor/unary.cc         | 14 ++++++++----
 tests/python/relay/test_op_level3.py | 13 +++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)
 create mode 100644 tests/python/relay/test_op_level3.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 9810f7c2c06d..6563903a9cbf 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -41,6 +41,12 @@ This level enables typical convnet models.
 
 **Level 3: Additional Math And Transform Operators**
 
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.zeros_like
+   tvm.relay.ones_like
+
 **Level 4: Broadcast and Reductions**
 
 .. autosummary::
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index fb0ecc618216..d6e4f32ae553 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -295,3 +295,35 @@ def concat(*args):
     """
     tup = Tuple(list(args))
     return _make.concat(tup)
+
+
+def zeros_like(data):
+    """Returns an array of zeros, with same type and shape as the input.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.zeros_like(data)
+
+
+def ones_like(data):
+    """Returns an array of ones, with same type and shape as the input.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.ones_like(data)
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 77bb08e14257..cfcc14e4276f 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -56,14 +56,21 @@ RELAY_REGISTER_UNARY_OP("exp")
 
 RELAY_REGISTER_UNARY_OP("sqrt")
 .describe(R"code(Returns the sqrt input array, computed element-wise.
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
 
-.. math::
-   sqrt(x)
-
+RELAY_REGISTER_UNARY_OP("zeros_like")
+.describe(R"code(Returns an array of zeros, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
+RELAY_REGISTER_UNARY_OP("ones_like")
+.describe(R"code(Returns an array of ones, with same type and shape as the input.
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
 
 RELAY_REGISTER_UNARY_OP("sigmoid")
 .describe(R"code(Returns the sigmoid input array, computed element-wise.
@@ -75,7 +82,6 @@ RELAY_REGISTER_UNARY_OP("sigmoid")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-
 // Concat
 TVM_REGISTER_API("relay.op._make.concat")
   .set_body_typed<Expr(Expr)>([](Expr tuple) {
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
new file mode 100644
index 000000000000..450a7c3458f7
--- /dev/null
+++ b/tests/python/relay/test_op_level3.py
@@ -0,0 +1,13 @@
+import tvm
+from tvm import relay
+
+def test_unary_identity():
+    for op in [relay.zeros_like, relay.ones_like]:
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((8, 9, 4), "int32"))
+        with ib.function(x) as func:
+            ib.ret(op(x.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type()
+        assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")

From acddf8efd2c4e173cbba43b9f800a210f628b097 Mon Sep 17 00:00:00 2001
From: Pratyush Patel <pratyushpatel.1995@gmail.com>
Date: Sat, 6 Oct 2018 10:20:12 -0700
Subject: [PATCH 183/529] [RELAY][OP] Add relay minimum op (#1840)

---
 docs/langref/relay_op.rst            |  4 +++-
 python/tvm/relay/op/tensor.py        | 18 ++++++++++++++++++
 src/relay/op/tensor/binary.cc        |  4 ++++
 tests/python/relay/test_op_level4.py |  3 ++-
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 6563903a9cbf..8566404561b2 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -61,6 +61,7 @@ This level enables typical convnet models.
    tvm.relay.less
    tvm.relay.less_equal
    tvm.relay.maximum
+   tvm.relay.minimum
 
 **Level 5: Vision/Image Operators**
 
@@ -89,4 +90,5 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.greater_equal
 .. autofunction:: tvm.relay.less
 .. autofunction:: tvm.relay.less_equal
-.. autofunction:: tvm.relay.maximum
\ No newline at end of file
+.. autofunction:: tvm.relay.maximum
+.. autofunction:: tvm.relay.minimum
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index d6e4f32ae553..859bfdc26799 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -246,6 +246,24 @@ def maximum(lhs, rhs):
     return _make.maximum(lhs, rhs)
 
 
+def minimum(lhs, rhs):
+    """Minimum with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.minimum(lhs, rhs)
+
+
 def right_shift(lhs, rhs):
     """Right shift with numpy-style broadcasting.
 
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 7f8f6884c597..11175f21573d 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -45,6 +45,10 @@ RELAY_REGISTER_BINARY_OP("maximum")
 .describe("Elementwise maximum of two tensors with broadcasting")
 .set_support_level(4);
 
+RELAY_REGISTER_BINARY_OP("minimum")
+.describe("Elementwise minimum of two tensors with broadcasting")
+.set_support_level(4);
+
 // Comparisons
 #define RELAY_REGISTER_CMP_OP(OpName)                               \
   TVM_REGISTER_API("relay.op._make." OpName)                        \
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index a4b8cebd297d..72876780f944 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -23,7 +23,8 @@ def test_cmp_type():
 def test_binary_broadcast():
     for op in [relay.right_shift,
                relay.left_shift,
-               relay.maximum]:
+               relay.maximum,
+               relay.minimum]:
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))

From 05738cb23cad8f3a4c8c1a51ec4aca2ffbb504d4 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sat, 6 Oct 2018 22:58:06 +0530
Subject: [PATCH 184/529] [DOCKER][GOLANG] Fix golang compiler version to 0.10
 (#1848)

---
 docker/Dockerfile.ci_cpu                | 2 +-
 docker/install/ubuntu_install_golang.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 60d811344b07..86a633bf8f3c 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -32,4 +32,4 @@ COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
 RUN bash /install/ubuntu_install_sgx.sh
 
 
-ENV PATH $PATH:/root/.cargo/bin
+ENV PATH $PATH:/root/.cargo/bin:/usr/lib/go-1.10/bin
diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh
index 9585824091a7..e15a456bc15a 100644
--- a/docker/install/ubuntu_install_golang.sh
+++ b/docker/install/ubuntu_install_golang.sh
@@ -1,4 +1,4 @@
 #install the necessary dependancies for golang build
-apt-get update && apt-get install -y golang-go
+apt-get update && apt-get install -y golang-0.10-go
 apt-get update && apt-get install -y godoc
 apt-get update && apt-get install -y golint

From 2c2ca3f7287f5bc48bc346bdaf573a9842340345 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Sat, 6 Oct 2018 10:30:25 -0700
Subject: [PATCH 185/529] Add rust runtime (#1597)

---
 .gitignore                                    |   3 -
 rust/.gitignore                               |   3 +
 rust/.rustfmt.toml                            |  59 ++
 rust/.travis.yml                              |   5 +
 rust/Cargo.toml                               |  28 +
 rust/src/errors.rs                            |  39 +
 rust/src/lib.rs                               |  68 ++
 rust/src/runtime/allocator.rs                 |  52 ++
 rust/src/runtime/array.rs                     | 461 +++++++++++
 rust/src/runtime/c_runtime_api.rs             | 770 ++++++++++++++++++
 rust/src/runtime/graph.rs                     | 466 +++++++++++
 rust/src/runtime/mod.rs                       |  25 +
 rust/src/runtime/module.rs                    |  46 ++
 rust/src/runtime/packed_func.rs               | 286 +++++++
 rust/src/runtime/sgx.rs                       |  82 ++
 rust/src/runtime/threading.rs                 | 334 ++++++++
 rust/src/runtime/workspace.rs                 | 119 +++
 rust/tests/.gitignore                         |   3 +
 rust/tests/build_model.py                     |  53 ++
 rust/tests/test_graph_serde.rs                |  38 +
 rust/tests/test_nnvm/Cargo.toml               |  14 +
 rust/tests/test_nnvm/build.rs                 |  28 +
 rust/tests/test_nnvm/src/build_test_graph.py  |  63 ++
 rust/tests/test_nnvm/src/main.rs              |  80 ++
 rust/tests/test_tvm_basic/Cargo.toml          |  12 +
 rust/tests/test_tvm_basic/build.rs            |  28 +
 .../test_tvm_basic/src/build_test_lib.py      |  21 +
 rust/tests/test_tvm_basic/src/main.rs         |  25 +
 28 files changed, 3208 insertions(+), 3 deletions(-)
 create mode 100644 rust/.gitignore
 create mode 100644 rust/.rustfmt.toml
 create mode 100644 rust/.travis.yml
 create mode 100644 rust/Cargo.toml
 create mode 100644 rust/src/errors.rs
 create mode 100644 rust/src/lib.rs
 create mode 100644 rust/src/runtime/allocator.rs
 create mode 100644 rust/src/runtime/array.rs
 create mode 100644 rust/src/runtime/c_runtime_api.rs
 create mode 100644 rust/src/runtime/graph.rs
 create mode 100644 rust/src/runtime/mod.rs
 create mode 100644 rust/src/runtime/module.rs
 create mode 100644 rust/src/runtime/packed_func.rs
 create mode 100644 rust/src/runtime/sgx.rs
 create mode 100644 rust/src/runtime/threading.rs
 create mode 100644 rust/src/runtime/workspace.rs
 create mode 100644 rust/tests/.gitignore
 create mode 100644 rust/tests/build_model.py
 create mode 100644 rust/tests/test_graph_serde.rs
 create mode 100644 rust/tests/test_nnvm/Cargo.toml
 create mode 100644 rust/tests/test_nnvm/build.rs
 create mode 100755 rust/tests/test_nnvm/src/build_test_graph.py
 create mode 100644 rust/tests/test_nnvm/src/main.rs
 create mode 100644 rust/tests/test_tvm_basic/Cargo.toml
 create mode 100644 rust/tests/test_tvm_basic/build.rs
 create mode 100755 rust/tests/test_tvm_basic/src/build_test_lib.py
 create mode 100644 rust/tests/test_tvm_basic/src/main.rs

diff --git a/.gitignore b/.gitignore
index 833eee1a0774..368764941cec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,10 +91,8 @@ ENV/
 *~
 *.pyc
 *~
-build
 config.mk
 config.cmake
-build_*
 Win32
 *.dir
 perf
@@ -187,7 +185,6 @@ tvm_u.*
 tvm_t.*
 # Mac OS X
 .DS_Store
-build*
 
 # Jetbrain
 .idea
diff --git a/rust/.gitignore b/rust/.gitignore
new file mode 100644
index 000000000000..230ab66104df
--- /dev/null
+++ b/rust/.gitignore
@@ -0,0 +1,3 @@
+Cargo.lock
+target/
+**/*.rs.bk
diff --git a/rust/.rustfmt.toml b/rust/.rustfmt.toml
new file mode 100644
index 000000000000..df9a65dacfaa
--- /dev/null
+++ b/rust/.rustfmt.toml
@@ -0,0 +1,59 @@
+max_width = 100
+hard_tabs = false
+tab_spaces = 2
+newline_style = "Auto"
+use_small_heuristics = "Default"
+indent_style = "Block"
+wrap_comments = false
+comment_width = 80
+normalize_comments = false
+format_strings = false
+format_macro_matchers = false
+format_macro_bodies = true
+empty_item_single_line = true
+struct_lit_single_line = true
+fn_single_line = false
+where_single_line = false
+imports_indent = "Block"
+imports_layout = "Mixed"
+merge_imports = true
+reorder_imports = true
+reorder_modules = true
+reorder_impl_items = false
+type_punctuation_density = "Wide"
+space_before_colon = false
+space_after_colon = true
+spaces_around_ranges = false
+binop_separator = "Front"
+remove_nested_parens = true
+combine_control_expr = true
+struct_field_align_threshold = 0
+match_arm_blocks = true
+force_multiline_blocks = false
+fn_args_density = "Tall"
+brace_style = "SameLineWhere"
+control_brace_style = "AlwaysSameLine"
+trailing_semicolon = true
+trailing_comma = "Vertical"
+match_block_trailing_comma = false
+blank_lines_upper_bound = 1
+blank_lines_lower_bound = 0
+edition = "Edition2015"
+merge_derives = true
+use_try_shorthand = true
+use_field_init_shorthand = false
+force_explicit_abi = true
+condense_wildcard_suffixes = false
+color = "Auto"
+required_version = "0.99.4"
+unstable_features = false
+disable_all_formatting = false
+skip_children = false
+hide_parse_errors = false
+error_on_line_overflow = false
+error_on_unformatted = false
+report_todo = "Never"
+report_fixme = "Never"
+ignore = []
+emit_mode = "Files"
+make_backup = false
diff --git a/rust/.travis.yml b/rust/.travis.yml
new file mode 100644
index 000000000000..63a3d0277c1b
--- /dev/null
+++ b/rust/.travis.yml
@@ -0,0 +1,5 @@
+language: rust
+rust:
+  - nightly
+matrix:
+  fast_finish: true
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
new file mode 100644
index 000000000000..0819e0c70023
--- /dev/null
+++ b/rust/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "tvm"
+version = "0.1.0"
+license = "Apache-2.0"
+description = "TVM Rust runtime"
+repository = "https://github.com/dmlc/tvm"
+readme = "README.md"
+keywords = ["tvm", "nnvm"]
+categories = ["api-bindings", "science"]
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[features]
+default = ["nom/std"]
+sgx = ["nom/alloc"]
+
+[dependencies]
+bounded-spsc-queue = "0.4.0"
+error-chain = { version = "0.12.0", default-features = false }
+itertools = "0.7.8"
+lazy_static = "1.1.0"
+ndarray = "0.11.2"
+nom = {version = "4.0.0", default-features = false }
+serde = "1.0.59"
+serde_derive = "1.0.79"
+serde_json = "1.0.17"
+
+[target.'cfg(not(target_env = "sgx"))'.dependencies]
+num_cpus = "1.8.0"
diff --git a/rust/src/errors.rs b/rust/src/errors.rs
new file mode 100644
index 000000000000..f9da7180b8cc
--- /dev/null
+++ b/rust/src/errors.rs
@@ -0,0 +1,39 @@
+#[cfg(target_env = "sgx")]
+use alloc::alloc;
+#[cfg(not(target_env = "sgx"))]
+use std::alloc;
+use std::num;
+
+use ndarray;
+use serde_json;
+
+error_chain! {
+  errors {
+    TryFromTVMRetValueError(expected: String, actual: i64) {
+      description("mismatched types while downcasting TVMRetValue")
+      display("invalid downcast: expected `{}` but was `{}`", expected, actual)
+    }
+
+    GraphFormatError(msg: String) {
+      description("unable to load graph")
+      display("could not load graph json: {}", msg)
+    }
+
+    LoadGraphParamsError(msg: String) {
+      description("unable to load graph params")
+      display("could not load graph params: {}", msg)
+    }
+  }
+  foreign_links {
+    Alloc(alloc::AllocErr);
+    GraphDeserialize(serde_json::Error);
+    ParseInt(num::ParseIntError);
+    ShapeError(ndarray::ShapeError);
+  }
+}
+
+impl From<alloc::LayoutErr> for Error {
+  fn from(_err: alloc::LayoutErr) -> Error {
+    Error::from_kind(ErrorKind::Msg("Layout error".to_string()))
+  }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 000000000000..4a70e428d37a
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,68 @@
+//! This crate is an implementation of the TVM runtime for modules compiled with `--system-lib`.
+//! It's mainly useful for compiling to WebAssembly and SGX,
+//! but also native if you prefer Rust to C++.
+//!
+//! For TVM graphs, the entrypoint to this crate is `runtime::GraphExecutor`.
+//! Single-function modules are used via the `packed_func!` macro after obtaining
+//! the function from `runtime::SystemLibModule`
+//!
+//! The main entrypoints to this crate are `GraphExecutor`
+//! For examples of use, please refer to the multi-file tests in the `tests` directory.
+
+#![feature(
+  alloc,
+  allocator_api,
+  box_syntax,
+  extern_prelude,
+  fn_traits,
+  try_from,
+  unboxed_closures,
+  vec_remove_item
+)]
+
+#[cfg(target_env = "sgx")]
+extern crate alloc;
+extern crate bounded_spsc_queue;
+#[cfg(target_env = "sgx")]
+extern crate core;
+#[macro_use]
+extern crate error_chain;
+#[macro_use]
+extern crate itertools;
+#[macro_use]
+extern crate lazy_static;
+extern crate ndarray;
+#[macro_use]
+extern crate nom;
+#[cfg(not(target_env = "sgx"))]
+extern crate num_cpus;
+extern crate serde;
+#[macro_use]
+extern crate serde_derive;
+extern crate serde_json;
+
+pub mod ffi {
+  #![allow(
+    non_camel_case_types,
+    non_snake_case,
+    non_upper_case_globals,
+    unused
+  )]
+
+  pub mod runtime {
+    use std::os::raw::{c_char, c_int, c_void};
+
+    include!(concat!(
+      env!("CARGO_MANIFEST_DIR"),
+      "/src/runtime/c_runtime_api.rs"
+    ));
+
+    pub type BackendPackedCFunc =
+      extern "C" fn(args: *const TVMValue, type_codes: *const c_int, num_args: c_int) -> c_int;
+  }
+}
+
+pub mod errors;
+pub mod runtime;
+
+pub use errors::*;
diff --git a/rust/src/runtime/allocator.rs b/rust/src/runtime/allocator.rs
new file mode 100644
index 000000000000..d704336bff1f
--- /dev/null
+++ b/rust/src/runtime/allocator.rs
@@ -0,0 +1,52 @@
+#[cfg(target_env = "sgx")]
+use alloc::alloc::{self, Layout};
+#[cfg(not(target_env = "sgx"))]
+use std::alloc::{self, Layout};
+
+use errors::*;
+
+const DEFAULT_ALIGN_BYTES: usize = 4;
+
+#[derive(PartialEq, Eq)]
+pub struct Allocation {
+  layout: Layout,
+  ptr: *mut u8,
+}
+
+impl Allocation {
+  /// Allocates a chunk of memory of `size` bytes with optional alignment.
+  pub fn new(size: usize, align: Option<usize>) -> Result<Self> {
+    let alignment = align.unwrap_or(DEFAULT_ALIGN_BYTES);
+    let layout = Layout::from_size_align(size, alignment)?;
+    let ptr = unsafe { alloc::alloc(layout.clone()) };
+    if ptr.is_null() {
+      alloc::handle_alloc_error(layout);
+    }
+    Ok(Self {
+      ptr: ptr,
+      layout: layout,
+    })
+  }
+
+  pub fn as_mut_ptr(&self) -> *mut u8 {
+    self.ptr
+  }
+
+  /// Returns the size of the Allocation in bytes.
+  pub fn size(&self) -> usize {
+    self.layout.size()
+  }
+
+  /// Returns the byte alignment of the Allocation.
+  pub fn align(&self) -> usize {
+    self.layout.align()
+  }
+}
+
+impl Drop for Allocation {
+  fn drop(&mut self) {
+    unsafe {
+      alloc::dealloc(self.ptr, self.layout.clone());
+    }
+  }
+}
diff --git a/rust/src/runtime/array.rs b/rust/src/runtime/array.rs
new file mode 100644
index 000000000000..79d22e400cff
--- /dev/null
+++ b/rust/src/runtime/array.rs
@@ -0,0 +1,461 @@
+use std::{
+  any::TypeId,
+  convert::TryFrom,
+  mem,
+  os::raw::{c_int, c_void},
+  ptr, slice,
+};
+
+use ndarray;
+
+use super::allocator::Allocation;
+use errors::*;
+use ffi::runtime::{
+  DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt,
+  DLDeviceType_kDLCPU, DLTensor,
+};
+
+/// A `Storage` is a container which holds `Tensor` data.
+#[derive(PartialEq)]
+pub enum Storage<'a> {
+  /// A `Storage` which owns its contained bytes.
+  Owned(Allocation),
+
+  /// A view of an existing `Storage`.
+  View(&'a mut [u8], usize), // ptr, align
+}
+
+impl<'a> Storage<'a> {
+  pub fn new(size: usize, align: Option<usize>) -> Result<Storage<'static>> {
+    Ok(Storage::Owned(Allocation::new(size, align)?))
+  }
+
+  pub fn as_mut_ptr(&self) -> *mut u8 {
+    match self {
+      Storage::Owned(alloc) => alloc.as_mut_ptr(),
+      Storage::View(slice, _) => slice.as_ptr() as *mut u8,
+    }
+  }
+
+  pub fn size(&self) -> usize {
+    match self {
+      Storage::Owned(alloc) => alloc.size(),
+      Storage::View(slice, _) => slice.len(),
+    }
+  }
+
+  pub fn align(&self) -> usize {
+    match self {
+      Storage::Owned(alloc) => alloc.align(),
+      Storage::View(_, align) => *align,
+    }
+  }
+
+  pub fn as_ptr(&self) -> *const u8 {
+    self.as_mut_ptr() as *const _
+  }
+
+  /// Returns a `Storage::View` which points to an owned `Storage::Owned`.
+  pub fn view(&self) -> Storage<'a> {
+    match self {
+      Storage::Owned(alloc) => Storage::View(
+        unsafe { slice::from_raw_parts_mut(alloc.as_mut_ptr(), self.size()) },
+        self.align(),
+      ),
+      Storage::View(slice, _) => Storage::View(
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), slice.len()) },
+        self.align(),
+      ),
+    }
+  }
+
+  pub fn is_owned(&self) -> bool {
+    match self {
+      Storage::Owned(_) => true,
+      _ => false,
+    }
+  }
+
+  /// Returns an owned version of this storage via cloning.
+  pub fn to_owned(&self) -> Storage<'static> {
+    let s = Storage::new(self.size(), Some(self.align())).unwrap();
+    unsafe {
+      s.as_mut_ptr()
+        .copy_from_nonoverlapping(self.as_ptr(), self.size())
+    }
+    s
+  }
+}
+
+impl<'a, T> From<&'a [T]> for Storage<'a> {
+  fn from(data: &'a [T]) -> Self {
+    let data = unsafe {
+      slice::from_raw_parts_mut(
+        data.as_ptr() as *const u8 as *mut u8,
+        data.len() * mem::size_of::<T>() as usize,
+      )
+    };
+    Storage::View(data, mem::align_of::<T>())
+  }
+}
+
+/// A n-dimensional array type which can be converted to/from `tvm::DLTensor` and `ndarray::Array`.
+/// `Tensor` is primarily a holder of data which can be operated on via TVM (via `DLTensor`) or
+/// converted to `ndarray::Array` for non-TVM processing.
+///
+/// # Examples
+///
+/// ```
+/// extern crate ndarray;
+///
+/// let mut a_nd: ndarray::Array = ndarray::Array::from_vec(vec![1f32, 2., 3., 4.]);
+/// let mut a: Tensor = a_nd.into();
+/// let mut a_dl: DLTensor = (&mut t).into();
+/// call_packed!(tvm_fn, &mut a_dl);
+///
+/// // Array -> Tensor is mostly useful when post-processing TVM graph outputs.
+/// let mut a_nd = ndarray::Array::try_from(&a).unwrap();
+/// ```
+#[derive(PartialEq)]
+pub struct Tensor<'a> {
+  /// The bytes which contain the data this `Tensor` represents.
+  pub(super) data: Storage<'a>,
+  pub(super) ctx: TVMContext,
+  pub(super) dtype: DataType,
+  pub(super) shape: Vec<i64>, // not usize because `typedef int64_t tvm_index_t` in c_runtime_api.h
+  /// The `Tensor` strides. Can be `None` if the `Tensor` is contiguous.
+  pub(super) strides: Option<Vec<usize>>,
+  pub(super) byte_offset: isize,
+  pub(super) size: usize,
+}
+
+unsafe impl<'a> Send for Tensor<'a> {}
+
+impl<'a> Tensor<'a> {
+  pub fn shape(&self) -> Vec<i64> {
+    self.shape.clone()
+  }
+
+  /// Returns the data of this `Tensor` as a `Vec`.
+  ///
+  /// # Panics
+  ///
+  /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
+  pub fn to_vec<T: 'static>(&self) -> Vec<T> {
+    assert!(self.is_contiguous());
+    assert!(self.dtype.is_type::<T>());
+    let mut vec: Vec<T> = Vec::with_capacity(self.size * self.dtype.itemsize());
+    unsafe {
+      vec.as_mut_ptr().copy_from_nonoverlapping(
+        self.data.as_ptr().offset(self.byte_offset) as *const T,
+        self.size,
+      );
+      vec.set_len(self.size);
+    }
+    vec
+  }
+
+  /// Returns `true` iff this `Tensor` is represented by a contiguous region of memory.
+  pub fn is_contiguous(&self) -> bool {
+    match self.strides {
+      None => true,
+      Some(ref strides) => {
+        // check that stride for each dimension is the product of all trailing dimensons' shapes
+        self
+          .shape
+          .iter()
+          .zip(strides)
+          .rfold(
+            (true, 1),
+            |(is_contig, expected_stride), (shape, stride)| {
+              (
+                is_contig && *stride == expected_stride,
+                expected_stride * (*shape as usize),
+              )
+            },
+          ).0
+      }
+    }
+  }
+
+  /// Returns a clone of this `Tensor`.
+  ///
+  /// # Panics
+  ///
+  /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
+  pub fn copy(&mut self, other: &Tensor) {
+    assert!(
+      self.dtype == other.dtype && self.size == other.size,
+      "Tensor shape/dtype mismatch."
+    );
+    assert!(
+      self.is_contiguous() && other.is_contiguous(),
+      "copy currently requires contiguous tensors\n`self.strides = {:?}` `other.strides = {:?}`",
+      self.strides,
+      other.strides
+    );
+    unsafe {
+      self
+        .data
+        .as_mut_ptr()
+        .offset(self.byte_offset as isize)
+        .copy_from_nonoverlapping(
+          other.data.as_mut_ptr().offset(other.byte_offset),
+          other.size * other.dtype.itemsize(),
+        );
+    }
+  }
+
+  /// Returns an owned version of this `Tensor` via cloning.
+  pub fn to_owned(&self) -> Tensor<'static> {
+    let t = Tensor {
+      data: self.data.to_owned(),
+      ctx: self.ctx.clone(),
+      dtype: self.dtype.clone(),
+      size: self.size.clone(),
+      shape: self.shape.clone(),
+      strides: None,
+      byte_offset: 0,
+    };
+    unsafe { mem::transmute::<Tensor<'a>, Tensor<'static>>(t) }
+  }
+
+  fn from_array_storage<'s, T, D: ndarray::Dimension>(
+    arr: &ndarray::Array<T, D>,
+    storage: Storage<'s>,
+    type_code: usize,
+  ) -> Tensor<'s> {
+    let type_width = mem::size_of::<T>() as usize;
+    Tensor {
+      data: storage,
+      ctx: TVMContext::default(),
+      dtype: DataType {
+        code: type_code,
+        bits: 8 * type_width,
+        lanes: 1,
+      },
+      size: arr.len(),
+      shape: arr.shape().iter().map(|&v| v as i64).collect(),
+      strides: Some(arr.strides().into_iter().map(|&v| v as usize).collect()),
+      byte_offset: 0,
+    }
+  }
+}
+
+/// Conversions to `ndarray::Array` from `Tensor`, if the types match.
+macro_rules! impl_ndarray_try_from_tensor {
+  ($type:ty, $dtype:expr) => {
+    impl<'a, 't> TryFrom<&'a Tensor<'t>> for ndarray::ArrayD<$type> {
+      type Error = Error;
+      fn try_from(tensor: &'a Tensor) -> Result<ndarray::ArrayD<$type>> {
+        ensure!(
+          tensor.dtype == $dtype,
+          "Cannot convert Tensor with dtype {:?} to ndarray",
+          tensor.dtype
+        );
+        Ok(ndarray::Array::from_shape_vec(
+          tensor
+            .shape
+            .iter()
+            .map(|s| *s as usize)
+            .collect::<Vec<usize>>(),
+          tensor.to_vec::<$type>(),
+        )?)
+      }
+    }
+  };
+}
+
+impl_ndarray_try_from_tensor!(i32, DTYPE_INT32);
+impl_ndarray_try_from_tensor!(u32, DTYPE_UINT32);
+impl_ndarray_try_from_tensor!(f32, DTYPE_FLOAT32);
+impl_ndarray_try_from_tensor!(f64, DTYPE_FLOAT64);
+
+impl DLTensor {
+  pub(super) fn from_tensor<'a>(tensor: &'a Tensor, flatten: bool) -> Self {
+    assert!(!flatten || tensor.is_contiguous());
+    Self {
+      data: unsafe { tensor.data.as_mut_ptr().offset(tensor.byte_offset) } as *mut c_void,
+      ctx: DLContext::from(&tensor.ctx),
+      ndim: if flatten { 1 } else { tensor.shape.len() } as i32,
+      dtype: DLDataType::from(&tensor.dtype),
+      shape: if flatten {
+        &tensor.size as *const _ as *mut i64
+      } else {
+        tensor.shape.as_ptr()
+      } as *mut i64,
+      strides: if flatten || tensor.is_contiguous() {
+        ptr::null_mut()
+      } else {
+        tensor.strides.as_ref().unwrap().as_ptr()
+      } as *mut i64,
+      byte_offset: 0,
+    }
+  }
+}
+
+impl<'a, 't> From<&'a Tensor<'t>> for DLTensor {
+  fn from(tensor: &'a Tensor<'t>) -> Self {
+    DLTensor::from_tensor(tensor, false /* flatten */)
+  }
+}
+
+impl<'a, 't> From<&'a mut Tensor<'t>> for DLTensor {
+  fn from(tensor: &'a mut Tensor<'t>) -> Self {
+    DLTensor::from_tensor(tensor, false /* flatten */)
+  }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct DataType {
+  pub(super) code: usize,
+  pub(super) bits: usize,
+  pub(super) lanes: usize,
+}
+
+impl DataType {
+  /// Returns the number of bytes occupied by an element of this `DataType`.
+  fn itemsize(&self) -> usize {
+    (self.bits * self.lanes) >> 3
+  }
+
+  /// Returns whether this `DataType` represents primitive type `T`.
+  fn is_type<T: 'static>(&self) -> bool {
+    if self.lanes != 1 {
+      return false;
+    }
+    let typ = TypeId::of::<T>();
+    (typ == TypeId::of::<i32>() && self.code == 0 && self.bits == 32)
+      || (typ == TypeId::of::<i64>() && self.code == 0 && self.bits == 64)
+      || (typ == TypeId::of::<u32>() && self.code == 1 && self.bits == 32)
+      || (typ == TypeId::of::<u64>() && self.code == 1 && self.bits == 64)
+      || (typ == TypeId::of::<f32>() && self.code == 2 && self.bits == 32)
+      || (typ == TypeId::of::<f64>() && self.code == 2 && self.bits == 64)
+  }
+}
+
+impl<'a> From<&'a DataType> for DLDataType {
+  fn from(dtype: &'a DataType) -> Self {
+    Self {
+      code: dtype.code as u8,
+      bits: dtype.bits as u8,
+      lanes: dtype.lanes as u16,
+    }
+  }
+}
+
+macro_rules! make_dtype_const {
+  ($name: ident, $code: ident, $bits: expr, $lanes: expr) => {
+    const $name: DataType = DataType {
+      code: $code as usize,
+      bits: $bits,
+      lanes: $lanes,
+    };
+  };
+}
+
+make_dtype_const!(DTYPE_INT32, DLDataTypeCode_kDLInt, 32, 1);
+make_dtype_const!(DTYPE_UINT32, DLDataTypeCode_kDLUInt, 32, 1);
+// make_dtype_const!(DTYPE_FLOAT16, DLDataTypeCode_kDLFloat, 16, 1);
+make_dtype_const!(DTYPE_FLOAT32, DLDataTypeCode_kDLFloat, 32, 1);
+make_dtype_const!(DTYPE_FLOAT64, DLDataTypeCode_kDLFloat, 64, 1);
+
+impl Default for DLContext {
+  fn default() -> Self {
+    DLContext {
+      device_type: DLDeviceType_kDLCPU,
+      device_id: 0,
+    }
+  }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct TVMContext {
+  pub(super) device_type: usize,
+  pub(super) device_id: usize,
+}
+
+impl<'a> From<&'a TVMContext> for DLContext {
+  fn from(ctx: &'a TVMContext) -> Self {
+    Self {
+      device_type: ctx.device_type as u32,
+      device_id: ctx.device_id as i32,
+    }
+  }
+}
+
+impl Default for TVMContext {
+  fn default() -> Self {
+    Self {
+      device_type: DLDeviceType_kDLCPU as usize,
+      device_id: 0,
+    }
+  }
+}
+
+/// `From` conversions to `Tensor` for owned or borrowed `ndarray::Array`.
+///
+/// # Panics
+///
+/// Panics if the ndarray is not contiguous.
+macro_rules! impl_tensor_from_ndarray {
+  ($type:ty, $typecode:expr) => {
+    impl<D: ndarray::Dimension> From<ndarray::Array<$type, D>> for Tensor<'static> {
+      fn from(arr: ndarray::Array<$type, D>) -> Self {
+        assert!(arr.is_standard_layout(), "Array must be contiguous.");
+        let size = arr.len() * mem::size_of::<$type>() as usize;
+        let storage =
+          Storage::from(unsafe { slice::from_raw_parts(arr.as_ptr() as *const u8, size) });
+        Tensor::from_array_storage(&arr, storage, $typecode as usize)
+      }
+    }
+    impl<'a, D: ndarray::Dimension> From<&'a ndarray::Array<$type, D>> for Tensor<'a> {
+      fn from(arr: &'a ndarray::Array<$type, D>) -> Self {
+        assert!(arr.is_standard_layout(), "Array must be contiguous.");
+        Tensor::from_array_storage(
+          arr,
+          Storage::from(arr.as_slice().unwrap()),
+          $typecode as usize,
+        )
+      }
+    }
+  };
+}
+
+/// `From` conversions to `DLTensor` for `ndarray::Array`.
+/// Takes a reference to the `ndarray` since `DLTensor` is not owned.
+macro_rules! impl_dltensor_from_ndarray {
+  ($type:ty, $typecode:expr) => {
+    impl<'a, D: ndarray::Dimension> From<&'a mut ndarray::Array<$type, D>> for DLTensor {
+      fn from(arr: &'a mut ndarray::Array<$type, D>) -> Self {
+        DLTensor {
+          data: arr.as_mut_ptr() as *mut c_void,
+          ctx: DLContext::default(),
+          ndim: arr.ndim() as c_int,
+          dtype: DLDataType {
+            code: $typecode as u8,
+            bits: 8 * mem::size_of::<$type>() as u8,
+            lanes: 1,
+          },
+          shape: arr.shape().as_ptr() as *const i64 as *mut i64,
+          strides: arr.strides().as_ptr() as *const isize as *mut i64,
+          byte_offset: 0,
+        }
+      }
+    }
+  };
+}
+
+impl_dltensor_from_ndarray!(f32, DLDataTypeCode_kDLFloat);
+impl_dltensor_from_ndarray!(f64, DLDataTypeCode_kDLFloat);
+impl_dltensor_from_ndarray!(i32, DLDataTypeCode_kDLInt);
+impl_dltensor_from_ndarray!(i64, DLDataTypeCode_kDLInt);
+impl_dltensor_from_ndarray!(u32, DLDataTypeCode_kDLUInt);
+impl_dltensor_from_ndarray!(u64, DLDataTypeCode_kDLUInt);
+
+impl_tensor_from_ndarray!(f32, DLDataTypeCode_kDLFloat);
+impl_tensor_from_ndarray!(f64, DLDataTypeCode_kDLFloat);
+impl_tensor_from_ndarray!(i32, DLDataTypeCode_kDLInt);
+impl_tensor_from_ndarray!(i64, DLDataTypeCode_kDLInt);
+impl_tensor_from_ndarray!(u32, DLDataTypeCode_kDLUInt);
+impl_tensor_from_ndarray!(u64, DLDataTypeCode_kDLUInt);
diff --git a/rust/src/runtime/c_runtime_api.rs b/rust/src/runtime/c_runtime_api.rs
new file mode 100644
index 000000000000..62cfa0d15451
--- /dev/null
+++ b/rust/src/runtime/c_runtime_api.rs
@@ -0,0 +1,770 @@
+/* automatically generated by rust-bindgen for TVM revision 6292c78 */
+
+pub const TVM_VERSION: &'static [u8; 8usize] = b"0.5.dev\0";
+pub const DLPACK_VERSION: u32 = 8;
+pub const _STDINT_H: u32 = 1;
+pub const _FEATURES_H: u32 = 1;
+pub const _DEFAULT_SOURCE: u32 = 1;
+pub const __USE_ISOC11: u32 = 1;
+pub const __USE_ISOC99: u32 = 1;
+pub const __USE_ISOC95: u32 = 1;
+pub const __USE_POSIX_IMPLICITLY: u32 = 1;
+pub const _POSIX_SOURCE: u32 = 1;
+pub const _POSIX_C_SOURCE: u32 = 200809;
+pub const __USE_POSIX: u32 = 1;
+pub const __USE_POSIX2: u32 = 1;
+pub const __USE_POSIX199309: u32 = 1;
+pub const __USE_POSIX199506: u32 = 1;
+pub const __USE_XOPEN2K: u32 = 1;
+pub const __USE_XOPEN2K8: u32 = 1;
+pub const _ATFILE_SOURCE: u32 = 1;
+pub const __USE_MISC: u32 = 1;
+pub const __USE_ATFILE: u32 = 1;
+pub const __USE_FORTIFY_LEVEL: u32 = 0;
+pub const _STDC_PREDEF_H: u32 = 1;
+pub const __STDC_IEC_559__: u32 = 1;
+pub const __STDC_IEC_559_COMPLEX__: u32 = 1;
+pub const __STDC_ISO_10646__: u32 = 201505;
+pub const __STDC_NO_THREADS__: u32 = 1;
+pub const __GNU_LIBRARY__: u32 = 6;
+pub const __GLIBC__: u32 = 2;
+pub const __GLIBC_MINOR__: u32 = 23;
+pub const _SYS_CDEFS_H: u32 = 1;
+pub const __WORDSIZE: u32 = 64;
+pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1;
+pub const __SYSCALL_WORDSIZE: u32 = 64;
+pub const _BITS_WCHAR_H: u32 = 1;
+pub const INT8_MIN: i32 = -128;
+pub const INT16_MIN: i32 = -32768;
+pub const INT32_MIN: i32 = -2147483648;
+pub const INT8_MAX: u32 = 127;
+pub const INT16_MAX: u32 = 32767;
+pub const INT32_MAX: u32 = 2147483647;
+pub const UINT8_MAX: u32 = 255;
+pub const UINT16_MAX: u32 = 65535;
+pub const UINT32_MAX: u32 = 4294967295;
+pub const INT_LEAST8_MIN: i32 = -128;
+pub const INT_LEAST16_MIN: i32 = -32768;
+pub const INT_LEAST32_MIN: i32 = -2147483648;
+pub const INT_LEAST8_MAX: u32 = 127;
+pub const INT_LEAST16_MAX: u32 = 32767;
+pub const INT_LEAST32_MAX: u32 = 2147483647;
+pub const UINT_LEAST8_MAX: u32 = 255;
+pub const UINT_LEAST16_MAX: u32 = 65535;
+pub const UINT_LEAST32_MAX: u32 = 4294967295;
+pub const INT_FAST8_MIN: i32 = -128;
+pub const INT_FAST16_MIN: i64 = -9223372036854775808;
+pub const INT_FAST32_MIN: i64 = -9223372036854775808;
+pub const INT_FAST8_MAX: u32 = 127;
+pub const INT_FAST16_MAX: u64 = 9223372036854775807;
+pub const INT_FAST32_MAX: u64 = 9223372036854775807;
+pub const UINT_FAST8_MAX: u32 = 255;
+pub const UINT_FAST16_MAX: i32 = -1;
+pub const UINT_FAST32_MAX: i32 = -1;
+pub const INTPTR_MIN: i64 = -9223372036854775808;
+pub const INTPTR_MAX: u64 = 9223372036854775807;
+pub const UINTPTR_MAX: i32 = -1;
+pub const PTRDIFF_MIN: i64 = -9223372036854775808;
+pub const PTRDIFF_MAX: u64 = 9223372036854775807;
+pub const SIG_ATOMIC_MIN: i32 = -2147483648;
+pub const SIG_ATOMIC_MAX: u32 = 2147483647;
+pub const SIZE_MAX: i32 = -1;
+pub const WINT_MIN: u32 = 0;
+pub const WINT_MAX: u32 = 4294967295;
+pub type int_least8_t = ::std::os::raw::c_schar;
+pub type int_least16_t = ::std::os::raw::c_short;
+pub type int_least32_t = ::std::os::raw::c_int;
+pub type int_least64_t = ::std::os::raw::c_long;
+pub type uint_least8_t = ::std::os::raw::c_uchar;
+pub type uint_least16_t = ::std::os::raw::c_ushort;
+pub type uint_least32_t = ::std::os::raw::c_uint;
+pub type uint_least64_t = ::std::os::raw::c_ulong;
+pub type int_fast8_t = ::std::os::raw::c_schar;
+pub type int_fast16_t = ::std::os::raw::c_long;
+pub type int_fast32_t = ::std::os::raw::c_long;
+pub type int_fast64_t = ::std::os::raw::c_long;
+pub type uint_fast8_t = ::std::os::raw::c_uchar;
+pub type uint_fast16_t = ::std::os::raw::c_ulong;
+pub type uint_fast32_t = ::std::os::raw::c_ulong;
+pub type uint_fast64_t = ::std::os::raw::c_ulong;
+pub type intmax_t = ::std::os::raw::c_long;
+pub type uintmax_t = ::std::os::raw::c_ulong;
+pub type wchar_t = ::std::os::raw::c_int;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct max_align_t {
+  pub __clang_max_align_nonce1: ::std::os::raw::c_longlong,
+  pub __bindgen_padding_0: u64,
+  pub __clang_max_align_nonce2: f64,
+}
+pub const DLDeviceType_kDLCPU: DLDeviceType = 1;
+pub const DLDeviceType_kDLGPU: DLDeviceType = 2;
+pub const DLDeviceType_kDLCPUPinned: DLDeviceType = 3;
+pub const DLDeviceType_kDLOpenCL: DLDeviceType = 4;
+pub const DLDeviceType_kDLMetal: DLDeviceType = 8;
+pub const DLDeviceType_kDLVPI: DLDeviceType = 9;
+pub const DLDeviceType_kDLROCM: DLDeviceType = 10;
+/// \brief The device type in DLContext.
+pub type DLDeviceType = u32;
+/// \brief A Device context for Tensor and operator.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLContext {
+  /// \brief The device type used in the device.
+  pub device_type: DLDeviceType,
+  /// \brief The device index
+  pub device_id: ::std::os::raw::c_int,
+}
+pub const DLDataTypeCode_kDLInt: DLDataTypeCode = 0;
+pub const DLDataTypeCode_kDLUInt: DLDataTypeCode = 1;
+pub const DLDataTypeCode_kDLFloat: DLDataTypeCode = 2;
+/// \brief The type code options DLDataType.
+pub type DLDataTypeCode = u32;
+/// \brief The data type the tensor can hold.
+///
+/// Examples
+/// - float: type_code = 2, bits = 32, lanes=1
+/// - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+/// - int8: type_code = 0, bits = 8, lanes=1
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLDataType {
+  /// \brief Type code of base types.
+  /// We keep it uint8_t instead of DLDataTypeCode for minimal memory
+  /// footprint, but the value should be one of DLDataTypeCode enum values.
+  ///
+  pub code: u8,
+  /// \brief Number of bits, common choices are 8, 16, 32.
+  pub bits: u8,
+  /// \brief Number of lanes in the type, used for vector types.
+  pub lanes: u16,
+}
+/// \brief Plain C Tensor object, does not manage memory.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLTensor {
+  /// \brief The opaque data pointer points to the allocated data.
+  /// This will be CUDA device pointer or cl_mem handle in OpenCL.
+  /// This pointer is always aligns to 256 bytes as in CUDA.
+  pub data: *mut ::std::os::raw::c_void,
+  /// \brief The device context of the tensor
+  pub ctx: DLContext,
+  /// \brief Number of dimensions
+  pub ndim: ::std::os::raw::c_int,
+  /// \brief The data type of the pointer
+  pub dtype: DLDataType,
+  /// \brief The shape of the tensor
+  pub shape: *mut i64,
+  /// \brief strides of the tensor,
+  /// can be NULL, indicating tensor is compact.
+  pub strides: *mut i64,
+  /// \brief The offset in bytes to the beginning pointer to data
+  pub byte_offset: u64,
+}
+/// \brief C Tensor object, manage memory of DLTensor. This data structure is
+/// intended to faciliate the borrowing of DLTensor by another framework. It is
+/// not meant to transfer the tensor. When the borrowing framework doesn't need
+/// the tensor, it should call the deleter to notify the host that the resource
+/// is no longer needed.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLManagedTensor {
+  /// \brief DLTensor which is being memory managed
+  pub dl_tensor: DLTensor,
+  /// \brief the context of the original host framework of DLManagedTensor in
+  /// which DLManagedTensor is used in the framework. It can also be NULL.
+  pub manager_ctx: *mut ::std::os::raw::c_void,
+  /// \brief Destructor signature void (*)(void*) - this should be called
+  /// to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+  /// if there is no way for the caller to provide a reasonable destructor.
+  pub deleter: ::std::option::Option<unsafe extern "C" fn(self_: *mut DLManagedTensor)>,
+}
+/// \brief type of array index.
+pub type tvm_index_t = i64;
+pub const TVMDeviceExtType_kDLAOCL: TVMDeviceExtType = 5;
+pub const TVMDeviceExtType_kDLSDAccel: TVMDeviceExtType = 6;
+pub const TVMDeviceExtType_kDLVulkan: TVMDeviceExtType = 7;
+pub const TVMDeviceExtType_kOpenGL: TVMDeviceExtType = 11;
+pub const TVMDeviceExtType_kExtDev: TVMDeviceExtType = 12;
+/// \brief Extension device types in TVM
+pub type TVMDeviceExtType = u32;
+pub const TVMTypeCode_kHandle: TVMTypeCode = 3;
+pub const TVMTypeCode_kNull: TVMTypeCode = 4;
+pub const TVMTypeCode_kTVMType: TVMTypeCode = 5;
+pub const TVMTypeCode_kTVMContext: TVMTypeCode = 6;
+pub const TVMTypeCode_kArrayHandle: TVMTypeCode = 7;
+pub const TVMTypeCode_kNodeHandle: TVMTypeCode = 8;
+pub const TVMTypeCode_kModuleHandle: TVMTypeCode = 9;
+pub const TVMTypeCode_kFuncHandle: TVMTypeCode = 10;
+pub const TVMTypeCode_kStr: TVMTypeCode = 11;
+pub const TVMTypeCode_kBytes: TVMTypeCode = 12;
+pub const TVMTypeCode_kNDArrayContainer: TVMTypeCode = 13;
+pub const TVMTypeCode_kExtBegin: TVMTypeCode = 15;
+pub const TVMTypeCode_kNNVMFirst: TVMTypeCode = 16;
+pub const TVMTypeCode_kNNVMLast: TVMTypeCode = 20;
+pub const TVMTypeCode_kExtReserveEnd: TVMTypeCode = 64;
+pub const TVMTypeCode_kExtEnd: TVMTypeCode = 128;
+/// \brief The type code in TVMType
+/// \note TVMType is used in two places.
+pub type TVMTypeCode = u32;
+/// \brief The data type used in TVM Runtime.
+///
+/// Examples
+/// - float: type_code = 2, bits = 32, lanes=1
+/// - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+/// - int8: type_code = 0, bits = 8, lanes=1
+///
+/// \note Arguments TVM API function always takes bits=64 and lanes=1
+pub type TVMType = DLDataType;
+/// \brief The Device information, abstract away common device types.
+pub type TVMContext = DLContext;
+/// \brief The tensor array stucture to TVM API.
+pub type TVMArray = DLTensor;
+/// \brief the array handle
+pub type TVMArrayHandle = *mut TVMArray;
+/// \brief Union type of values
+/// being passed through API and function calls.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union TVMValue {
+  pub v_int64: i64,
+  pub v_float64: f64,
+  pub v_handle: *mut ::std::os::raw::c_void,
+  pub v_str: *const ::std::os::raw::c_char,
+  pub v_type: TVMType,
+  pub v_ctx: TVMContext,
+  _bindgen_union_align: u64,
+}
+/// \brief Byte array type used to pass in byte array
+/// When kBytes is used as data type.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TVMByteArray {
+  pub data: *const ::std::os::raw::c_char,
+  pub size: usize,
+}
+/// \brief Handle to TVM runtime modules.
+pub type TVMModuleHandle = *mut ::std::os::raw::c_void;
+/// \brief Handle to packed function handle.
+pub type TVMFunctionHandle = *mut ::std::os::raw::c_void;
+/// \brief Handle to hold return value.
+pub type TVMRetValueHandle = *mut ::std::os::raw::c_void;
+/// \brief The stream that is specific to device
+/// can be NULL, which indicates the default one.
+pub type TVMStreamHandle = *mut ::std::os::raw::c_void;
+extern "C" {
+  /// \brief Used for implementing C API function.
+  /// Set last error message before return.
+  /// \param msg The error message to be set.
+  pub fn TVMAPISetLastError(msg: *const ::std::os::raw::c_char);
+}
+extern "C" {
+  /// \brief return str message of the last error
+  /// all function in this file will return 0 when success
+  /// and -1 when an error occured,
+  /// TVMGetLastError can be called to retrieve the error
+  ///
+  /// this function is threadsafe and can be called by different thread
+  /// \return error info
+  pub fn TVMGetLastError() -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+  /// \brief Load module from file.
+  /// \param file_name The file name to load the module from.
+  /// \param format The format of the module.
+  /// \param out The result module
+  ///
+  /// \return 0 when success, -1 when failure happens
+  /// \note The resulting module do not contain import relation.
+  /// It can be reconstructed by TVMModImport.
+  pub fn TVMModLoadFromFile(
+    file_name: *const ::std::os::raw::c_char,
+    format: *const ::std::os::raw::c_char,
+    out: *mut TVMModuleHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Add dep to mod's dependency.
+  /// This allows functions in this module to use modules.
+  ///
+  /// \param mod The module handle.
+  /// \param dep The dependent module to be imported.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMModImport(mod_: TVMModuleHandle, dep: TVMModuleHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Get function from the module.
+  /// \param mod The module handle.
+  /// \param func_name The name of the function.
+  /// \param query_imports Whether to query imported modules
+  /// \param out The result function, can be NULL if it is not available.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMModGetFunction(
+    mod_: TVMModuleHandle,
+    func_name: *const ::std::os::raw::c_char,
+    query_imports: ::std::os::raw::c_int,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free front-end extension type resource.
+  /// \param handle The extension handle.
+  /// \param type_code The type of of the extension type.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMExtTypeFree(
+    handle: *mut ::std::os::raw::c_void,
+    type_code: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the Module
+  /// \param mod The module to be freed.
+  ///
+  /// \note This may not free up the module's resources.
+  /// If there is active TVMFunctionHandle uses the module
+  /// Or if this module is imported by another active module.
+  ///
+  /// The all functions remains valid until TVMFuncFree is called.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMModFree(mod_: TVMModuleHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the function when it is no longer needed.
+  /// \param func The function handle
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncFree(func: TVMFunctionHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Call a Packed TVM Function.
+  ///
+  /// \param func node handle of the function.
+  /// \param arg_values The arguments
+  /// \param type_codes The type codes of the arguments
+  /// \param num_args Number of arguments.
+  ///
+  /// \param ret_val The return value.
+  /// \param ret_type_code the type code of return value.
+  ///
+  /// \return 0 when success, -1 when failure happens
+  /// \note TVM calls always exchanges with type bits=64, lanes=1
+  ///
+  /// \note API calls always exchanges with type bits=64, lanes=1
+  /// If API call returns container handles (e.g. FunctionHandle)
+  /// these handles should be managed by the front-end.
+  /// The front-end need to call free function (e.g. TVMFuncFree)
+  /// to free these handles.
+  pub fn TVMFuncCall(
+    func: TVMFunctionHandle,
+    arg_values: *mut TVMValue,
+    type_codes: *mut ::std::os::raw::c_int,
+    num_args: ::std::os::raw::c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Set the return value of TVMPackedCFunc.
+  ///
+  /// This function is called by TVMPackedCFunc to set the return value.
+  /// When this function is not called, the function returns null by default.
+  ///
+  /// \param ret The return value handle, pass by ret in TVMPackedCFunc
+  /// \param value The value to be returned.
+  /// \param type_code The type of the value to be returned.
+  /// \param num_ret Number of return values, for now only 1 is supported.
+  pub fn TVMCFuncSetReturn(
+    ret: TVMRetValueHandle,
+    value: *mut TVMValue,
+    type_code: *mut ::std::os::raw::c_int,
+    num_ret: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Inplace translate callback argument value to return value.
+  /// This is only needed for non-POD arguments.
+  ///
+  /// \param value The value to be translated.
+  /// \param code The type code to be translated.
+  /// \note This function will do a shallow copy when necessary.
+  ///
+  /// \return 0 when success, -1 when failure happens.
+  pub fn TVMCbArgToReturn(
+    value: *mut TVMValue,
+    code: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+/// \brief C type of packed function.
+///
+/// \param args The arguments
+/// \param type_codes The type codes of the arguments
+/// \param num_args Number of arguments.
+/// \param ret The return value handle.
+/// \param resource_handle The handle additional resouce handle from fron-end.
+/// \return 0 if success, -1 if failure happens, set error via TVMAPISetLastError.
+/// \sa TVMCFuncSetReturn
+pub type TVMPackedCFunc = ::std::option::Option<
+  unsafe extern "C" fn(
+    args: *mut TVMValue,
+    type_codes: *mut ::std::os::raw::c_int,
+    num_args: ::std::os::raw::c_int,
+    ret: TVMRetValueHandle,
+    resource_handle: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int,
+>;
+/// \brief C callback to free the resource handle in C packed function.
+/// \param resource_handle The handle additional resouce handle from fron-end.
+pub type TVMPackedCFuncFinalizer =
+  ::std::option::Option<unsafe extern "C" fn(resource_handle: *mut ::std::os::raw::c_void)>;
+/// \brief Signature for extension function declarer.
+///
+/// TVM call this function to get the extension functions
+/// The declarer will call register_func to register function and their name.
+///
+/// \param register_func_handle The register function
+/// \return 0 if success, -1 if failure happens
+pub type TVMExtensionFuncDeclarer = ::std::option::Option<
+  unsafe extern "C" fn(register_func_handle: TVMFunctionHandle) -> ::std::os::raw::c_int,
+>;
+extern "C" {
+  /// \brief Wrap a TVMPackedCFunc to become a FunctionHandle.
+  ///
+  /// The resource_handle will be managed by TVM API, until the function is no longer used.
+  ///
+  /// \param func The packed C function.
+  /// \param resource_handle The resource handle from front-end, can be NULL.
+  /// \param fin The finalizer on resource handle when the FunctionHandle get freed, can be NULL
+  /// \param out the result function handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncCreateFromCFunc(
+    func: TVMPackedCFunc,
+    resource_handle: *mut ::std::os::raw::c_void,
+    fin: TVMPackedCFuncFinalizer,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Register the function to runtime's global table.
+  ///
+  /// The registered function then can be pulled by the backend by the name.
+  ///
+  /// \param name The name of the function.
+  /// \param f The function to be registered.
+  /// \param override Whether allow override already registered function.
+  pub fn TVMFuncRegisterGlobal(
+    name: *const ::std::os::raw::c_char,
+    f: TVMFunctionHandle,
+    override_: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Get a global function.
+  ///
+  /// \param name The name of the function.
+  /// \param out the result function pointer, NULL if it does not exist.
+  ///
+  /// \note The function handle of global function is managed by TVM runtime,
+  /// So TVMFuncFree is should not be called when it get deleted.
+  pub fn TVMFuncGetGlobal(
+    name: *const ::std::os::raw::c_char,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief List all the globally registered function name
+  /// \param out_size The number of functions
+  /// \param out_array The array of function names.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncListGlobalNames(
+    out_size: *mut ::std::os::raw::c_int,
+    out_array: *mut *mut *const ::std::os::raw::c_char,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Allocate a nd-array's memory,
+  /// including space of shape, of given spec.
+  ///
+  /// \param shape The shape of the array, the data content will be copied to out
+  /// \param ndim The number of dimension of the array.
+  /// \param dtype_code The type code of the dtype
+  /// \param dtype_bits The number of bits of dtype
+  /// \param dtype_lanes The number of lanes in the dtype.
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param out The output handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayAlloc(
+    shape: *const tvm_index_t,
+    ndim: ::std::os::raw::c_int,
+    dtype_code: ::std::os::raw::c_int,
+    dtype_bits: ::std::os::raw::c_int,
+    dtype_lanes: ::std::os::raw::c_int,
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    out: *mut TVMArrayHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the TVM Array.
+  /// \param handle The array handle to be freed.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayFree(handle: TVMArrayHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy array data from CPU byte array.
+  /// \param handle The array handle.
+  /// \param data the data pointer
+  /// \param nbytes The number of bytes to copy.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyFromBytes(
+    handle: TVMArrayHandle,
+    data: *mut ::std::os::raw::c_void,
+    nbytes: usize,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy array data to CPU byte array.
+  /// \param handle The array handle.
+  /// \param data the data pointer
+  /// \param nbytes The number of bytes to copy.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyToBytes(
+    handle: TVMArrayHandle,
+    data: *mut ::std::os::raw::c_void,
+    nbytes: usize,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy the array, both from and to must be valid during the copy.
+  /// \param from The array to be copied from.
+  /// \param to The target space.
+  /// \param stream The stream where the copy happens, can be NULL.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyFromTo(
+    from: TVMArrayHandle,
+    to: TVMArrayHandle,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Produce an array from the DLManagedTensor that shares data memory
+  /// with the DLManagedTensor.
+  /// \param from The source DLManagedTensor.
+  /// \param out The output array handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayFromDLPack(
+    from: *mut DLManagedTensor,
+    out: *mut TVMArrayHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Produce a DLMangedTensor from the array that shares data memory with
+  /// the array.
+  /// \param from The source array.
+  /// \param out The DLManagedTensor handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayToDLPack(
+    from: TVMArrayHandle,
+    out: *mut *mut DLManagedTensor,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Delete (free) a DLManagedTensor's data.
+  /// \param dltensor Pointer to the DLManagedTensor.
+  pub fn TVMDLManagedTensorCallDeleter(dltensor: *mut DLManagedTensor);
+}
+extern "C" {
+  /// \brief Create a new runtime stream.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param out The new stream handle
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamCreate(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    out: *mut TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free a created stream handle.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param stream The stream to be freed
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamFree(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Set the runtime stream of current thread to be stream.
+  /// The subsequent calls to the same device_type
+  /// will use the setted stream handle.
+  /// The specific type of stream is runtime device dependent.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param handle The stream handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMSetStream(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    handle: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Wait until all computations on stream completes.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param stream The stream to be synchronized.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMSynchronize(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Synchronize two streams of execution.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param src The source stream to synchronize.
+  /// \param dst The destination stream to synchronize.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamStreamSynchronize(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    src: TVMStreamHandle,
+    dst: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function for modules to get function
+  /// from its environment mod_node (its imports and global function).
+  /// The user do should not call TVMFuncFree on func.
+  ///
+  /// \param mod_node The module handle.
+  /// \param func_name The name of the function.
+  /// \param out The result function.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendGetFuncFromEnv(
+    mod_node: *mut ::std::os::raw::c_void,
+    func_name: *const ::std::os::raw::c_char,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function to register system-wide library symbol.
+  ///
+  /// \param name The name of the symbol
+  /// \param ptr The symbol address.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendRegisterSystemLibSymbol(
+    name: *const ::std::os::raw::c_char,
+    ptr: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function to allocate temporal workspace.
+  ///
+  /// \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment.
+  ///
+  /// \param nbytes The size of the space requested.
+  /// \param device_type The device type which the space will be allocated.
+  /// \param device_id The device id which the space will be allocated.
+  /// \param dtype_code_hint The type code of the array elements. Only used in
+  /// certain backends such as OpenGL.
+  /// \param dtype_bits_hint The type bits of the array elements. Only used in
+  /// certain backends such as OpenGL.
+  /// \return nullptr when error is thrown, a valid ptr if success
+  pub fn TVMBackendAllocWorkspace(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    nbytes: u64,
+    dtype_code_hint: ::std::os::raw::c_int,
+    dtype_bits_hint: ::std::os::raw::c_int,
+  ) -> *mut ::std::os::raw::c_void;
+}
+extern "C" {
+  /// \brief Backend function to free temporal workspace.
+  ///
+  /// \param ptr The result allocated space pointer.
+  /// \param device_type The device type which the space will be allocated.
+  /// \param device_id The device id which the space will be allocated.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  ///
+  /// \sa TVMBackendAllocWorkspace
+  pub fn TVMBackendFreeWorkspace(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    ptr: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int;
+}
+/// \brief Environment for TVM parallel task.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TVMParallelGroupEnv {
+  /// \brief Auxiliary used for synchronization
+  pub sync_handle: *mut ::std::os::raw::c_void,
+  /// \brief total amount of task
+  pub num_task: i32,
+}
+/// \brief The callback function to execute a parallel lambda
+/// \param task_id the task id of the function.
+/// \param penv The parallel environment backs the execution.
+/// \param cdata The supporting closure data.
+pub type FTVMParallelLambda = ::std::option::Option<
+  unsafe extern "C" fn(
+    task_id: ::std::os::raw::c_int,
+    penv: *mut TVMParallelGroupEnv,
+    cdata: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int,
+>;
+extern "C" {
+  /// \brief Backend function for running parallel jobs.
+  ///
+  /// \param flambda The parallel function to be launched.
+  /// \param cdata The closure data.
+  /// \param num_task Number of tasks to launch, can be 0, means launch
+  /// with all available threads.
+  ///
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendParallelLaunch(
+    flambda: FTVMParallelLambda,
+    cdata: *mut ::std::os::raw::c_void,
+    num_task: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief BSP barrrier between parallel threads
+  /// \param task_id the task id of the function.
+  /// \param penv The parallel environment backs the execution.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendParallelBarrier(
+    task_id: ::std::os::raw::c_int,
+    penv: *mut TVMParallelGroupEnv,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Simple static initialization fucntion.
+  /// Run f once and set handle to be not null.
+  /// This function is mainly used for test purpose.
+  ///
+  /// \param handle An global address to indicate f
+  /// \param f The function to be ran
+  /// \param cdata The closure data to pass to the function.
+  /// \param nbytes Number of bytes in the closure data.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendRunOnce(
+    handle: *mut *mut ::std::os::raw::c_void,
+    f: ::std::option::Option<
+      unsafe extern "C" fn(arg1: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int,
+    >,
+    cdata: *mut ::std::os::raw::c_void,
+    nbytes: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
diff --git a/rust/src/runtime/graph.rs b/rust/src/runtime/graph.rs
new file mode 100644
index 000000000000..6c53aeb9f6e9
--- /dev/null
+++ b/rust/src/runtime/graph.rs
@@ -0,0 +1,466 @@
+use std::{cmp, collections::HashMap, convert::TryFrom, iter::FromIterator, mem, str};
+
+use nom::{alpha1, digit1, le_i32, le_i64, le_u16, le_u32, le_u64, le_u8, types::CompleteStr};
+use serde;
+use serde_json;
+
+use super::{DataType, Module, Storage, TVMArgValue, TVMContext, Tensor};
+use errors::{Error, ErrorKind, Result};
+use ffi::runtime::{
+  DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt, DLTensor,
+};
+
+// Magic number for NDArray file. @see `kTVMNDArrayMagic` in `ndarray.h`
+const _NDARRAY_MAGIC: u64 = 0xDD5E40F096B4A13F;
+// Magic number for NDArray list file. @see `kTVMNDArrayListMagic` in `graph_runtime.h`
+const _NDARRAY_LIST_MAGIC: u64 = 0xF7E58D4F05049CB7;
+
+/// A TVM computation graph.
+///
+/// # Examples
+///
+/// ```
+/// let graph_json = fs::read_to_string("graph.json")).unwrap();
+/// let graph = Graph::try_from(&graph_json).unwrap();
+/// ```
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Graph {
+  pub nodes: Vec<Node>,
+  pub arg_nodes: Vec<usize>,
+  pub heads: Vec<Entry>,
+  pub node_row_ptr: Option<Vec<usize>>,
+  pub attrs: Option<HashMap<String, serde_json::Value>>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Entry {
+  pub id: usize,
+  pub index: usize,
+  pub version: usize,
+}
+
+impl Graph {
+  fn entry_index(&self, entry: &Entry) -> Result<usize> {
+    self
+      .node_row_ptr
+      .as_ref()
+      .map(|nrp| nrp[entry.id] + entry.index)
+      .ok_or("Missing node_row_ptr.".into())
+  }
+
+  /// Attempt to deserialize a JSON attribute to a type `T`.
+  fn get_attr<T: serde::de::DeserializeOwned>(&self, attr: &str) -> Result<T> {
+    Ok(serde_json::from_value::<T>(
+      self
+        .attrs
+        .as_ref()
+        .ok_or(ErrorKind::GraphFormatError(
+          "Missing graph attrs".to_string(),
+        ))?.get(attr)
+        .ok_or(ErrorKind::GraphFormatError(format!(
+          "Missing {} attr",
+          attr
+        )))?.to_owned(),
+    )?)
+  }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Node {
+  pub op: String,
+  pub name: String,
+  pub inputs: Vec<Entry>,
+  pub attrs: Option<HashMap<String, String>>,
+  pub control_deps: Option<Vec<Entry>>,
+}
+
+struct NodeAttrs {
+  func_name: String,
+  num_outputs: usize,
+  flatten_data: bool,
+}
+
+impl Node {
+  fn parse_attrs(&self) -> Result<NodeAttrs> {
+    let attrs = self
+      .attrs
+      .as_ref()
+      .ok_or(format!("Missing node.attrs for `{}`", self.name))?;
+    let func_name = attrs
+      .get("func_name")
+      .ok_or(format!("Node `{}` is missing attrs.func_name", self.name))?
+      .to_string();
+    let num_outputs = attrs
+      .get("num_outputs")
+      .ok_or(format!("Node `{}` is missing attrs.num_outputs", self.name))?
+      .parse::<usize>()?;
+    let flatten_data = attrs
+      .get("flatten_data")
+      .ok_or(format!(
+        "Node `{}` is missing attrs.flatten_data",
+        self.name
+      ))?.parse::<u8>()?
+      == 1;
+    Ok(NodeAttrs {
+      func_name,
+      num_outputs,
+      flatten_data,
+    })
+  }
+}
+
+impl<'a> TryFrom<&'a String> for Graph {
+  type Error = Error;
+  fn try_from(graph_json: &String) -> Result<Self> {
+    let graph = serde_json::from_str(graph_json)?;
+    Ok(graph)
+  }
+}
+
+impl<'a> TryFrom<&'a str> for Graph {
+  type Error = Error;
+  fn try_from(graph_json: &'a str) -> Result<Self> {
+    let graph = serde_json::from_str(graph_json)?;
+    Ok(graph)
+  }
+}
+
+/// A executor for a TVM computation graph.
+///
+/// # Examples
+///
+/// ```
+/// use ndarray::Array;
+///
+/// let syslib = SystemLibModule::default(); // a provider of TVM functions
+///
+/// let mut params_bytes = Vec::new();
+/// fs::File::open("graph.params").unwrap().read_to_end(&mut params_bytes).unwrap();
+/// let params = tvm::runtime::load_param_dict(&params_bytes).unwrap();
+///
+/// let graph = Graph::try_from(&fs::read_to_string("graph.json").unwrap()).unwrap();
+///
+/// let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
+/// exec.load_params(params);
+///
+/// let x = Array::from_vec(vec![1f32, 2., 3., 4.]);
+/// exec.set_input("data", x.into());
+/// exec.run();
+/// let output = exec.get_output(0).unwrap();
+///
+/// println!("{:#?}", Array::try_from(output).unwrap());
+/// ```
+pub struct GraphExecutor<'m, 't> {
+  graph: Graph,
+  op_execs: Vec<Box<Fn() + 'm>>,
+  tensors: Vec<Tensor<'t>>,
+}
+
+unsafe impl<'m, 't> Send for GraphExecutor<'m, 't> {}
+
+impl<'m, 't> GraphExecutor<'m, 't> {
+  pub fn new<M: 'm + Module>(graph: Graph, lib: &'m M) -> Result<Self> {
+    let tensors = Self::setup_storages(&graph)?;
+    Ok(GraphExecutor {
+      op_execs: Self::setup_op_execs(&graph, lib, &tensors)?,
+      tensors: tensors,
+      graph: graph,
+    })
+  }
+
+  /// Runs the computation graph.
+  pub fn run(&self) {
+    self.op_execs.iter().for_each(|op_exec| {
+      op_exec();
+    });
+  }
+
+  /// Allocates `Storages` for each `storage_id` and returns `Tensor`s to hold each output.
+  fn setup_storages<'a>(graph: &'a Graph) -> Result<Vec<Tensor<'t>>> {
+    let storage_ids = graph.get_attr::<(String, Vec<usize>)>("storage_id")?.1;
+    let shapes = graph.get_attr::<(String, Vec<Vec<i64>>)>("shape")?.1;
+    let dtypes = graph
+      .get_attr::<(String, Vec<String>)>("dltype")?
+      .1
+      .iter()
+      .map(|dltype| {
+        if let Ok((_, dtype)) = tvm_str_to_type(CompleteStr(dltype)) {
+          Ok(dtype)
+        } else {
+          Err(ErrorKind::GraphFormatError(format!("Invalid dltype: {}", dltype).to_string()).into())
+        }
+      }).collect::<Result<Vec<DataType>>>()?;
+
+    let align = dtypes.iter().map(|dtype| dtype.bits as usize).max();
+    let mut storage_num_bytes = vec![0usize; *storage_ids.iter().max().unwrap_or(&1) + 1];
+    for (i, &storage_id) in storage_ids.iter().enumerate() {
+      let dtype_size = dtypes[i].bits * dtypes[i].lanes >> 3;
+      let nbytes = dtype_size * shapes[i].iter().product::<i64>() as usize;
+      storage_num_bytes[storage_id] = cmp::max(nbytes, storage_num_bytes[storage_id]);
+    }
+
+    let mut storages: Vec<Storage> = storage_num_bytes
+      .into_iter()
+      .map(|nbytes| Storage::new(nbytes, align))
+      .collect::<Result<Vec<Storage>>>()?;
+
+    let tensors = izip!(storage_ids, shapes, dtypes)
+      .map(|(storage_id, shape, dtype)| {
+        let storage = storages[storage_id].view();
+        Tensor {
+          data: mem::replace(&mut storages[storage_id], storage),
+          ctx: TVMContext::default(),
+          dtype: dtype,
+          size: shape.iter().product::<i64>() as usize,
+          shape: shape,
+          strides: None,
+          byte_offset: 0,
+        }
+      }).collect();
+
+    Ok(tensors)
+  }
+
+  /// Creates closures which represent the computation performed by this graph.
+  fn setup_op_execs<M: 'm + Module>(
+    graph: &Graph,
+    lib: &'m M,
+    tensors: &Vec<Tensor<'t>>,
+  ) -> Result<Vec<Box<Fn() + 'm>>> {
+    ensure!(graph.node_row_ptr.is_some(), "Missing node_row_ptr.");
+    let node_row_ptr = graph.node_row_ptr.as_ref().unwrap();
+
+    let mut op_execs = Vec::new();
+    for (i, node) in graph.nodes.iter().enumerate() {
+      if node.op == "null" {
+        continue;
+      }
+      ensure!(node.op == "tvm_op", "Only TVM ops are supported.");
+      ensure!(node.attrs.is_some(), "Missing node attrs.");
+
+      let attrs = node.parse_attrs()?;
+
+      if attrs.func_name == "__nop" {
+        continue;
+      }
+
+      let func = lib
+        .get_function(&attrs.func_name)
+        .ok_or(format!("Missing function {}", attrs.func_name))?;
+      let arg_indices = node
+        .inputs
+        .iter()
+        .map(|entry| graph.entry_index(entry))
+        .chain((0..attrs.num_outputs).map(|oi| Ok(node_row_ptr[i].clone() + oi)));
+
+      let dl_tensors = arg_indices
+        .map(|idx| {
+          let tensor = &tensors[idx?];
+          Ok(if attrs.flatten_data {
+            DLTensor::from_tensor(tensor, true /* flatten */)
+          } else {
+            DLTensor::from(tensor)
+          })
+        }).collect::<Result<Vec<DLTensor>>>()
+        .unwrap();
+      let op: Box<Fn()> = box move || {
+        let args = dl_tensors
+          .iter()
+          .map(|t| t.into())
+          .collect::<Vec<TVMArgValue>>();
+        func(args.as_slice());
+      };
+      op_execs.push(op);
+    }
+    Ok(op_execs)
+  }
+
+  pub fn load_params(&mut self, params: HashMap<String, Tensor<'t>>) {
+    params.into_iter().for_each(|(name, param)| {
+      self.set_input(name, param);
+    })
+  }
+
+  pub fn set_input<S: AsRef<str>>(&mut self, name: S, value: Tensor<'t>) {
+    if let Some(idx) = self.get_input_index(name.as_ref()) {
+      // TODO: consider `new_with_params` to avoid ever allocating
+      let ptr = self.tensors[idx].data.as_ptr();
+      let mut to_replace = self.tensors.iter_mut().filter(|t| t.data.as_ptr() == ptr);
+      let mut owner = to_replace.nth(0).unwrap();
+      if value.data.is_owned() {
+        // FIXME: for no-copy, need setup_op_execs to not capture tensor ptr
+        // mem::replace(&mut (*owner), value);
+        // to_replace.for_each(|t| {
+        //   panic!("replacing");
+        //   t.data = owner.data.view();
+        // });
+        owner.copy(&value);
+      } else {
+        owner.copy(&value);
+      }
+    } else {
+      println!("Unexpected input `{}`", name.as_ref());
+    }
+  }
+
+  /// Returns the graph input with name `name`, if it exists.
+  pub fn get_input<S: AsRef<str>>(&mut self, name: S) -> Option<&Tensor> {
+    self
+      .get_input_index(name.as_ref())
+      .and_then(move |idx| Some(&self.tensors[idx]))
+  }
+
+  /// Returns the graph output with index `index`, if it exists.
+  pub fn get_output(&self, idx: usize) -> Option<&Tensor> {
+    let graph = &self.graph;
+    graph.heads.get(idx).and_then(|entry| {
+      graph
+        .entry_index(entry)
+        .map(|idx| self.tensors.get(idx))
+        .unwrap_or(None)
+    })
+  }
+
+  /// Returns the index for graph input with name `name`, if it exists.
+  pub fn get_input_index<S: AsRef<str>>(&self, name: S) -> Option<usize> {
+    let graph = &self.graph;
+    (0..graph.nodes.len())
+      .skip_while(|&i| graph.nodes[i].name != name.as_ref())
+      .nth(0)
+      .and_then(|i| {
+        if graph.arg_nodes.iter().any(|&id| id == i) {
+          graph.node_row_ptr.as_ref().map(|nrp| nrp[i])
+        } else {
+          None
+        }
+      })
+  }
+}
+
+/// Converts a string to TVM DLDataTypeCode. @see `String2TVMType` in packed_func.h
+named!(
+  tvm_str_to_type<CompleteStr, DataType>,
+  do_parse!(
+    type_name: alpha1 >>
+    bits: digit1 >>
+    lanes: opt!(tuple!(tag!("x"), digit1)) >>
+    (DataType {
+      code: match type_name {
+        CompleteStr("int") => DLDataTypeCode_kDLInt,
+        CompleteStr("uint") => DLDataTypeCode_kDLUInt,
+        CompleteStr("float") => DLDataTypeCode_kDLFloat,
+        _ => DLDataTypeCode_kDLFloat,
+      } as usize,
+      bits: bits.parse::<u8>().unwrap() as usize,
+      lanes: match lanes {
+        Some(lanes) => lanes.1.parse::<u16>().unwrap() as usize,
+        None => 1,
+      },
+    })
+  )
+);
+
+/// Converts a bytes to String.
+named!(
+  name<String>,
+  map_res!(length_bytes!(le_u64), |b: &[u8]| String::from_utf8(
+    b.to_vec()
+  ))
+);
+
+/// Parses a TVMContext
+named!(
+  tvm_ctx<&[u8], TVMContext>,
+  do_parse!(
+    device_type: le_u32 >>
+    device_id: le_i32 >>
+    (TVMContext { device_type: device_type as usize, device_id: device_id as usize })
+  )
+);
+
+/// Parses a DataType
+named!(
+  data_type<&[u8], DataType>,
+  do_parse!(
+    code: le_u8 >>
+    bits: le_u8 >>
+    lanes: le_u16 >>
+    (DataType { code: code as usize, bits: bits as usize, lanes: lanes as usize })
+  )
+);
+
+/// Parses a Tensor from a TVM array file.
+named!(
+  tensor<Tensor>,
+  do_parse!(
+    take!(8)
+      >> bits!(tag_bits!(u64, 64, 0))
+      >> ctx: tvm_ctx
+      >> ndim: le_u32
+      >> dtype: data_type
+      >> shape: count!(map!(le_i64, |sz| sz as i64), ndim as usize)
+      >> length: le_i64
+      >> data: take!(length)
+      >> (Tensor {
+        data: Storage::from(data),
+        ctx: ctx,
+        dtype: dtype,
+        size: shape.iter().product::<i64>() as usize,
+        shape: shape,
+        strides: None,
+        byte_offset: 0,
+      })
+  )
+);
+
+/// Parses a graph params dict from a params binary file.
+named!(
+  parse_param_dict<HashMap<String, Tensor>>,
+  do_parse!(
+    take!(8)
+      >> bits!(tag_bits!(u64, 64, 0))
+      >> names: length_count!(le_u64, name)
+      >> tensors: length_count!(le_u64, tensor)
+      >> (HashMap::from_iter(names.into_iter().zip(tensors.into_iter())))
+  )
+);
+
+/// Loads a param dict saved using `nnvm.compiler.save_param_dict`.
+pub fn load_param_dict(bytes: &[u8]) -> Result<HashMap<String, Tensor>> {
+  if let Ok((remaining_bytes, param_dict)) = parse_param_dict(bytes) {
+    if remaining_bytes.len() > 0 {
+      bail!(ErrorKind::LoadGraphParamsError("extra input".to_string()))
+    } else {
+      Ok(param_dict)
+    }
+  } else {
+    bail!(ErrorKind::LoadGraphParamsError(
+      "invalid parameters file".to_string()
+    ))
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn test_str_to_type() {
+    assert_eq!(
+      tvm_str_to_type(CompleteStr("float24")).unwrap().1,
+      DataType {
+        code: DLDataTypeCode_kDLFloat as usize,
+        bits: 24,
+        lanes: 1
+      }
+    );
+    assert_eq!(
+      tvm_str_to_type(CompleteStr("uint111x44")).unwrap().1,
+      DataType {
+        code: DLDataTypeCode_kDLUInt as usize,
+        bits: 111,
+        lanes: 44
+      }
+    );
+  }
+}
diff --git a/rust/src/runtime/mod.rs b/rust/src/runtime/mod.rs
new file mode 100644
index 000000000000..bdf7094113d8
--- /dev/null
+++ b/rust/src/runtime/mod.rs
@@ -0,0 +1,25 @@
+mod allocator;
+mod array;
+mod module;
+#[macro_use]
+mod packed_func;
+mod graph;
+#[cfg(target_env = "sgx")]
+#[macro_use]
+pub mod sgx;
+mod threading;
+mod workspace;
+
+use std::os::raw::c_char;
+
+pub use self::{array::*, graph::*, module::*, packed_func::*, threading::*, workspace::*};
+
+#[no_mangle]
+pub extern "C" fn TVMAPISetLastError(cmsg: *const c_char) {
+  #[cfg(not(target_env = "sgx"))]
+  unsafe {
+    panic!(std::ffi::CStr::from_ptr(cmsg).to_str().unwrap());
+  }
+  #[cfg(target_env = "sgx")]
+  ocall_packed!("__sgx_set_last_error__", cmsg);
+}
diff --git a/rust/src/runtime/module.rs b/rust/src/runtime/module.rs
new file mode 100644
index 000000000000..2594756d9885
--- /dev/null
+++ b/rust/src/runtime/module.rs
@@ -0,0 +1,46 @@
+use std::{
+  collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::Mutex,
+};
+
+use ffi::runtime::BackendPackedCFunc;
+use runtime::packed_func::{wrap_backend_packed_func, PackedFunc};
+
+pub trait Module {
+  fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc>;
+}
+
+pub struct SystemLibModule;
+
+lazy_static! {
+  static ref SYSTEM_LIB_FUNCTIONS: Mutex<HashMap<String, BackendPackedCFunc>> =
+    Mutex::new(HashMap::new());
+}
+
+impl Module for SystemLibModule {
+  fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc> {
+    SYSTEM_LIB_FUNCTIONS
+      .lock()
+      .unwrap()
+      .get(name.as_ref())
+      .map(|func| wrap_backend_packed_func(func.to_owned()))
+  }
+}
+
+impl Default for SystemLibModule {
+  fn default() -> Self {
+    SystemLibModule {}
+  }
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendRegisterSystemLibSymbol(
+  cname: *const c_char,
+  func: BackendPackedCFunc,
+) -> i32 {
+  let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() };
+  SYSTEM_LIB_FUNCTIONS
+    .lock()
+    .unwrap()
+    .insert(name.to_string(), func);
+  return 0;
+}
diff --git a/rust/src/runtime/packed_func.rs b/rust/src/runtime/packed_func.rs
new file mode 100644
index 000000000000..030d677329c0
--- /dev/null
+++ b/rust/src/runtime/packed_func.rs
@@ -0,0 +1,286 @@
+use std::{any::Any, convert::TryFrom, marker::PhantomData, os::raw::c_void};
+
+use ffi::runtime::{
+  BackendPackedCFunc, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLTensor,
+  TVMTypeCode_kArrayHandle, TVMTypeCode_kHandle, TVMValue,
+};
+
+use errors::*;
+
+pub type PackedFunc = Box<Fn(&[TVMArgValue]) -> TVMRetValue + Send + Sync>;
+
+/// Calls a packed function and returns a `TVMRetValue`.
+///
+/// # Example
+///
+/// `call_packed!(my_tvm_func, &mut arg1, &mut arg2)`
+#[macro_export]
+macro_rules! call_packed {
+  ($fn:expr, $($args:expr),+) => {
+    $fn(&[$($args.into(),)+])
+  };
+  ($fn:expr) => {
+    $fn(&Vec::new())
+  };
+}
+
+/// A borrowed TVMPODValue. Can be constructed using `into()` but the preferred way
+/// to obtain a `TVMArgValue` is automatically via `call_packed!`.
+#[derive(Clone, Copy)]
+pub struct TVMArgValue<'a> {
+  _lifetime: PhantomData<&'a ()>,
+  pub(crate) value: TVMValue,
+  pub(crate) type_code: i64,
+}
+
+impl<'a> TVMArgValue<'a> {
+  pub fn new(value: TVMValue, type_code: i64) -> Self {
+    TVMArgValue {
+      _lifetime: PhantomData,
+      value: value,
+      type_code: type_code,
+    }
+  }
+}
+
+/// Creates a conversion to a `TVMArgValue` for a primitive type and DLDataTypeCode.
+macro_rules! impl_prim_tvm_arg {
+  ($type:ty, $field:ident, $code:expr, $as:ty) => {
+    impl<'a> From<$type> for TVMArgValue<'a> {
+      fn from(val: $type) -> Self {
+        TVMArgValue {
+          value: TVMValue { $field: val as $as },
+          type_code: $code as i64,
+          _lifetime: PhantomData,
+        }
+      }
+    }
+  };
+  ($type:ty, $field:ident, $code:expr) => {
+    impl_prim_tvm_arg!($type, $field, $code, $type);
+  };
+  ($type:ty,v_int64) => {
+    impl_prim_tvm_arg!($type, v_int64, DLDataTypeCode_kDLInt, i64);
+  };
+  ($type:ty,v_float64) => {
+    impl_prim_tvm_arg!($type, v_float64, DLDataTypeCode_kDLFloat, f64);
+  };
+}
+
+impl_prim_tvm_arg!(f32, v_float64);
+impl_prim_tvm_arg!(f64, v_float64);
+impl_prim_tvm_arg!(i8, v_int64);
+impl_prim_tvm_arg!(u8, v_int64);
+impl_prim_tvm_arg!(i32, v_int64);
+impl_prim_tvm_arg!(u32, v_int64);
+impl_prim_tvm_arg!(i64, v_int64);
+impl_prim_tvm_arg!(u64, v_int64);
+impl_prim_tvm_arg!(bool, v_int64);
+
+/// Creates a conversion to a `TVMArgValue` for an object handle.
+impl<'a, T> From<*const T> for TVMArgValue<'a> {
+  fn from(ptr: *const T) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: ptr as *mut T as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+/// Creates a conversion to a `TVMArgValue` for a mutable object handle.
+impl<'a, T> From<*mut T> for TVMArgValue<'a> {
+  fn from(ptr: *mut T) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: ptr as *mut c_void,
+      },
+      type_code: TVMTypeCode_kHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+impl<'a> From<&'a mut DLTensor> for TVMArgValue<'a> {
+  fn from(arr: &'a mut DLTensor) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: arr as *mut _ as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+impl<'a> From<&'a DLTensor> for TVMArgValue<'a> {
+  fn from(arr: &'a DLTensor) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: arr as *const _ as *mut DLTensor as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+/// An owned TVMPODValue. Can be converted from a variety of primitive and object types.
+/// Can be downcasted using `try_from` if it contains the desired type.
+///
+/// # Example
+///
+/// ```
+/// let a = 42u32;
+/// let b: i64 = TVMRetValue::from(a).try_into().unwrap();
+///
+/// let s = "hello, world!";
+/// let t: TVMRetValue = s.into();
+/// assert_eq!(String::try_from(t).unwrap(), s);
+/// ```
+pub struct TVMRetValue {
+  /// A primitive return value, if any.
+  prim_value: u64,
+  /// An object return value, if any.
+  box_value: Box<Any>,
+  /// The DLDataTypeCode which determines whether `prim_value` or `box_value` is in use.
+  type_code: i64,
+}
+
+#[cfg(target_env = "sgx")]
+impl TVMRetValue {
+  pub(crate) fn from_tvm_value(value: TVMValue, type_code: i64) -> Self {
+    unsafe {
+      Self {
+        prim_value: match type_code {
+          0 | 1 => value.v_int64 as u64,
+          2 => value.v_float64 as u64,
+          3 | 7 | 8 | 9 | 10 => value.v_handle as u64,
+          11 | 12 => value.v_str as u64,
+          _ => 0,
+        } as u64,
+        box_value: box (),
+        type_code: type_code,
+      }
+    }
+  }
+
+  pub fn into_tvm_value(self) -> (TVMValue, i64) {
+    let val = match self.type_code {
+      0 | 1 => TVMValue {
+        v_int64: self.prim_value.clone() as i64,
+      },
+      2 => TVMValue {
+        v_float64: self.prim_value.clone() as f64,
+      },
+      3 | 7 | 8 | 9 | 10 => TVMValue {
+        v_handle: Box::into_raw(self.box_value) as *mut c_void,
+      },
+      11 | 12 => TVMValue {
+        v_str: Box::into_raw(self.box_value) as *const _,
+      },
+      _ => unreachable!(),
+    };
+    (val, self.type_code)
+  }
+}
+
+impl Default for TVMRetValue {
+  fn default() -> Self {
+    TVMRetValue {
+      prim_value: 0,
+      box_value: box (),
+      type_code: 0,
+    }
+  }
+}
+
+macro_rules! impl_prim_ret_value {
+  ($type:ty, $code:expr) => {
+    impl From<$type> for TVMRetValue {
+      fn from(val: $type) -> Self {
+        TVMRetValue {
+          prim_value: val as u64,
+          box_value: box (),
+          type_code: $code,
+        }
+      }
+    }
+    impl TryFrom<TVMRetValue> for $type {
+      type Error = Error;
+      fn try_from(ret: TVMRetValue) -> Result<$type> {
+        if ret.type_code == $code {
+          Ok(ret.prim_value as $type)
+        } else {
+          bail!(ErrorKind::TryFromTVMRetValueError(
+            stringify!($type).to_string(),
+            ret.type_code
+          ))
+        }
+      }
+    }
+  };
+}
+
+macro_rules! impl_boxed_ret_value {
+  ($type:ty, $code:expr) => {
+    impl From<$type> for TVMRetValue {
+      fn from(val: $type) -> Self {
+        TVMRetValue {
+          prim_value: 0,
+          box_value: box val,
+          type_code: $code,
+        }
+      }
+    }
+    impl TryFrom<TVMRetValue> for $type {
+      type Error = Error;
+      fn try_from(ret: TVMRetValue) -> Result<$type> {
+        if let Ok(val) = ret.box_value.downcast::<$type>() {
+          Ok(*val)
+        } else {
+          bail!(ErrorKind::TryFromTVMRetValueError(
+            stringify!($type).to_string(),
+            ret.type_code
+          ))
+        }
+      }
+    }
+  };
+}
+
+impl_prim_ret_value!(i8, 0);
+impl_prim_ret_value!(u8, 1);
+impl_prim_ret_value!(i16, 0);
+impl_prim_ret_value!(u16, 1);
+impl_prim_ret_value!(i32, 0);
+impl_prim_ret_value!(u32, 1);
+impl_prim_ret_value!(f32, 2);
+impl_prim_ret_value!(i64, 0);
+impl_prim_ret_value!(u64, 1);
+impl_prim_ret_value!(f64, 2);
+impl_prim_ret_value!(isize, 0);
+impl_prim_ret_value!(usize, 1);
+impl_boxed_ret_value!(String, 11);
+
+// @see `WrapPackedFunc` in `llvm_module.cc`.
+pub(super) fn wrap_backend_packed_func(func: BackendPackedCFunc) -> PackedFunc {
+  box move |args: &[TVMArgValue]| {
+    func(
+      args
+        .iter()
+        .map(|ref arg| arg.value)
+        .collect::<Vec<TVMValue>>()
+        .as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.type_code as i32)
+        .collect::<Vec<i32>>()
+        .as_ptr() as *const i32,
+      args.len() as i32,
+    );
+    TVMRetValue::default()
+  }
+}
diff --git a/rust/src/runtime/sgx.rs b/rust/src/runtime/sgx.rs
new file mode 100644
index 000000000000..bf9d54a4af65
--- /dev/null
+++ b/rust/src/runtime/sgx.rs
@@ -0,0 +1,82 @@
+use std::{
+  ffi::CString,
+  os::raw::{c_char, c_int},
+};
+
+use errors::Result;
+use ffi::runtime::TVMValue;
+use runtime::{threading::sgx_join_threads, SystemLibModule, TVMArgValue, TVMRetValue};
+
+pub use runtime::threading::tvm_run_worker as run_worker;
+
+#[macro_export]
+macro_rules! tvm_ocall {
+  ($func: expr) => {
+    match $func {
+      0 => Ok(()),
+      err => Err(format!("SGX error: {}", err)),
+    }
+  };
+}
+
+pub type SgxStatus = u32;
+
+#[cfg(target_env = "sgx")]
+extern "C" {
+  fn tvm_ocall_packed_func(
+    name: *const c_char,
+    arg_values: *const TVMValue,
+    type_codes: *const c_int,
+    num_args: c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut c_int,
+  ) -> SgxStatus;
+}
+
+pub fn ocall_packed_func<S: AsRef<str>>(fn_name: S, args: &[TVMArgValue]) -> Result<TVMRetValue> {
+  let mut ret_val = TVMValue { v_int64: 0 };
+  let ret_type_code = 0i64;
+  unsafe {
+    tvm_ocall!(tvm_ocall_packed_func(
+      CString::new(fn_name.as_ref()).unwrap().as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.value)
+        .collect::<Vec<TVMValue>>()
+        .as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.type_code as i32)
+        .collect::<Vec<i32>>()
+        .as_ptr() as *const i32,
+      args.len() as i32,
+      &mut ret_val as *mut TVMValue,
+      &mut (ret_type_code as i32) as *mut c_int,
+    ))?;
+  }
+  Ok(TVMRetValue::from_tvm_value(ret_val, ret_type_code as i64))
+}
+
+#[macro_export]
+macro_rules! ocall_packed {
+  ($fn_name:expr, $($args:expr),+) => {
+    ::runtime::sgx::ocall_packed_func($fn_name, &[$($args.into(),)+])
+      .expect(concat!("Error calling `", $fn_name, "`"))
+  };
+  ($fn_name:expr) => {
+    ::runtime::sgx::ocall_packed_func($fn_name, &Vec::new())
+      .expect(concat!("Error calling `", $fn_name, "`"))
+  }
+}
+
+pub fn shutdown() {
+  if env!("TVM_NUM_THREADS") != "0" {
+    sgx_join_threads()
+  }
+}
+
+impl Drop for SystemLibModule {
+  fn drop(&mut self) {
+    shutdown()
+  }
+}
diff --git a/rust/src/runtime/threading.rs b/rust/src/runtime/threading.rs
new file mode 100644
index 000000000000..c0d6221c91b7
--- /dev/null
+++ b/rust/src/runtime/threading.rs
@@ -0,0 +1,334 @@
+use std::{
+  os::raw::{c_int, c_void},
+  sync::{
+    atomic::{AtomicUsize, Ordering, ATOMIC_USIZE_INIT},
+    Arc, Barrier,
+  },
+};
+
+#[cfg(not(target_env = "sgx"))]
+use num_cpus;
+#[cfg(not(target_env = "sgx"))]
+use std::{
+  env,
+  thread::{self, JoinHandle},
+};
+
+#[cfg(target_env = "sgx")]
+use std::{collections::VecDeque, ptr, sync::Mutex};
+
+use bounded_spsc_queue::{self, Producer};
+
+use super::super::errors::*;
+use ffi::runtime::TVMParallelGroupEnv;
+
+#[cfg(target_env = "sgx")]
+use super::{TVMArgValue, TVMRetValue};
+
+type FTVMParallelLambda =
+  extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32;
+
+/// Holds a parallel job request made by a TVM library function.
+struct Job {
+  cb: FTVMParallelLambda,
+  cdata: *const c_void,
+  req_num_tasks: usize,
+  pending: Arc<AtomicUsize>,
+}
+
+impl Job {
+  /// Splits this job into a number of `Task`s which can be scheduled.
+  fn tasks(&self, num_workers: usize) -> Vec<Task> {
+    let num_tasks = if self.req_num_tasks == 0 {
+      num_workers
+    } else {
+      self.req_num_tasks.min(num_workers)
+    };
+    self.pending.store(num_tasks, Ordering::SeqCst);
+
+    let barrier = Arc::new(Barrier::new(num_tasks));
+
+    (0..num_tasks)
+      .map(move |i| Task {
+        id: i,
+        flambda: self.cb,
+        penv: TVMParallelGroupEnv {
+          sync_handle: &Arc::clone(&barrier) as *const _ as *mut c_void,
+          num_task: num_tasks as i32,
+        },
+        cdata: self.cdata,
+        pending: Arc::clone(&self.pending),
+      }).collect()
+  }
+
+  /// Waits for all tasks in this `Job` to be completed.
+  fn wait(&self) -> Result<()> {
+    while self.pending.load(Ordering::Acquire) > 0 {
+      #[cfg(not(target_env = "sgx"))]
+      thread::yield_now();
+    }
+    Ok(())
+  }
+}
+
+/// A chunk of work requested by a TVM function.
+struct Task {
+  id: usize,
+  flambda: FTVMParallelLambda,
+  penv: TVMParallelGroupEnv,
+  cdata: *const c_void,
+  pending: Arc<AtomicUsize>,
+}
+unsafe impl Send for Task {}
+unsafe impl Sync for Task {}
+
+impl FnOnce<()> for Task {
+  type Output = i32;
+  extern "rust-call" fn call_once(self, _args: ()) -> Self::Output {
+    let status = (self.flambda)(self.id, &self.penv as *const _, self.cdata);
+    self.pending.fetch_sub(1, Ordering::AcqRel);
+    status
+  }
+}
+
+#[derive(Default)]
+struct Threads {
+  #[allow(unused)]
+  #[cfg(not(target_env = "sgx"))]
+  handles: Vec<JoinHandle<()>>,
+  queues: Vec<Producer<Task>>,
+}
+
+impl<'a> Threads {
+  #[cfg(not(target_env = "sgx"))]
+  fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
+    num_threads: usize,
+    cb: F,
+  ) -> Self {
+    let (handles, queues) = (0..num_threads)
+      .map(|_| {
+        let (p, c) = bounded_spsc_queue::make(2);
+        let handle = thread::spawn(move || cb(c.into()));
+        (handle, p)
+      }).unzip();
+    Threads {
+      handles: handles,
+      queues: queues,
+    }
+  }
+
+  #[cfg(target_env = "sgx")]
+  fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
+    num_threads: usize,
+    _cb: F,
+  ) -> Self {
+    let mut consumer_queues = SGX_QUEUES.lock().unwrap();
+    let queues = (0..num_threads)
+      .map(|_| {
+        let (p, c) = bounded_spsc_queue::make(2);
+        consumer_queues.push_back(c.into());
+        p
+      }).collect();
+    ocall_packed!("__sgx_thread_group_launch__", num_threads as u64);
+    Threads { queues: queues }
+  }
+}
+
+struct ThreadPool {
+  num_workers: usize,
+  #[allow(unused)]
+  threads: Threads,
+}
+
+thread_local!(static THREAD_POOL: ThreadPool = ThreadPool::new());
+
+impl ThreadPool {
+  fn new() -> Self {
+    let num_workers = max_concurrency();
+    ThreadPool {
+      num_workers: num_workers,
+      threads: Threads::launch(num_workers, ThreadPool::run_worker),
+    }
+  }
+
+  fn launch(&self, job: Job) {
+    let mut tasks = job.tasks(self.num_workers + 1);
+
+    for (i, task) in tasks.split_off(1).into_iter().enumerate() {
+      self.threads.queues[i].push(task);
+    }
+
+    tasks.pop().unwrap()();
+    job.wait().unwrap();
+  }
+
+  fn run_worker(queue: Consumer<Task>) {
+    loop {
+      let task = queue.pop();
+      let result = task();
+      if result == <i32>::min_value() {
+        break;
+      } else if result != 0 {
+        panic!("Error running task.");
+      }
+    }
+  }
+}
+
+// Send + Sync wrapper for bounded_spsc_queue::Consumer
+struct Consumer<T> {
+  consumer: bounded_spsc_queue::Consumer<T>,
+}
+impl<T> From<bounded_spsc_queue::Consumer<T>> for Consumer<T> {
+  fn from(c: bounded_spsc_queue::Consumer<T>) -> Self {
+    Consumer { consumer: c }
+  }
+}
+impl<T> Consumer<T> {
+  fn pop(&self) -> T {
+    self.consumer.pop()
+  }
+}
+unsafe impl<T> Send for Consumer<T> {}
+unsafe impl<T> Sync for Consumer<T> {}
+
+#[cfg(target_env = "sgx")]
+lazy_static! {
+  /// Holds tasks for untrusted threads which re-enter the enclave to execute.
+  static ref SGX_QUEUES: Mutex<VecDeque<Consumer<Task>>> = Mutex::new(VecDeque::new());
+}
+
+#[cfg(all(not(target_arch = "wasm32"), not(target_env = "sgx")))]
+fn max_concurrency() -> usize {
+  if let Ok(threads_str) = env::var("TVM_NUM_THREADS").or(env::var("OMP_NUM_THREADS")) {
+    if let Ok(threads) = usize::from_str_radix(&threads_str, 10) {
+      return threads;
+    }
+  }
+  num_cpus::get_physical()
+}
+
+#[cfg(target_env = "sgx")]
+fn max_concurrency() -> usize {
+  usize::from_str_radix(env!("TVM_NUM_THREADS"), 10).unwrap_or(1)
+}
+
+#[cfg(target_arch = "wasm32")]
+fn max_concurrency() -> usize {
+  0 // wasm doesn't support threads yet
+}
+
+#[cfg(target_env = "sgx")]
+pub fn tvm_run_worker(_args: &[TVMArgValue]) -> TVMRetValue {
+  let q = {
+    let mut qs = SGX_QUEUES.lock().unwrap();
+    qs.pop_front()
+    // `qs: MutexGuard` needs to be dropped here since `run_worker` won't return
+  };
+  if let Some(q) = q {
+    ThreadPool::run_worker(q);
+  }
+  TVMRetValue::default()
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendParallelLaunch(
+  cb: FTVMParallelLambda,
+  cdata: *const c_void,
+  num_task: usize,
+) -> c_int {
+  if max_concurrency() == 0 {
+    let penv = TVMParallelGroupEnv {
+      sync_handle: 0 as *mut c_void,
+      num_task: 1,
+    };
+    cb(0, &penv as *const _, cdata);
+  } else {
+    THREAD_POOL.with(|pool| {
+      pool.launch(Job {
+        cb: cb,
+        cdata: cdata,
+        req_num_tasks: num_task,
+        pending: Arc::new(ATOMIC_USIZE_INIT),
+      });
+    });
+  }
+  return 0;
+}
+
+#[cfg(target_env = "sgx")]
+pub(crate) fn sgx_join_threads() {
+  extern "C" fn poison_pill(
+    _task_id: usize,
+    _penv: *const TVMParallelGroupEnv,
+    _cdata: *const c_void,
+  ) -> i32 {
+    <i32>::min_value()
+  }
+
+  THREAD_POOL.with(|pool| {
+    pool.launch(Job {
+      cb: poison_pill,
+      cdata: ptr::null(),
+      req_num_tasks: 0,
+      pending: Arc::new(ATOMIC_USIZE_INIT),
+    });
+  });
+  ocall_packed!("__sgx_thread_group_join__", 0);
+}
+
+// @see https://github.com/dmlc/tvm/issues/988 for information on why this function is used.
+#[no_mangle]
+pub extern "C" fn TVMBackendParallelBarrier(_task_id: usize, penv: *const TVMParallelGroupEnv) {
+  let barrier: &Arc<Barrier> = unsafe { &*((*penv).sync_handle as *const Arc<Barrier>) };
+  barrier.wait();
+}
+
+#[cfg(test)]
+mod tests {
+  use std::{ptr, thread, time::Duration};
+
+  use super::*;
+
+  #[test]
+  fn test_max_concurrency() {
+    env::set_var("TVM_NUM_THREADS", "42");
+    env::set_var("OMP_NUM_THREADS", "24");
+    assert_eq!(max_concurrency(), 42);
+    env::remove_var("TVM_NUM_THREADS");
+    assert_eq!(max_concurrency(), 24);
+  }
+
+  extern "C" fn flambda(
+    task_id: usize,
+    penv: *const TVMParallelGroupEnv,
+    cdata: *const c_void,
+  ) -> i32 {
+    if cdata == ptr::null() {
+      return 0;
+    }
+    unsafe {
+      let &(ref counter, ref task_ids_sum) = &*(cdata as *const (AtomicUsize, AtomicUsize));
+      thread::sleep(Duration::from_millis(50 * task_id as u64));
+      counter.fetch_add(1, Ordering::SeqCst);
+      task_ids_sum.fetch_add(task_id, Ordering::SeqCst);
+      assert_eq!((*penv).num_task, 3);
+    }
+    0
+  }
+
+  #[test]
+  fn test_parallel_launch() {
+    TVMBackendParallelLaunch(flambda, ptr::null(), 6);
+    let counter = ATOMIC_USIZE_INIT;
+    let task_ids_sum = ATOMIC_USIZE_INIT;
+    let cdata = (counter, task_ids_sum);
+    let num_tasks = 3;
+    TVMBackendParallelLaunch(flambda, &cdata as *const _ as *const c_void, num_tasks);
+    assert_eq!(cdata.0.load(Ordering::SeqCst), num_tasks);
+    assert_eq!(
+      cdata.1.load(Ordering::SeqCst),
+      (0..num_tasks).sum::<usize>()
+    );
+  }
+}
diff --git a/rust/src/runtime/workspace.rs b/rust/src/runtime/workspace.rs
new file mode 100644
index 000000000000..d0e6d8c89255
--- /dev/null
+++ b/rust/src/runtime/workspace.rs
@@ -0,0 +1,119 @@
+use std::{
+  cell::RefCell,
+  os::raw::{c_int, c_void},
+  ptr,
+};
+
+use super::allocator::Allocation;
+use errors::*;
+
+const WS_ALIGN: usize = 64; // taken from `kTempAllocaAlignment` in `device_api.h`
+
+struct WorkspacePool {
+  workspaces: Vec<Allocation>,
+  free: Vec<usize>,
+  in_use: Vec<usize>,
+}
+
+impl WorkspacePool {
+  fn new() -> Self {
+    WorkspacePool {
+      workspaces: Vec::new(),
+      free: Vec::new(),
+      in_use: Vec::new(),
+    }
+  }
+
+  fn alloc_new(&mut self, size: usize) -> Result<*mut u8> {
+    self.workspaces.push(Allocation::new(size, Some(WS_ALIGN))?);
+    self.in_use.push(self.workspaces.len() - 1);
+    Ok(self.workspaces[self.workspaces.len() - 1].as_mut_ptr())
+  }
+
+  fn alloc(&mut self, size: usize) -> Result<*mut u8> {
+    if self.free.len() == 0 {
+      return self.alloc_new(size);
+    }
+    let idx = self
+      .free
+      .iter()
+      .fold(None, |cur_ws_idx: Option<usize>, &idx| {
+        let ws_size = self.workspaces[idx].size();
+        if !ws_size >= size {
+          return cur_ws_idx;
+        }
+        cur_ws_idx.or(Some(idx)).and_then(|cur_idx| {
+          let cur_size = self.workspaces[cur_idx].size();
+          Some(match ws_size <= cur_size {
+            true => idx,
+            false => cur_idx,
+          })
+        })
+      });
+    match idx {
+      Some(idx) => {
+        self.free.remove_item(&idx).unwrap();
+        self.in_use.push(idx);
+        Ok(self.workspaces[idx].as_mut_ptr())
+      }
+      None => self.alloc_new(size),
+    }
+  }
+
+  fn free(&mut self, ptr: *mut u8) -> Result<()> {
+    let mut ws_idx = None;
+    for i in 0..self.in_use.len() {
+      let idx = self.in_use[i];
+      if self.workspaces[idx].as_mut_ptr() == ptr {
+        self.in_use.remove(i);
+        ws_idx = Some(idx);
+        break;
+      }
+    }
+    Ok(
+      self
+        .free
+        .push(ws_idx.ok_or("Tried to free nonexistent workspace.")?),
+    )
+  }
+}
+
+thread_local!(static WORKSPACE_POOL: RefCell<WorkspacePool> = RefCell::new(WorkspacePool::new()));
+
+const WORKSPACE_PAGE_SIZE: usize = 4 << 10;
+
+#[no_mangle]
+pub extern "C" fn TVMBackendAllocWorkspace(
+  _device_type: c_int,
+  _device_id: c_int,
+  size: u64,
+  _dtype_code_hint: c_int,
+  _dtype_bits_hint: c_int,
+) -> *mut c_void {
+  let nbytes = if size == 0 {
+    WORKSPACE_PAGE_SIZE
+  } else {
+    size as usize
+  };
+  WORKSPACE_POOL.with(|pool_cell| {
+    pool_cell
+      .borrow_mut()
+      .alloc(nbytes as usize)
+      .unwrap_or(ptr::null_mut()) as *mut c_void
+  })
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendFreeWorkspace(
+  _device_type: c_int,
+  _device_id: c_int,
+  ptr: *mut c_void,
+) -> c_int {
+  WORKSPACE_POOL.with(|pool_cell| {
+    (match pool_cell.borrow_mut().free(ptr as *mut u8) {
+      Ok(()) => 0,
+      Err(_) => -1,
+    }) as c_int
+  });
+  return 0;
+}
diff --git a/rust/tests/.gitignore b/rust/tests/.gitignore
new file mode 100644
index 000000000000..811076739bfa
--- /dev/null
+++ b/rust/tests/.gitignore
@@ -0,0 +1,3 @@
+*.json
+*.params
+*.o
diff --git a/rust/tests/build_model.py b/rust/tests/build_model.py
new file mode 100644
index 000000000000..e0b90495159f
--- /dev/null
+++ b/rust/tests/build_model.py
@@ -0,0 +1,53 @@
+"""Builds a simple NNVM graph for testing."""
+
+from os import path as osp
+
+import nnvm
+from nnvm import sym
+from nnvm.compiler import graph_util
+from nnvm.testing import init
+import numpy as np
+import tvm
+
+CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
+
+
+def _get_model(dshape):
+    data = sym.Variable('data', shape=dshape)
+    fc1 = sym.dense(data, units=dshape[-1]*2, use_bias=True)
+    left, right = sym.split(fc1, indices_or_sections=2, axis=1)
+    return sym.Group(((left + 1), (right - 1)))
+
+
+def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
+    if isinstance(graph, sym.Symbol):
+        graph = nnvm.graph.create(graph)
+    ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
+    param_shapes = dict(zip(graph.index.input_names, ishapes))
+    np.random.seed(seed)
+    params = {}
+    for param, shape in param_shapes.items():
+        if param in {'data', 'label'} or not shape:
+            continue
+        init_value = np.empty(shape).astype('float32')
+        initializer(param, init_value)
+        params[param] = tvm.nd.array(init_value)
+    return params
+
+def main():
+    dshape = (32, 16)
+    net = _get_model(dshape)
+    ishape_dict = {'data': dshape}
+    params = _init_params(net, ishape_dict)
+    graph, lib, params = nnvm.compiler.build(net, 'llvm',
+                                             shape=ishape_dict,
+                                             params=params,
+                                             dtype='float32')
+
+    with open(osp.join(CWD, 'graph.json'), 'w') as f_resnet:
+        f_resnet.write(graph.json())
+    with open(osp.join(CWD, 'graph.params'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_graph_serde.rs b/rust/tests/test_graph_serde.rs
new file mode 100644
index 000000000000..a596544212ca
--- /dev/null
+++ b/rust/tests/test_graph_serde.rs
@@ -0,0 +1,38 @@
+#![feature(try_from)]
+
+extern crate serde;
+extern crate serde_json;
+
+extern crate tvm;
+
+use std::{convert::TryFrom, fs, io::Read};
+
+use tvm::runtime::Graph;
+
+#[test]
+fn test_load_graph() {
+  let mut params_bytes = Vec::new();
+  fs::File::open(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.params"))
+    .expect("Could not find TVM graph. Did you run `tests/build_model.py`?")
+    .read_to_end(&mut params_bytes)
+    .unwrap();
+  let _params = tvm::runtime::load_param_dict(&params_bytes);
+
+  let graph = Graph::try_from(
+    &fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.json")).unwrap(),
+  ).unwrap();
+
+  assert_eq!(graph.nodes[3].op, "tvm_op");
+  assert_eq!(
+    graph.nodes[3]
+      .attrs
+      .as_ref()
+      .unwrap()
+      .get("func_name")
+      .unwrap(),
+    "fuse_dense"
+  );
+  assert_eq!(graph.nodes[5].inputs[0].index, 0);
+  assert_eq!(graph.nodes[6].inputs[0].index, 1);
+  assert_eq!(graph.heads.len(), 2);
+}
diff --git a/rust/tests/test_nnvm/Cargo.toml b/rust/tests/test_nnvm/Cargo.toml
new file mode 100644
index 000000000000..7e6ce5fb729c
--- /dev/null
+++ b/rust/tests/test_nnvm/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "test-nnvm"
+version = "0.0.0"
+license = "Apache-2.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[dependencies]
+ndarray = "0.11.2"
+tvm = { path = "../../" }
+serde = "1.0.59"
+serde_json = "1.0.17"
+
+[build-dependencies]
+ar = "0.6.0"
diff --git a/rust/tests/test_nnvm/build.rs b/rust/tests/test_nnvm/build.rs
new file mode 100644
index 000000000000..cb3a4e0d574d
--- /dev/null
+++ b/rust/tests/test_nnvm/build.rs
@@ -0,0 +1,28 @@
+extern crate ar;
+
+use std::{env, path::PathBuf, process::Command};
+
+use ar::Builder;
+use std::fs::File;
+
+fn main() {
+  let out_dir = env::var("OUT_DIR").unwrap();
+
+  let output = Command::new(concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/src/build_test_graph.py"
+  )).arg(&out_dir)
+    .output()
+    .expect("Failed to execute command");
+  if output.stderr.len() > 0 {
+    panic!(String::from_utf8(output.stderr).unwrap());
+  }
+
+  let in_path: PathBuf = [&out_dir, "graph.o"].iter().collect();
+  let out_path: PathBuf = [&out_dir, "libgraph.a"].iter().collect();
+  let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
+  builder.append_path(in_path.to_str().unwrap()).unwrap();
+
+  println!("cargo:rustc-link-lib=static=graph");
+  println!("cargo:rustc-link-search=native={}", out_dir);
+}
diff --git a/rust/tests/test_nnvm/src/build_test_graph.py b/rust/tests/test_nnvm/src/build_test_graph.py
new file mode 100755
index 000000000000..429cc2128931
--- /dev/null
+++ b/rust/tests/test_nnvm/src/build_test_graph.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+"""Builds a simple NNVM graph for testing."""
+
+from os import path as osp
+import sys
+
+import nnvm
+from nnvm import sym
+from nnvm.compiler import graph_util
+from nnvm.testing import init
+import numpy as np
+import tvm
+
+
+def _get_model(dshape):
+    data = sym.Variable('data', shape=dshape)
+    fc = sym.dense(data, units=dshape[-1]*2, use_bias=True)
+    left, right = sym.split(fc, indices_or_sections=2, axis=1)
+    return sym.Group(((left + 1), (right - 1), fc))
+
+
+def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
+    if isinstance(graph, sym.Symbol):
+        graph = nnvm.graph.create(graph)
+    ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
+    param_shapes = dict(zip(graph.index.input_names, ishapes))
+    np.random.seed(seed)
+    params = {}
+    for param, shape in param_shapes.items():
+        if param in {'data', 'label'} or not shape:
+            continue
+
+        init_value = np.arange(np.product(shape), 0, -1).reshape(*shape).astype('float32')
+        if param.endswith('_bias'):
+            params[param] = tvm.nd.array(init_value)
+            continue
+
+        init_value = np.empty(shape).astype('float32')
+        initializer(param, init_value)
+        # init_value /= init_value.sum() + 1e-10
+        params[param] = tvm.nd.array(init_value)
+    return params
+
+def main():
+    dshape = (4, 8)
+    net = _get_model(dshape)
+    ishape_dict = {'data': dshape}
+    params = _init_params(net, ishape_dict)
+    graph, lib, params = nnvm.compiler.build(net, 'llvm --system-lib',
+                                             shape=ishape_dict,
+                                             params=params,
+                                             dtype='float32')
+
+    out_dir = sys.argv[1]
+    lib.save(osp.join(sys.argv[1], 'graph.o'))
+    with open(osp.join(out_dir, 'graph.json'), 'w') as f_resnet:
+        f_resnet.write(graph.json())
+    with open(osp.join(out_dir, 'graph.params'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_nnvm/src/main.rs b/rust/tests/test_nnvm/src/main.rs
new file mode 100644
index 000000000000..0953ce2a2603
--- /dev/null
+++ b/rust/tests/test_nnvm/src/main.rs
@@ -0,0 +1,80 @@
+#![feature(try_from)]
+
+#[macro_use]
+extern crate ndarray;
+extern crate serde;
+extern crate serde_json;
+
+extern crate tvm;
+use std::{collections::HashMap, convert::TryFrom, fs, io::Read};
+
+use ndarray::Array;
+use tvm::runtime::{Graph, GraphExecutor, SystemLibModule, Tensor};
+
+const BATCH_SIZE: usize = 4;
+const IN_DIM: usize = 8;
+
+macro_rules! check_sum {
+  ($e:expr, $a:ident, $b:ident) => {
+    let a = Array::try_from($e.get_input(stringify!($a)).unwrap()).unwrap();
+    check_sum!(a, $b);
+  };
+  ($e:expr, $a:expr, $b:ident) => {
+    let a = Array::try_from($e.get_output($a).unwrap()).unwrap();
+    check_sum!(a, $b);
+  };
+  ($a:ident, $b:ident) => {
+    let a_sum: f32 = $a.scalar_sum();
+    let b_sum: f32 = $b.scalar_sum();
+    assert!((a_sum - b_sum).abs() < 1e-2, "{} != {}", a_sum, b_sum);
+  };
+}
+
+fn main() {
+  let syslib = SystemLibModule::default();
+
+  let mut params_bytes = Vec::new();
+  fs::File::open(concat!(env!("OUT_DIR"), "/graph.params"))
+    .unwrap()
+    .read_to_end(&mut params_bytes)
+    .unwrap();
+  let params = tvm::runtime::load_param_dict(&params_bytes)
+    .unwrap()
+    .into_iter()
+    .map(|(k, v)| (k, v.to_owned()))
+    .collect::<HashMap<String, Tensor<'static>>>();
+
+  let graph =
+    Graph::try_from(&fs::read_to_string(concat!(env!("OUT_DIR"), "/graph.json")).unwrap()).unwrap();
+  let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
+
+  let x = Array::from_shape_vec(
+    (BATCH_SIZE, IN_DIM),
+    (0..BATCH_SIZE * IN_DIM)
+      .map(|x| x as f32)
+      .collect::<Vec<f32>>(),
+  ).unwrap();
+  let w = Array::try_from(params.get("dense0_weight").unwrap())
+    .unwrap()
+    .into_shape((IN_DIM * 2, IN_DIM))
+    .unwrap();
+  let b = Array::try_from(params.get("dense0_bias").unwrap()).unwrap();
+  let dense = x.dot(&w.t()) + &b;
+  let left = dense.slice(s![.., 0..IN_DIM]);
+  let right = dense.slice(s![.., IN_DIM..]);
+  let expected_o0 = &left + 1f32;
+  let expected_o1 = &right - 1f32;
+
+  exec.load_params(params);
+  exec.set_input("data", x.clone().into());
+
+  check_sum!(exec, data, x);
+  check_sum!(exec, dense0_weight, w);
+  check_sum!(exec, dense0_bias, b);
+
+  exec.run();
+
+  check_sum!(exec, 0, expected_o0);
+  check_sum!(exec, 1, expected_o1);
+  check_sum!(exec, 2, dense);
+}
diff --git a/rust/tests/test_tvm_basic/Cargo.toml b/rust/tests/test_tvm_basic/Cargo.toml
new file mode 100644
index 000000000000..bd4193bcb8fb
--- /dev/null
+++ b/rust/tests/test_tvm_basic/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "test-tvm-basic"
+version = "0.0.0"
+license = "Apache-2.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[dependencies]
+ndarray = "0.11.2"
+tvm = { path = "../../" }
+
+[build-dependencies]
+ar = "0.6.0"
diff --git a/rust/tests/test_tvm_basic/build.rs b/rust/tests/test_tvm_basic/build.rs
new file mode 100644
index 000000000000..778dd1cab1ca
--- /dev/null
+++ b/rust/tests/test_tvm_basic/build.rs
@@ -0,0 +1,28 @@
+extern crate ar;
+
+use std::{env, path::PathBuf, process::Command};
+
+use ar::Builder;
+use std::fs::File;
+
+fn main() {
+  let out_dir = env::var("OUT_DIR").unwrap();
+
+  let output = Command::new(concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/src/build_test_lib.py"
+  )).arg(&out_dir)
+    .output()
+    .expect("Failed to execute command");
+  if output.stderr.len() > 0 {
+    panic!(String::from_utf8(output.stderr).unwrap());
+  }
+
+  let in_path: PathBuf = [&out_dir, "test.o"].iter().collect();
+  let out_path: PathBuf = [&out_dir, "libtest.a"].iter().collect();
+  let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
+  builder.append_path(in_path.to_str().unwrap()).unwrap();
+
+  println!("cargo:rustc-link-lib=static=test");
+  println!("cargo:rustc-link-search=native={}", out_dir);
+}
diff --git a/rust/tests/test_tvm_basic/src/build_test_lib.py b/rust/tests/test_tvm_basic/src/build_test_lib.py
new file mode 100755
index 000000000000..7289a778fcec
--- /dev/null
+++ b/rust/tests/test_tvm_basic/src/build_test_lib.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+"""Prepares a simple TVM library for testing."""
+
+from os import path as osp
+import sys
+
+import tvm
+
+def main():
+    n = tvm.var('n')
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    s[C].parallel(s[C].op.axis[0])
+    print(tvm.lower(s, [A, B, C], simple_mode=True))
+    tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_tvm_basic/src/main.rs b/rust/tests/test_tvm_basic/src/main.rs
new file mode 100644
index 000000000000..b6c11451d12a
--- /dev/null
+++ b/rust/tests/test_tvm_basic/src/main.rs
@@ -0,0 +1,25 @@
+extern crate ndarray;
+#[macro_use]
+extern crate tvm;
+
+use ndarray::Array;
+use tvm::{
+  ffi::runtime::DLTensor,
+  runtime::{Module, SystemLibModule},
+};
+
+fn main() {
+  let syslib = SystemLibModule::default();
+  let add = syslib
+    .get_function("default_function")
+    .expect("main function not found");
+  let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
+  let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
+  let mut c = Array::from_vec(vec![0f32; 4]);
+  let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
+  let mut a_dl: DLTensor = (&mut a).into();
+  let mut b_dl: DLTensor = (&mut b).into();
+  let mut c_dl: DLTensor = (&mut c).into();
+  call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl);
+  assert!(c.all_close(&e, 1e-8f32));
+}

From 6e599399d6662128bb3fd32e53765238eb173c4f Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Sat, 6 Oct 2018 10:32:43 -0700
Subject: [PATCH 186/529] Update SGX example (#1825)

---
 .gitignore                                    |   8 ++
 apps/sgx/Makefile                             | 107 ++++++++--------
 apps/sgx/README.md                            |  18 ++-
 apps/sgx/enclave/.rustfmt.toml                |  59 +++++++++
 apps/sgx/enclave/Cargo.toml                   |  16 +++
 apps/sgx/enclave/Makefile                     |  35 ++++++
 apps/sgx/enclave/Xargo.toml                   |  17 +++
 apps/sgx/enclave/enclave.lds                  |   9 ++
 .../enclave_config.xml.in}                    |   8 +-
 apps/sgx/enclave/src/lib.rs                   | 119 ++++++++++++++++++
 apps/sgx/prepare_test_libs.py                 |  26 ----
 apps/sgx/run_example.sh                       |   6 +-
 apps/sgx/run_model.py                         |  20 +++
 apps/sgx/test_addone.py                       |  13 --
 cmake/modules/SGX.cmake                       |   6 +-
 15 files changed, 361 insertions(+), 106 deletions(-)
 create mode 100644 apps/sgx/enclave/.rustfmt.toml
 create mode 100644 apps/sgx/enclave/Cargo.toml
 create mode 100644 apps/sgx/enclave/Makefile
 create mode 100644 apps/sgx/enclave/Xargo.toml
 create mode 100644 apps/sgx/enclave/enclave.lds
 rename apps/sgx/{enclave_config.xml => enclave/enclave_config.xml.in} (50%)
 create mode 100644 apps/sgx/enclave/src/lib.rs
 delete mode 100644 apps/sgx/prepare_test_libs.py
 create mode 100644 apps/sgx/run_model.py
 delete mode 100644 apps/sgx/test_addone.py

diff --git a/.gitignore b/.gitignore
index 368764941cec..d24fccb6f513 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,3 +198,11 @@ tvm_t.*
 
 # tmp file
 .nfs*
+
+# keys
+*.pem
+*.p12
+*.pfx
+*.cer
+*.crt
+*.der
diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
index 1038f57c3ba1..875897b82d23 100644
--- a/apps/sgx/Makefile
+++ b/apps/sgx/Makefile
@@ -1,13 +1,12 @@
-# Makefile for example to deploy TVM modules in SGX.
-
-TVM_ROOT := $(shell cd ../..; pwd)
-NNVM_PATH := nnvm
-DMLC_CORE := ${TVM_ROOT}/dmlc-core
-
 SGX_SDK ?= /opt/sgxsdk
+RUST_SGX_SDK ?= /opt/rust-sgx-sdk
 SGX_MODE ?= SIM
-SGX_ARCH ?= x64
-SGX_DEBUG ?= 1
+DEBUG ?= true
+NUM_THREADS ?= 4
+
+TVM_DIR ?= ../..
+
+export
 
 sgx_edger8r := $(SGX_SDK)/bin/x64/sgx_edger8r
 sgx_enclave_signer := $(SGX_SDK)/bin/x64/sgx_sign
@@ -20,69 +19,71 @@ trts_library_name := sgx_trts$(sgx_sim)
 tservice_library_name := sgx_tservice$(sgx_sim)
 uservice_library_name := sgx_uae_service$(sgx_sim)
 
-pkg_cflags := -std=c++11 -O2 -fPIC\
-	-I${TVM_ROOT}/include\
-	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/3rdparty/dlpack/include\
-	-I.\
-	-DDMLC_LOG_STACK_TRACE=0\
-	-fmax-errors=4
-
-pkg_ldflags := -L${TVM_ROOT}/lib
-
-enclave_include_paths := -I$(SGX_SDK)/include\
-	-I$(SGX_SDK)/include/tlibc\
-	-I$(SGX_SDK)/include/libcxx\
-	-I$(SGX_SDK)/include/stdc++\
+pkg_cflags := -std=c++11 -fPIC \
+	-I$(SGX_SDK)/include \
+	-I$(TVM_DIR)/include \
+	-I$(TVM_DIR)/dlpack/include \
+	-I$(TVM_DIR)/dmlc-core/include
+
+pkg_ldflags := -L$(TVM_DIR)/build -ltvm_runtime
+
+ifneq ($(DEBUG), false)
+	debug := debug
+	enclave_cflags += -Og -g
+	pkg_cflags += -Og -g
+else
+	debug := release
+	enclave_cflags += -O2
+	pkg_cflags += -O2
+endif
 
-enclave_cflags := -static -nostdinc\
-	-fvisibility=hidden -fpie -fstack-protector-strong\
-	-ffunction-sections -fdata-sections\
-	-DDMLC_CXX11_THREAD_LOCAL=0\
-	-include "lib/tvm_t.h"\
-	$(enclave_include_paths)\
+build_dir := build
 
-enclave_cxxflags := -nostdinc++ $(enclave_cflags) -DTVM_SGX_MAX_CONCURRENCY=4
+enclave_cflags := \
+	-I$(SGX_SDK)/include \
+	-I$(SGX_SDK)/include/tlibc \
+	-I$(SGX_SDK)/include/stdport \
+	-I$(SGX_SDK)/include/epid \
+	-I$(TVM_DIR)/include \
+	-I$(TVM_DIR)/dlpack/include \
+	-I$(TVM_DIR)/dmlc-core/include
 
 enclave_ldflags :=\
+	-L$(build_dir) -L$(TVM_DIR)/build \
 	-Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\
 	-Wl,--whole-archive -l$(trts_library_name) -Wl,--no-whole-archive\
 	-Wl,--start-group\
 	-lsgx_tstdc -lsgx_tstdcxx -lsgx_tcxx -lsgx_tcrypto -lsgx_tkey_exchange -l$(tservice_library_name)\
+	-lenclave -ltvm_t\
 	-Wl,--end-group\
 	-Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined\
 	-Wl,-pie,-eenclave_entry -Wl,--export-dynamic\
-	-Wl,--defsym,__ImageBase=0 -Wl,--gc-sections
-
-.PHONY: clean all
+	-Wl,--defsym,__ImageBase=0 -Wl,--gc-sections\
+	-Wl,--version-script=enclave/enclave.lds
 
-all: lib/test_addone.signed.so
+.PHONY: enclave clean
 
-# The code library built by TVM
-lib/test_addone_sys.o: prepare_test_libs.py
-	python prepare_test_libs.py
+enclave: $(build_dir)/enclave.signed.so
 
-lib/tvm_t.h: ../../src/runtime/sgx/tvm.edl
-	$(sgx_edger8r) --trusted $< --trusted-dir lib --search-path $(SGX_SDK)/include
-	mv $@ $@.in
-	awk 'NR==4{print "#include <tvm/runtime/c_runtime_api.h>"}1' $@.in > $@
+$(build_dir)/enclave.signed.so: $(build_dir)/enclave.so build/enclave_config.xml enclave/enclave.pem
+	$(sgx_enclave_signer) sign -key enclave/enclave.pem -enclave $< -out $@ -config build/enclave_config.xml
 
-lib/tvm_t.c: lib/tvm_t.h
+enclave/enclave.pem:
+	curl -sSo $@ 'https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem'
 
-lib/tvm_t.o: lib/tvm_t.c
-	$(CC) $(enclave_cflags) $(pkg_cflags) -c $< -o $@ -include $(TVM_ROOT)/include/tvm/runtime/c_runtime_api.h
+build/enclave_config.xml: enclave/enclave_config.xml.in
+	cpp $^ -P -o $@ -DNUM_THREADS=$$(( $(NUM_THREADS) + 1 ))
 
-# The enclave library
-lib/test_addone.so: $(TVM_ROOT)/src/runtime/sgx/trusted/runtime.cc lib/tvm_t.o lib/test_addone_sys.o
-	$(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) $(enclave_ldflags) -g
+$(build_dir)/enclave.so: $(build_dir)/libenclave.a $(TVM_DIR)/build/libtvm_t.a
+	$(CXX) $< -o $@ $(enclave_ldflags) $(enclave_cflags) -ltvm_t
 
-# The demo enclave signing key
-lib/enclave.pem:
-	curl -Lso $@ https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem
+$(build_dir)/libenclave.a: enclave/target/x86_64-unknown-linux-sgx/$(debug)/libmodel_enclave.a
+	@mkdir -p $(@D)
+	@cp $< $@
 
-# The signed enclave
-lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml lib/enclave.pem
-	$(sgx_enclave_signer) sign -key lib/enclave.pem -enclave $< -out $@ -config enclave_config.xml
+enclave/target/x86_64-unknown-linux-sgx/$(debug)/libmodel_enclave.a: enclave/**/*
+	$(MAKE) -C enclave
 
 clean:
-	rm -rf lib
+	$(MAKE) -s -C enclave clean
+	rm -rf build
diff --git a/apps/sgx/README.md b/apps/sgx/README.md
index 565519d457ce..dd21cff02f80 100644
--- a/apps/sgx/README.md
+++ b/apps/sgx/README.md
@@ -4,13 +4,22 @@ This application demonstrates the use of a simple TVM model in the [Intel SGX](h
 
 ## Prerequisites
 
+1. The TVM premade Docker image
+
+or
+
 1. A GNU/Linux environment
 2. TVM compiled with LLVM and SGX; and the `tvm` Python module
 3. The [Linux SGX SDK](https://github.com/intel/linux-sgx) [link to pre-built libraries](https://01.org/intel-software-guard-extensions/downloads)
+4. [Rust](https://rustup.sh)
+5. The [rust-sgx-sdk](https://github.com/baidu/rust-sgx-sdk)
+6. [xargo](https://github.com/japaric/xargo)
+
+Check out the `/tvm/install/ubuntu_install_sgx.sh` for the commands to get these dependencies.
 
 ## Running the example
 
-`SGX_SDK=/path/to/sgxsdk bash run_example.sh`
+`bash run_example.sh`
 
 If everything goes well, you should see a lot of build messages and below them
 the text `It works!`.
@@ -24,10 +33,9 @@ In this library, one can use other libraries like TVM.
 Building this example performs the following steps:
 
 1. Creates a simple TVM module that computes `x + 1` and save it as a system library.
-2. Builds a minimal TVM runtime pack that can load the module.
-3. Links the TVM module into an SGX enclave along with some code that runs the module.
-4. Compiles and runs an executable that loads the enclave and calls a function
-   which invokes the TVM module.
+2. Builds a TVM runtime that links the module and allows running it using the TVM Python runtime.
+3. Packages the bundle into an SGX enclave
+4. Runs the enclave using the usual TVM Python `module` API
 
 For more information on building, please refer to the `Makefile`.  
 For more information on the TVM module, please refer to `../howto_deploy`.  
diff --git a/apps/sgx/enclave/.rustfmt.toml b/apps/sgx/enclave/.rustfmt.toml
new file mode 100644
index 000000000000..9ae87cc6bfcf
--- /dev/null
+++ b/apps/sgx/enclave/.rustfmt.toml
@@ -0,0 +1,59 @@
+max_width = 100
+hard_tabs = false
+tab_spaces = 2
+newline_style = "Auto"
+use_small_heuristics = "Default"
+indent_style = "Block"
+wrap_comments = false
+comment_width = 80
+normalize_comments = false
+format_strings = false
+format_macro_matchers = false
+format_macro_bodies = true
+empty_item_single_line = true
+struct_lit_single_line = true
+fn_single_line = false
+where_single_line = false
+imports_indent = "Block"
+imports_layout = "Mixed"
+merge_imports = true
+reorder_imports = true
+reorder_modules = true
+reorder_impl_items = false
+type_punctuation_density = "Wide"
+space_before_colon = false
+space_after_colon = true
+spaces_around_ranges = false
+binop_separator = "Front"
+remove_nested_parens = true
+combine_control_expr = true
+struct_field_align_threshold = 0
+match_arm_blocks = true
+force_multiline_blocks = false
+fn_args_density = "Tall"
+brace_style = "SameLineWhere"
+control_brace_style = "AlwaysSameLine"
+trailing_semicolon = true
+trailing_comma = "Vertical"
+match_block_trailing_comma = true
+blank_lines_upper_bound = 1
+blank_lines_lower_bound = 0
+edition = "2015"
+merge_derives = true
+use_try_shorthand = true
+use_field_init_shorthand = false
+force_explicit_abi = true
+condense_wildcard_suffixes = true
+color = "Auto"
+required_version = "0.99.5"
+unstable_features = false
+disable_all_formatting = false
+skip_children = false
+hide_parse_errors = false
+error_on_line_overflow = false
+error_on_unformatted = false
+report_todo = "Never"
+report_fixme = "Never"
+ignore = []
+emit_mode = "Files"
+make_backup = false
diff --git a/apps/sgx/enclave/Cargo.toml b/apps/sgx/enclave/Cargo.toml
new file mode 100644
index 000000000000..9a14c76c5897
--- /dev/null
+++ b/apps/sgx/enclave/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "model-enclave"
+version = "0.1.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+lazy_static = "1.1.0"
+# tvm = { path = "../../../rust", default-features = false, features = ["sgx"] }
+tvm = { path = "/home/nhynes/myelin/deps/tvm-rs", default-features = false, features = ["sgx"] }
+
+[profile.release]
+lto = true
+opt-level = 3
diff --git a/apps/sgx/enclave/Makefile b/apps/sgx/enclave/Makefile
new file mode 100644
index 000000000000..e8515356238a
--- /dev/null
+++ b/apps/sgx/enclave/Makefile
@@ -0,0 +1,35 @@
+MODEL ?= resnet
+NUM_THREADS ?= 4
+BATCH_SIZE ?= 64
+TRAINING ?= true
+DEBUG ?= false
+
+build_dir := ../build
+
+ifeq ($(DEBUG), false)
+	debug := release
+	xargo_args := --release
+else
+	debug := debug
+endif
+
+target/x86_64-unknown-linux-sgx/$(debug)/libmodel-enclave.a: $(build_dir)/libmodel.a **/*
+	RUST_TARGET_PATH=$(shell pwd) \
+		RUST_TARGET_DIR=$(shell pwd)/target \
+		RUSTFLAGS="-Z force-unstable-if-unmarked" \
+		TVM_NUM_THREADS=$(NUM_THREADS) \
+		BUILD_DIR=../build \
+		xargo build --target x86_64-unknown-linux-sgx $(xargo_args) -q
+
+$(build_dir)/libmodel.a: $(build_dir)/model.o
+	llvm-ar cr $@ $^
+
+$(build_dir)/model.o: $(build_dir)/model.bc
+	clang -c $< -o $@ -fPIC -O3
+	objcopy --globalize-symbol __tvm_module_startup $@
+
+$(build_dir)/model.bc: src/build_model.py
+	python3 $< -o $(build_dir)
+
+clean:
+	xargo clean
diff --git a/apps/sgx/enclave/Xargo.toml b/apps/sgx/enclave/Xargo.toml
new file mode 100644
index 000000000000..1fd50d699264
--- /dev/null
+++ b/apps/sgx/enclave/Xargo.toml
@@ -0,0 +1,17 @@
+[dependencies]
+alloc = {}
+panic_unwind = {}
+panic_abort = {}
+
+[dependencies.std]
+features = ["backtrace", "stdio", "untrusted_time"]
+path = "/home/nhynes/myelin/deps/rust-sgx-sdk/xargo/sgx_tstd"
+# git = "https://github.com/oasislabs/rust-sgx-sdk"
+# rev = "7334c30d85cb1752577998705110b7b27c69b570"
+stage = 2
+
+[dependencies.xargo_sgx_rand]
+# git = "https://github.com/oasislabs/rust-sgx-sdk"
+path = "/home/nhynes/myelin/deps/rust-sgx-sdk/xargo/sgx_rand"
+# rev = "7334c30d85cb1752577998705110b7b27c69b570"
+stage = 3
diff --git a/apps/sgx/enclave/enclave.lds b/apps/sgx/enclave/enclave.lds
new file mode 100644
index 000000000000..e3d9d0ee0d90
--- /dev/null
+++ b/apps/sgx/enclave/enclave.lds
@@ -0,0 +1,9 @@
+enclave.so
+{
+    global:
+        g_global_data_sim;
+        g_global_data;
+        enclave_entry;
+    local:
+        *;
+};
diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave/enclave_config.xml.in
similarity index 50%
rename from apps/sgx/enclave_config.xml
rename to apps/sgx/enclave/enclave_config.xml.in
index 07be0d7a7ad2..d49b6693f231 100644
--- a/apps/sgx/enclave_config.xml
+++ b/apps/sgx/enclave/enclave_config.xml.in
@@ -1,10 +1,10 @@
 <EnclaveConfiguration>
   <ProdID>0</ProdID>
   <ISVSVN>0</ISVSVN>
-  <StackMaxSize>0x2000</StackMaxSize>
-  <HeapMaxSize>0x2000</HeapMaxSize>
-  <TCSNum>5</TCSNum>
-  <TCSPolicy>1</TCSPolicy>
+  <StackMaxSize>0x100000</StackMaxSize>
+  <HeapMaxSize>0xf0000000</HeapMaxSize>
+  <TCSNum>NUM_THREADS</TCSNum>
+  <TCSPolicy>0</TCSPolicy> <!-- must be "bound" to use thread_local -->
   <DisableDebug>0</DisableDebug>
   <MiscSelect>0</MiscSelect>
   <MiscMask>0xFFFFFFFF</MiscMask>
diff --git a/apps/sgx/enclave/src/lib.rs b/apps/sgx/enclave/src/lib.rs
new file mode 100644
index 000000000000..d74015a92510
--- /dev/null
+++ b/apps/sgx/enclave/src/lib.rs
@@ -0,0 +1,119 @@
+#![feature(try_from)]
+
+#[macro_use]
+extern crate lazy_static;
+extern crate tvm;
+
+use std::{convert::TryFrom, sync::Mutex};
+
+use tvm::runtime::{sgx, Graph, GraphExecutor, SystemLibModule, TVMArgValue, TVMRetValue};
+
+lazy_static! {
+  static ref SYSLIB: SystemLibModule = { SystemLibModule::default() };
+  static ref MODEL: Mutex<GraphExecutor<'static, 'static>> = {
+    let _params = include_bytes!(concat!("../", env!("BUILD_DIR"), "/params.bin"));
+    let graph_json = include_str!(concat!("../", env!("BUILD_DIR"), "/graph.json"));
+
+    let graph = Graph::try_from(graph_json).unwrap();
+    Mutex::new(GraphExecutor::new(graph, &*SYSLIB).unwrap())
+  };
+}
+
+fn ecall_init(_args: &[TVMArgValue]) -> TVMRetValue {
+  lazy_static::initialize(&MODEL);
+  TVMRetValue::from(0)
+}
+
+fn ecall_main(_args: &[TVMArgValue]) -> TVMRetValue {
+  let model = MODEL.lock().unwrap();
+  // model.set_input("data", args[0]);
+  model.run();
+  sgx::shutdown();
+  // model.get_output(0).into()
+  TVMRetValue::from(42)
+}
+
+pub mod ecalls {
+  //! todo: generate this using proc_macros
+
+  use super::*;
+
+  use std::{
+    ffi::CString,
+    os::raw::{c_char, c_int},
+    slice,
+  };
+
+  use tvm::{
+    ffi::runtime::{TVMRetValueHandle, TVMValue},
+    runtime::{
+      sgx::{run_worker, SgxStatus},
+      PackedFunc,
+    },
+  };
+
+  macro_rules! tvm_ocall {
+    ($func: expr) => {
+      match $func {
+        0 => Ok(()),
+        err => Err(err),
+      }
+    };
+  }
+
+  const ECALLS: &'static [&'static str] = &["__tvm_run_worker__", "__tvm_main__", "init"];
+
+  lazy_static! {
+    static ref ECALL_FUNCS: Vec<PackedFunc> = {
+      vec![
+        Box::new(run_worker),
+        Box::new(ecall_main),
+        Box::new(ecall_init),
+      ]
+    };
+  }
+
+  extern "C" {
+    fn __tvm_module_startup() -> ();
+    fn tvm_ocall_register_export(name: *const c_char, func_id: c_int) -> SgxStatus;
+  }
+
+  #[no_mangle]
+  pub extern "C" fn tvm_ecall_init(_ret: TVMRetValueHandle) {
+    unsafe {
+      __tvm_module_startup();
+
+      ECALLS.into_iter().enumerate().for_each(|(i, ecall)| {
+        tvm_ocall!(tvm_ocall_register_export(
+          CString::new(*ecall).unwrap().as_ptr(),
+          i as i32
+        )).expect(&format!("Error registering `{}`", ecall));
+      });
+    }
+  }
+
+  #[no_mangle]
+  pub extern "C" fn tvm_ecall_packed_func(
+    func_id: c_int,
+    arg_values: *const TVMValue,
+    type_codes: *const c_int,
+    num_args: c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut i64,
+  ) {
+    let args = unsafe {
+      let values = slice::from_raw_parts(arg_values, num_args as usize);
+      let type_codes = slice::from_raw_parts(type_codes, num_args as usize);
+      values
+        .into_iter()
+        .zip(type_codes.into_iter())
+        .map(|(v, t)| TVMArgValue::new(*v, *t as i64))
+        .collect::<Vec<TVMArgValue>>()
+    };
+    let (rv, tc) = ECALL_FUNCS[func_id as usize](&args).into_tvm_value();
+    unsafe {
+      *ret_val = rv;
+      *ret_type_code = tc;
+    }
+  }
+}
diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py
deleted file mode 100644
index f676f46b7ff0..000000000000
--- a/apps/sgx/prepare_test_libs.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Script to prepare test_addone_sys.o"""
-
-from os import path as osp
-
-import tvm
-
-CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-
-
-def main():
-    out_dir = osp.join(CWD, 'lib')
-
-    n = tvm.var('n')
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
-    s = tvm.create_schedule(B.op)
-    s[B].parallel(s[B].op.axis[0])
-    print(tvm.lower(s, [A, B], simple_mode=True))
-
-    # Compile library in system library mode
-    fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib')
-    fadd_syslib.save(osp.join(out_dir, 'test_addone_sys.o'))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh
index 9334b260cbf3..cc6f22f24e00 100755
--- a/apps/sgx/run_example.sh
+++ b/apps/sgx/run_example.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
 sgx_sdk=${SGX_SDK:=/opt/sgxsdk}
-make
-echo "========================="
-LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python test_addone.py
+LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} make
+printf "\n"
+LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python3 run_model.py
diff --git a/apps/sgx/run_model.py b/apps/sgx/run_model.py
new file mode 100644
index 000000000000..491a5ccbda3c
--- /dev/null
+++ b/apps/sgx/run_model.py
@@ -0,0 +1,20 @@
+import os.path as osp
+import numpy as np
+import tvm
+
+CWD = osp.abspath(osp.dirname(__file__))
+
+
+def main():
+    ctx = tvm.context('cpu', 0)
+    model = tvm.module.load(osp.join(CWD, 'build', 'enclave.signed.so'))
+    out = model()
+    if out == 42:
+        print('It works!')
+    else:
+        print('It doesn\'t work!')
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/sgx/test_addone.py b/apps/sgx/test_addone.py
deleted file mode 100644
index 5ddccfa425cc..000000000000
--- a/apps/sgx/test_addone.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import tvm
-import numpy as np
-
-ctx = tvm.context('cpu', 0)
-fadd1 = tvm.module.load('lib/test_addone.signed.so')
-
-n = 10
-x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
-y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
-fadd1(x, y)
-
-np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
-print("It works!")
diff --git a/cmake/modules/SGX.cmake b/cmake/modules/SGX.cmake
index c9894de11f8b..608d6ff5a4bd 100644
--- a/cmake/modules/SGX.cmake
+++ b/cmake/modules/SGX.cmake
@@ -1,5 +1,4 @@
 if(NOT USE_SGX STREQUAL "OFF")
-  message(STATUS "Build with SGX support")
 
   set(_sgx_src ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/sgx)
   set(_tvm_u_h ${_sgx_src}/untrusted/tvm_u.h)
@@ -9,8 +8,11 @@ if(NOT USE_SGX STREQUAL "OFF")
   set(_sgx_ustdc ${RUST_SGX_SDK}/sgx_ustdc)
 
   set(_urts_lib "sgx_urts")
-  if(SGX_MODE STREQUAL "SIM")
+  if(NOT SGX_MODE STREQUAL "HW")
+    message(STATUS "Build with SGX support (SIM)")
     set(_urts_lib "${_urts_lib}_sim")
+  else()
+    message(STATUS "Build with SGX support (HW)")
   endif()
 
   # build edge routines

From 47fa0f9186ac97b26b6df141fd4c8afbe6d638d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sat, 6 Oct 2018 10:37:48 -0700
Subject: [PATCH 187/529] [RELAY][PASS] Dead Code Elimination (#1776)

---
 include/tvm/relay/pass.h                      |  25 +++-
 include/tvm/runtime/ndarray.h                 |  15 +++
 python/tvm/relay/_ir_pass.pyi                 |   3 +-
 python/tvm/relay/ir_builder.py                |   6 +-
 python/tvm/relay/ir_pass.py                   |  39 +++++-
 python/tvm/relay/ty.py                        |   2 +-
 src/relay/pass/alpha_eq.cc                    | 112 ++++++++++++-----
 src/relay/pass/dead_code.cc                   | 119 ++++++++++++++++++
 src/runtime/ndarray.cc                        |  13 +-
 .../relay/test_dead_code_elimination.py       |  77 ++++++++++++
 ...s_alpha_eq.py => test_pass_alpha_equal.py} |  33 +++--
 tests/python/relay/test_type_infer.py         |  10 +-
 12 files changed, 381 insertions(+), 73 deletions(-)
 create mode 100644 src/relay/pass/dead_code.cc
 create mode 100644 tests/python/relay/test_dead_code_elimination.py
 rename tests/python/relay/{test_pass_alpha_eq.py => test_pass_alpha_equal.py} (89%)

diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 8b2a5fafd8f0..3678aee32850 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -80,7 +80,7 @@ bool AlphaEqual(const Expr& e1, const Expr& e2);
  */
 bool AlphaEqual(const Type& t1, const Type& t2);
 
-/*! brief Check that each Var is only bind once.
+/*! \brief Check that each Var is only bound once.
  *
  * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
  *
@@ -88,9 +88,9 @@ bool AlphaEqual(const Type& t1, const Type& t2);
  *
  * \param e the expression to check.
  *
- * \return true iff all Var in e is bind at most once.
+ * \return true iff all Var in e is bound at most once.
  */
-bool WellFormed(const Expr & e);
+bool WellFormed(const Expr& e);
 
 /*! \brief Get free variables from expression e.
  *
@@ -100,7 +100,7 @@ bool WellFormed(const Expr & e);
  *
  * \return the set of free variable.
  */
-tvm::Array<Var> FreeVariables(const Expr & e);
+tvm::Array<Var> FreeVariables(const Expr& e);
 
 /*! \brief Get free type parameters from expression e.
  *
@@ -110,7 +110,7 @@ tvm::Array<Var> FreeVariables(const Expr & e);
  *
  * \return the set of free type variables.
  */
-tvm::Array<TypeParam> FreeTypeVariables(const Expr & e);
+tvm::Array<TypeParam> FreeTypeVariables(const Expr& e);
 
 /*! \brief Get free type parameters from type t.
  *
@@ -120,7 +120,20 @@ tvm::Array<TypeParam> FreeTypeVariables(const Expr & e);
  *
  * \return the set of free type variables.
  */
-tvm::Array<TypeParam> FreeTypeVariables(const Type & t);
+tvm::Array<TypeParam> FreeTypeVariables(const Type& t);
+
+/*! \brief Remove expressions which does not effect the program result.
+ *
+ * It will remove let binding that are not referenced, and if branch that are not entered.
+ *
+ * For example, this pass should turn `let a = 1 in 2` into `2`, as the value of the expression does not depend on a.
+ * Another example is `if (true) then 1 else 2` will be optimized into 1.
+ *
+ * \param e the expression to optimize.
+ *
+ * \return the optimized expression.
+ */
+Expr DeadCodeElimination(const Expr& e);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 313e0a5c3da8..0fc8e42b8bcb 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -282,6 +282,21 @@ inline void NDArray::reset() {
   }
 }
 
+/*! \brief return the size of data the DLTensor hold, in term of number of bytes
+ *
+ *  \param arr the input DLTensor
+ *
+ *  \return number of  bytes of data in the DLTensor.
+ */
+inline size_t GetDataSize(const DLTensor& arr) {
+  size_t size = 1;
+  for (tvm_index_t i = 0; i < arr.ndim; ++i) {
+    size *= static_cast<size_t>(arr.shape[i]);
+  }
+  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
+  return size;
+}
+
 inline void NDArray::CopyFrom(DLTensor* other) {
   CHECK(data_ != nullptr);
   CopyFromTo(other, &(data_->dl_tensor));
diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi
index f321083aa443..f1432803e9e2 100644
--- a/python/tvm/relay/_ir_pass.pyi
+++ b/python/tvm/relay/_ir_pass.pyi
@@ -4,4 +4,5 @@ from . import ir
 def check_expr(env: Environment, expr: ir.Expr) -> ir.Type: ...
 def generalize(env: Environment, expr: ir.Expr) -> ir.Expr: ...
 def _get_checked_type(expr: ir.Expr) -> ir.Type: ...
-def well_formed(expr: ir.Expr) -> bool: ...
\ No newline at end of file
+def well_formed(expr: ir.Expr) -> bool: ...
+def dead_code_elimination(expr: ir.Expr) -> ir.Expr: ...
\ No newline at end of file
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
index 6e52f209d0c6..accb782659df 100644
--- a/python/tvm/relay/ir_builder.py
+++ b/python/tvm/relay/ir_builder.py
@@ -16,12 +16,12 @@ def _convert_to_value(arg, ctxt=tvm.cpu(0)):
     """Convert Python values into the appropriate types
        for the Relay evaluator.
     """
-    if isinstance(arg, int):
+    if isinstance(arg, bool): # bool is subclass of int
+        return tvm.nd.array(np.array(arg, dtype='uint8'), ctxt)
+    elif isinstance(arg, int):
         return tvm.nd.array(np.array(arg, dtype='int32'), ctxt)
     elif isinstance(arg, float):
         return tvm.nd.array(arg, ctxt)
-    elif isinstance(arg, bool):
-        return tvm.nd.array(np.array(arg, dtype='float32'), ctxt)
     elif isinstance(arg, np.ndarray):
         return tvm.nd.array(arg, ctxt)
     elif isinstance(arg, tvm.ndarray.NDArray):
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 78cc5027c32c..6de6437b9eb9 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -6,15 +6,16 @@
 them in Python.
 """
 from . import _ir_pass
+from . import _make
 # pylint: disable=invalid-name
 
 def infer_type(env, expr):
-    """Infer the type of expr under the context of env
+    """Infer the type of expr under the context of env.
 
     Parameters
     ----------
     env : relay.Environment
-        The global environmemt.
+        The global environment.
 
     expr : relay.Expr
         The input expression.
@@ -34,3 +35,37 @@ def infer_type(env, expr):
 free_vars = _ir_pass.free_vars
 
 free_type_vars = _ir_pass.free_type_vars
+
+def dead_code_elimination(e):
+    """ Remove expressions which does not effect the program result (dead code).
+
+    Parameters
+    ----------
+    e: relay.Expr
+      The input Expression
+
+    Returns
+    -------
+    result: relay.Expr
+      An expression which is semantically equal to the input expression,
+      but with dead code removed.
+    """
+    return _ir_pass.dead_code_elimination(e)
+
+def alpha_equal(lhs, rhs):
+    """Compare two Relay expr for structural equivalence (alpha equivalence).
+
+    Parameters
+    ----------
+    lhs: relay.Expr
+      One of the input Expression.
+    rhs: relay.Expr
+      One of the input Expression.
+
+
+    Returns
+    -------
+    result: bool
+      True iff lhs is alpha equal to rhs.
+    """
+    return bool(_make._alpha_equal(lhs, rhs))
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index c7cf9a346b68..a6ac1857bfa8 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -12,7 +12,7 @@ def __eq__(self, other):
         """Compare two Relay types for structural equivalence using
            alpha equivalence.
         """
-        return bool(_make._type_alpha_eq(self, other))
+        return bool(_make._type_alpha_equal(self, other))
 
     def __ne__(self, other):
         return not self.__eq__(other)
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 39f55af6fe70..3c4c3d78063f 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -1,10 +1,11 @@
 /*!
  *  Copyright (c) 2018 by Contributors
  * \file src/tvm/relay/pass/alpha_eq.cc
- * \brief The structral equivalence comparison.
+ * \brief Check that two type are syntactically equal up to alpha equivalence.
  */
 #include <tvm/ir_pass.h>
 #include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/ndarray.h>
 #include "./type_visitor.h"
 #include "tvm/relay/pass.h"
 
@@ -13,6 +14,25 @@ namespace relay {
 
 using namespace tvm::runtime;
 
+bool SameNDArray(const NDArray& lhs, const NDArray& rhs) {
+  if (lhs.defined() != rhs.defined()) {
+    return false;
+  } else if (lhs.same_as(rhs)) {
+    return true;
+  } else {
+    auto ldt = lhs->dtype;
+    auto rdt = rhs->dtype;
+    CHECK_EQ(lhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    CHECK_EQ(rhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
+      size_t s = GetDataSize(*lhs.operator->());
+      return memcmp(lhs->data, rhs->data, s) == 0;
+    } else {
+      return false;
+    }
+  }
+}
+
 struct TypeAlphaEq : TypeVisitor<const Type&> {
   tvm::Map<TypeParam, TypeParam> eq_map;
   bool equal;
@@ -38,8 +58,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const TensorTypeNode *tt1, const Type& t2) final {
-    if (const TensorTypeNode *tt2 = t2.as<TensorTypeNode>()) {
+  void VisitType_(const TensorTypeNode* tt1, const Type& t2) final {
+    if (const TensorTypeNode* tt2 = t2.as<TensorTypeNode>()) {
       DataTypeEqual(tt1->dtype, tt2->dtype);
       ShapeEqual(tt1->shape, tt2->shape);
     } else {
@@ -47,8 +67,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const IncompleteTypeNode *bt1, const Type& t2) final {
-    if (const IncompleteTypeNode *bt2 = t2.as<IncompleteTypeNode>()) {
+  void VisitType_(const IncompleteTypeNode* bt1, const Type& t2) final {
+    if (const IncompleteTypeNode* bt2 = t2.as<IncompleteTypeNode>()) {
       equal = equal && bt1 == bt2;
       return;
     } else {
@@ -56,8 +76,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const TypeParamNode *ti1, const Type& t2) final {
-    if (const TypeParamNode *ti2 = t2.as<TypeParamNode>()) {
+  void VisitType_(const TypeParamNode* ti1, const Type& t2) final {
+    if (const TypeParamNode* ti2 = t2.as<TypeParamNode>()) {
       auto tid1 = GetRef<TypeParam>(ti1);
       auto tid2 = GetRef<TypeParam>(ti2);
 
@@ -86,8 +106,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const FuncTypeNode *op, const Type& t2) final {
-    if (const FuncTypeNode *ta2 = t2.as<FuncTypeNode>()) {
+  void VisitType_(const FuncTypeNode* op, const Type& t2) final {
+    if (const FuncTypeNode* ta2 = t2.as<FuncTypeNode>()) {
       if (op->arg_types.size() != ta2->arg_types.size()
           || op->type_params.size() != ta2->type_params.size()
           || op->type_constraints.size() != ta2->type_constraints.size()) {
@@ -128,8 +148,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const TypeRelationNode *tr1, const Type& t2) final {
-    if (const TypeRelationNode *tr2 = t2.as<TypeRelationNode>()) {
+  void VisitType_(const TypeRelationNode* tr1, const Type& t2) final {
+    if (const TypeRelationNode* tr2 = t2.as<TypeRelationNode>()) {
       if (tr1->func != tr2->func
           || tr1->num_inputs != tr2->num_inputs
           || tr1->attrs != tr2->attrs) {
@@ -153,8 +173,8 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const TupleTypeNode *op, const Type& t2) final {
-    if (const TupleTypeNode *pt = t2.as<TupleTypeNode>()) {
+  void VisitType_(const TupleTypeNode* op, const Type& t2) final {
+    if (const TupleTypeNode* pt = t2.as<TupleTypeNode>()) {
       if (op->fields.size() != pt->fields.size()) {
         equal = false;
         return;
@@ -185,8 +205,8 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
   bool equal;
   AlphaEq() : eq_map(), equal(true) {}
 
-  void VisitExpr_(const VarNode *e1, const Expr& e2) final {
-    if (const VarNode *id2 = e2.as<VarNode>()) {
+  void VisitExpr_(const VarNode* e1, const Expr& e2) final {
+    if (const VarNode* id2 = e2.as<VarNode>()) {
       auto local1 = GetRef<Var>(e1);
       auto local2 = GetRef<Var>(id2);
       // We handle open terms with this rule assuming variables are identical.
@@ -207,17 +227,17 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const GlobalVarNode *g1, const Expr& e2) final {
-    if (const GlobalVarNode *g2 = e2.as<GlobalVarNode>()) {
+  void VisitExpr_(const GlobalVarNode* g1, const Expr& e2) final {
+    if (const GlobalVarNode* g2 = e2.as<GlobalVarNode>()) {
       equal = equal && g1 == g2;
     } else {
       equal = false;
     }
   }
 
-  void VisitExpr_(const TupleNode *pl1, const Expr& e2) final {
+  void VisitExpr_(const TupleNode* pl1, const Expr& e2) final {
     Tuple prod1 = GetRef<Tuple>(pl1);
-    if (const TupleNode *pl2 = e2.as<TupleNode>()) {
+    if (const TupleNode* pl2 = e2.as<TupleNode>()) {
       Tuple prod2 = GetRef<Tuple>(pl2);
       if (prod1->fields.size() != prod2->fields.size()) {
         equal = false;
@@ -232,8 +252,8 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const ParamNode *p1, const Expr& e2) final {
-    if (const ParamNode *p2 = e2.as<ParamNode>()) {
+  void VisitExpr_(const ParamNode* p1, const Expr& e2) final {
+    if (const ParamNode* p2 = e2.as<ParamNode>()) {
       eq_map.Set(p1->var, p2->var);
       equal = equal && AlphaEqual(p1->type, p2->type);
     } else {
@@ -241,8 +261,8 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const FunctionNode *func1, const Expr& e2) final {
-    if (const FunctionNode *func2 = e2.as<FunctionNode>()) {
+  void VisitExpr_(const FunctionNode* func1, const Expr& e2) final {
+    if (const FunctionNode* func2 = e2.as<FunctionNode>()) {
       if (func1->params.size() != func2->params.size()) {
         equal = false;
         return;
@@ -258,8 +278,8 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const CallNode *op, const Expr& e2) final {
-    if (const CallNode *call = e2.as<CallNode>()) {
+  void VisitExpr_(const CallNode* op, const Expr& e2) final {
+    if (const CallNode* call = e2.as<CallNode>()) {
       this->VisitExpr(op->op, call->op);
 
       if (op->args.size() != call->args.size()) {
@@ -276,8 +296,8 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const LetNode *op, const Expr& e2) final {
-    if (const LetNode *let = e2.as<LetNode>()) {
+  void VisitExpr_(const LetNode* op, const Expr& e2) final {
+    if (const LetNode* let = e2.as<LetNode>()) {
       eq_map.Set(op->var, let->var);
       this->VisitExpr(op->value, let->value);
       this->VisitExpr(op->body, let->body);
@@ -285,6 +305,36 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
       equal = false;
     }
   }
+
+  void VisitExpr_(const IfNode* op, const Expr& e2) final {
+    if (const IfNode* i = e2.as<IfNode>()) {
+      VisitExpr(op->cond, i->cond);
+      VisitExpr(op->true_branch, i->true_branch);
+      VisitExpr(op->false_branch, i->false_branch);
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const OpNode* op, const Expr& e2) final {
+    if (const OpNode* o = e2.as<OpNode>()) {
+      equal = equal && op->name == o->name;
+    } else {
+      equal = false;
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op, const Expr& e2) final {
+    if (const ConstantNode* c = e2.as<ConstantNode>()) {
+      if (AlphaEqual(op->tensor_type(), c->tensor_type())) {
+        equal = equal && SameNDArray(op->data, c->data);
+      } else {
+        equal = false;
+      }
+    } else {
+      equal = false;
+    }
+  }
 };
 
 bool AlphaEqual(const Expr& e1, const Expr& e2) {
@@ -294,15 +344,15 @@ bool AlphaEqual(const Expr& e1, const Expr& e2) {
 }
 
 // TODO(@jroesch): move to correct namespace?
-TVM_REGISTER_API("relay._make._alpha_eq")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
+TVM_REGISTER_API("relay._make._alpha_equal")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
       Expr e1 = args[0];
       Expr e2 = args[1];
       *ret = AlphaEqual(e1, e2);
     });
 
-TVM_REGISTER_API("relay._make._type_alpha_eq")
-    .set_body([](TVMArgs args, TVMRetValue *ret) {
+TVM_REGISTER_API("relay._make._type_alpha_equal")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
       Type t1 = args[0];
       Type t2 = args[1];
       *ret = AlphaEqual(t1, t2);
diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc
new file mode 100644
index 000000000000..05036042a635
--- /dev/null
+++ b/src/relay/pass/dead_code.cc
@@ -0,0 +1,119 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file dead_code.cc
+ *
+ * \brief Remove code that does not effect the program result.
+ *
+ * The algorithm is implemented by two visitor:
+ * CalcDep turn an expr into a dependency graph of expr,
+ * GenLet turn the dependency graph into a let list, taking only the used value.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include "let_list.h"
+
+namespace tvm {
+namespace relay {
+
+bool IsBoolLit(const Expr& e, bool b) {
+  if (const ConstantNode* c = e.as<ConstantNode>()) {
+    if (c->is_scalar()) {
+      auto dt = c->tensor_type()->dtype;
+      if (dt == UInt(8)) {
+        return *reinterpret_cast<const uint8_t*>(c->data->data) == b;
+      } else if (dt == UInt(16)) {
+        return *reinterpret_cast<const uint16_t*>(c->data->data) == b;
+      } else if (dt == UInt(32)) {
+        return *reinterpret_cast<const uint32_t*>(c->data->data) == b;
+      } else if (dt == UInt(64)) {
+        return *reinterpret_cast<const uint64_t*>(c->data->data) == b;
+      } else if (dt == Int(8)) {
+        return *reinterpret_cast<const int8_t*>(c->data->data) == b;
+      } else if (dt == Int(16)) {
+        return *reinterpret_cast<const int16_t*>(c->data->data) == b;
+      } else if (dt == Int(32)) {
+        return *reinterpret_cast<const int32_t*>(c->data->data) == b;
+      } else if (dt == Int(64)) {
+        return *reinterpret_cast<const int64_t*>(c->data->data) == b;
+      }
+    }
+  }
+  return false;
+}
+
+// calculate the dependency graph from expression
+class CalcDep : private ExprMutator {
+ public:
+  static Expr Eliminate(const Expr& e) {
+    CalcDep cd;
+    auto res = cd(e);
+    GenLet gl(cd.var_map_);
+    gl(res);
+    return gl.lets_.Get(res);
+  }
+
+ private:
+  struct Binder {
+    Type t;
+    Expr e;
+    Binder(const Type& t, const Expr& e) : t(t), e(e) { }
+  };
+  using VarMap = std::unordered_map<Var, Binder, NodeHash, NodeEqual>;
+  VarMap var_map_;
+
+  Expr VisitExpr_(const IfNode* i) final {
+    auto cond = VisitExpr(i->cond);
+    if (IsBoolLit(cond, true)) {
+      return Eliminate(i->true_branch);
+    } else if (IsBoolLit(cond, false)) {
+      return Eliminate(i->false_branch);
+    } else {
+      return IfNode::make(cond, Eliminate(i->true_branch), Eliminate(i->false_branch));
+    }
+  }
+
+  Expr VisitExpr_(const LetNode* l) final {
+    var_map_.insert(std::pair<Var, Binder>(l->var,
+                                           Binder(l->value_type,
+                                                  Eliminate(l->value))));
+    return VisitExpr(l->body);
+  }
+
+  Expr VisitExpr_(const FunctionNode* f) final {
+    return FunctionNode::make(f->params, f->ret_type, Eliminate(f->body), f->type_params);
+  }
+
+  // generate the let list from dependency graph
+  class GenLet : private ExprVisitor {
+   private:
+    LetList lets_;
+    VarMap var_map_;
+    explicit GenLet(const VarMap& var_map) : var_map_(var_map) { }
+    friend CalcDep;
+
+    void VisitExpr_(const VarNode* vn) final {
+      Var v = GetRef<Var>(vn);
+      if (var_map_.count(v) != 0) {
+        auto val = var_map_.at(v);
+        var_map_.erase(v);
+        // erase before visit to handle letrec
+        VisitExpr(val.e);
+        // visit before push back so the dependency of dependency is before the dependency
+        lets_.Push(v, val.t, val.e);
+      }
+    }
+  };
+};
+
+Expr DeadCodeElimination(const Expr& e) {
+  return CalcDep::Eliminate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.dead_code_elimination")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = DeadCodeElimination(args[0]);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 04c178f25dfa..574111e39b64 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -25,15 +25,6 @@ inline void VerifyDataType(DLDataType dtype) {
   CHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
 }
 
-inline size_t GetDataSize(const DLTensor& arr) {
-  size_t size = 1;
-  for (tvm_index_t i = 0; i < arr.ndim; ++i) {
-    size *= arr.shape[i];
-  }
-  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
-  return size;
-}
-
 inline size_t GetDataAlignment(const DLTensor& arr) {
   size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
   if (align < kAllocAlignment) return kAllocAlignment;
@@ -129,8 +120,8 @@ DLManagedTensor* NDArray::ToDLPack() const {
 }
 
 NDArray NDArray::Empty(std::vector<int64_t> shape,
-                        DLDataType dtype,
-                        DLContext ctx) {
+                       DLDataType dtype,
+                       DLContext ctx) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
   // setup memory content
   size_t size = GetDataSize(ret.data_->dl_tensor);
diff --git a/tests/python/relay/test_dead_code_elimination.py b/tests/python/relay/test_dead_code_elimination.py
new file mode 100644
index 000000000000..10f60be32f55
--- /dev/null
+++ b/tests/python/relay/test_dead_code_elimination.py
@@ -0,0 +1,77 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import dead_code_elimination, alpha_equal
+from tvm.relay.ir_builder import convert, IRBuilder
+from tvm.relay.op import log, add, equal, subtract, concat
+
+class env:
+    def __init__(self):
+        self.a = relay.Var("a")
+        self.b = relay.Var("b")
+        self.c = relay.Var("c")
+        self.d = relay.Var("d")
+        self.e = relay.Var("e")
+        self.x = relay.Var("x")
+        self.y = relay.Var("y")
+        self.z = relay.Var("z")
+        self.shape = tvm.convert([1, 2, 3])
+        self.tt = relay.TensorType(self.shape, "float32")
+        self.int32 = relay.TensorType([], "int32")
+        self.float32 = relay.TensorType([], "float32")
+        self.one = convert(1.0)
+        self.two = convert(2.0)
+        self.three = convert(3.0)
+
+e = env()
+
+def test_let():
+    orig = relay.Let(e.x, e.y, e.z, e.tt)
+    assert alpha_equal(dead_code_elimination(orig), e.z)
+
+def test_used_let():
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c, e.tt), e.tt)
+    assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.d, e.c, e.tt))
+
+def test_chain_unused_let():
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e, e.tt), e.tt)
+    assert alpha_equal(dead_code_elimination(orig), e.e)
+
+# make sure we dont infinite loop
+def test_recursion():
+    """
+    Program:
+       let f(n: i32, data: f32) -> f32 = {
+          if (n == 0) {
+              return data;
+          } else {
+              return f(n - 1, log(data));
+          }
+       }
+       f(2, 10000);
+    """
+    f = relay.Var("f")
+    n = relay.Var("n")
+    np = relay.Param(n, e.int32)
+    data = relay.Var("data")
+    datap = relay.Param(data, e.float32)
+    funcbody = relay.If(equal(n, convert(0)), data, f(subtract(n, convert(1.0)), log(data)))
+    value = relay.Function([np, datap], e.float32, funcbody, [])
+    orig = relay.Let(f, funcbody, f(convert(2.0), convert(10000.0)), e.float32)
+    assert alpha_equal(dead_code_elimination(orig), orig)
+    assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three, e.float32)), e.three)
+
+def test_op_let():
+    assert alpha_equal(dead_code_elimination(add(relay.Let(e.a, e.one, e.three, e.float32), e.two)), add(e.three, e.two))
+
+def test_if():
+    orig = relay.If(convert(True), e.a, e.b)
+    assert alpha_equal(dead_code_elimination(orig), e.a)
+
+
+if __name__ == "__main__":
+    test_let()
+    test_used_let()
+    test_chain_unused_let()
+    test_recursion()
+    test_op_let()
+    test_if()
diff --git a/tests/python/relay/test_pass_alpha_eq.py b/tests/python/relay/test_pass_alpha_equal.py
similarity index 89%
rename from tests/python/relay/test_pass_alpha_eq.py
rename to tests/python/relay/test_pass_alpha_equal.py
index d925b54d47d2..93f8a8fbc0b3 100644
--- a/tests/python/relay/test_pass_alpha_eq.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -1,8 +1,9 @@
 import tvm
 from tvm import relay
+from tvm.relay.ir_pass import alpha_equal
+from tvm.relay.ir_builder import convert
 
-
-def test_tensor_type_alpha_eq():
+def test_tensor_type_alpha_equal():
     t1 = relay.TensorType((3, 4), "float32")
     t2 = relay.TensorType((3, 4), "float32")
     t3 = relay.TensorType((3, 4, 5), "float32")
@@ -13,8 +14,14 @@ def test_tensor_type_alpha_eq():
     t2 = relay.TensorType((), "float32")
     assert t1 == t2
 
+def test_constant_alpha_equal():
+    x = convert(1)
+    y = convert(2)
+    assert alpha_equal(x, x)
+    assert not alpha_equal(x, y)
+    assert alpha_equal(x, convert(1))
 
-def test_incomplete_type_alpha_eq():
+def test_incomplete_type_alpha_equal():
     t1 = relay.IncompleteType(relay.Kind.Shape)
     t2 = relay.IncompleteType(relay.Kind.Type)
     t3 = relay.IncompleteType(relay.Kind.Type)
@@ -26,7 +33,7 @@ def test_incomplete_type_alpha_eq():
     assert t2 != t3
 
 
-def test_type_param_alpha_eq():
+def test_type_param_alpha_equal():
     t1 = relay.TypeParam("v1", relay.Kind.Type)
     t2 = relay.TypeParam("v2", relay.Kind.Shape)
     t3 = relay.TypeParam("v3", relay.Kind.Type)
@@ -48,7 +55,7 @@ def test_type_param_alpha_eq():
     assert ft1 != ft3 # kinds still do not match
 
 
-def test_func_type_alpha_eq():
+def test_func_type_alpha_equal():
     t1 = relay.TensorType((1, 2), "float32")
     t2 = relay.TensorType((1, 2, 3), "float32")
 
@@ -108,7 +115,7 @@ def test_func_type_alpha_eq():
     assert ft != more_rels
 
 
-def test_tuple_type_alpha_eq():
+def test_tuple_type_alpha_equal():
     t1 = relay.TensorType((1, 2, 3), "float32")
     t2 = relay.TensorType((1, 2, 3, 4), "float32")
     tp1 = relay.TypeParam("v1", relay.Kind.Type)
@@ -126,7 +133,7 @@ def test_tuple_type_alpha_eq():
     assert tup1 != tup4
 
 
-def test_type_relation_alpha_eq():
+def test_type_relation_alpha_equal():
     t1 = relay.TensorType((1, 2), "float32")
     t2 = relay.TensorType((1, 2, 3), "float32")
     t3 = relay.TensorType((1, 2, 3, 4), "float32")
@@ -162,9 +169,9 @@ def test_type_relation_alpha_eq():
 
 
 if __name__ == "__main__":
-    test_tensor_type_alpha_eq()
-    test_incomplete_type_alpha_eq()
-    test_type_param_alpha_eq()
-    test_func_type_alpha_eq()
-    test_tuple_type_alpha_eq()
-    test_type_relation_alpha_eq()
+    test_tensor_type_alpha_equal()
+    test_incomplete_type_alpha_equal()
+    test_type_param_alpha_equal()
+    test_func_type_alpha_equal()
+    test_tuple_type_alpha_equal()
+    test_type_relation_alpha_equal()
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 5b8375580424..97baf701347a 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -120,9 +120,9 @@ def test_recursion():
     Program:
        def f(n: i32, data: f32) -> f32 {
           if (n == 0) {
-              return f(n - 1, log(data));
-          } else {
               return data;
+          } else {
+              return f(n - 1, log(data));
           }
        }
        f(2, 10000);
@@ -133,9 +133,9 @@ def f(n: i32, data: f32) -> f32 {
     data = b.param('data', ty='float32')
     with b.decl(f, n, data):
         with b.if_scope(equal(n, convert(0))):
-            b.ret(f(subtract(n, convert(1)), log(data)))
-        with b.else_scope():
             b.ret(data)
+        with b.else_scope():
+            b.ret(f(subtract(n, convert(1)), log(data)))
     b.ret(f(convert(2.0), convert(10000.0)))
     assert_decl_has_type(b.env, 'f', func_type(
         ['int32', 'float32'], 'float32'))
@@ -160,11 +160,11 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
 
 if __name__ == "__main__":
     test_dual_op()
-
     test_recursion()
     test_monomorphic_let()
     test_single_op()
     test_add_op()
     test_add_broadcast_op()
     test_decl()
+    test_recursion()
     test_concat()

From f7b069348471d47c15ceb375d193e4f4eb82a0ac Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 6 Oct 2018 14:36:17 -0700
Subject: [PATCH 188/529] [RELAY] reorg testcase, make checked_type property,
 fix constructor error handling (#1850)

---
 python/tvm/_ffi/_ctypes/node.py                           | 2 ++
 python/tvm/_ffi/_cython/node.pxi                          | 2 ++
 python/tvm/relay/expr.py                                  | 8 ++++++++
 .../{test_debug_printer.py => test_ir_debug_printer.py}   | 0
 tests/python/relay/test_ir_nodes.py                       | 8 ++++++++
 tests/python/relay/{test_relay_op.py => test_ir_op.py}    | 0
 .../relay/{test_well_formed.py => test_ir_well_formed.py} | 0
 tests/python/relay/test_op_level1.py                      | 4 ++--
 tests/python/relay/test_op_level2.py                      | 6 +++---
 tests/python/relay/test_op_level3.py                      | 2 +-
 tests/python/relay/test_op_level4.py                      | 4 ++--
 .../relay/{test_check_kind.py => test_pass_check_kind.py} | 0
 ..._elimination.py => test_pass_dead_code_elimination.py} | 0
 .../relay/{test_free_vars.py => test_pass_free_vars.py}   | 0
 tests/python/relay/test_type_infer.py                     | 4 ++--
 15 files changed, 30 insertions(+), 10 deletions(-)
 rename tests/python/relay/{test_debug_printer.py => test_ir_debug_printer.py} (100%)
 rename tests/python/relay/{test_relay_op.py => test_ir_op.py} (100%)
 rename tests/python/relay/{test_well_formed.py => test_ir_well_formed.py} (100%)
 rename tests/python/relay/{test_check_kind.py => test_pass_check_kind.py} (100%)
 rename tests/python/relay/{test_dead_code_elimination.py => test_pass_dead_code_elimination.py} (100%)
 rename tests/python/relay/{test_free_vars.py => test_pass_free_vars.py} (100%)

diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index eb9e930b30eb..ccfaa6dd77a2 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -76,6 +76,8 @@ def __init_handle_by_constructor__(self, fconstructor, *args):
         So the return handle is directly set into the Node object
         instead of creating a new Node.
         """
+        # assign handle first to avoid error raising
+        self.handle = None
         handle = __init_by_constructor__(fconstructor, args)
         if not isinstance(handle, NodeHandle):
             handle = NodeHandle(handle)
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index c62e4ab44cef..73ead2b4b447 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -82,6 +82,8 @@ cdef class NodeBase:
         So the return handle is directly set into the Node object
         instead of creating a new Node.
         """
+        # avoid error raised during construction.
+        self.chandle = NULL
         cdef void* chandle
         ConstructorCall(
             (<FunctionBase>fconstructor).chandle,
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 3f90a3af64a5..9b292a74eccd 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -9,7 +9,15 @@
 
 class Expr(NodeBase):
     """The base type for all Relay expressions."""
+    @property
     def checked_type(self):
+        """Get the checked type of relay.
+
+        Returns
+        -------
+        checked_type : relay.Type
+            The checked type.
+        """
         ret = self._checked_type_
         if ret is None:
             raise ValueError("The type checker has not populated"
diff --git a/tests/python/relay/test_debug_printer.py b/tests/python/relay/test_ir_debug_printer.py
similarity index 100%
rename from tests/python/relay/test_debug_printer.py
rename to tests/python/relay/test_ir_debug_printer.py
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index a94e035e2fef..d3dae9b2c3f8 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -3,6 +3,13 @@
 from tvm import relay
 from tvm.expr import *
 
+def test_bad_constructor():
+    try:
+        x = relay.ty.TensorType("xx", "xx")
+    except tvm.TVMError:
+        pass
+
+
 # Span
 def test_span():
     span = relay.Span(None, 1, 1)
@@ -169,6 +176,7 @@ def test_if():
 
 
 if __name__ == "__main__":
+    test_bad_constructor()
     test_span()
     test_tensor_type()
     test_type_param()
diff --git a/tests/python/relay/test_relay_op.py b/tests/python/relay/test_ir_op.py
similarity index 100%
rename from tests/python/relay/test_relay_op.py
rename to tests/python/relay/test_ir_op.py
diff --git a/tests/python/relay/test_well_formed.py b/tests/python/relay/test_ir_well_formed.py
similarity index 100%
rename from tests/python/relay/test_well_formed.py
rename to tests/python/relay/test_ir_well_formed.py
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 9cfca9630561..621d40e79b64 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -11,7 +11,7 @@ def test_expand_dims_infer_type():
         ib.ret(relay.expand_dims(x, axis=2))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type()
+    ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType(
         (n, t, 1, 100), "float32")
 
@@ -27,7 +27,7 @@ def test_unary_op():
             ib.ret(op(x.var))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type()
+        ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((10, 4), "int32")
 
 
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index d5dd64d76555..7182c641248e 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -16,7 +16,7 @@ def test_conv2d_infer_type():
                                channels=2))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type()
+    ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType(
         (n, 2, 224, 224), "float32")
     assert ftype.arg_types[1] == relay.ty.TensorType(
@@ -31,7 +31,7 @@ def test_conv2d_infer_type():
         ib.ret(relay.nn.conv2d(x.var, w.var, out_dtype="int32"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type()
+    ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType(
         (n, 2, 222, 222), "int32")
 
@@ -50,7 +50,7 @@ def test_conv2d_infer_type():
                                out_dtype="int32"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type()
+    ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType(
         (1, 4, 224, 224, 4, 4), "int32")
     assert ftype.arg_types[1] == relay.ty.TensorType(
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 450a7c3458f7..a78685597a59 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -9,5 +9,5 @@ def test_unary_identity():
             ib.ret(op(x.var))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type()
+        ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 72876780f944..dddbf40bd878 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -16,7 +16,7 @@ def test_cmp_type():
             ib.ret(op(x.var, y.var))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type()
+        ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
 
 
@@ -32,7 +32,7 @@ def test_binary_broadcast():
             ib.ret(op(x.var, y.var))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type()
+        ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
 
diff --git a/tests/python/relay/test_check_kind.py b/tests/python/relay/test_pass_check_kind.py
similarity index 100%
rename from tests/python/relay/test_check_kind.py
rename to tests/python/relay/test_pass_check_kind.py
diff --git a/tests/python/relay/test_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
similarity index 100%
rename from tests/python/relay/test_dead_code_elimination.py
rename to tests/python/relay/test_pass_dead_code_elimination.py
diff --git a/tests/python/relay/test_free_vars.py b/tests/python/relay/test_pass_free_vars.py
similarity index 100%
rename from tests/python/relay/test_free_vars.py
rename to tests/python/relay/test_pass_free_vars.py
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 97baf701347a..8d312108f303 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -12,7 +12,7 @@
 
 def assert_has_type(expr, typ, env=Environment({})):
     checked_expr = infer_type(env, expr)
-    checked_type = checked_expr.checked_type()
+    checked_type = checked_expr.checked_type
     if checked_type != typ:
         raise RuntimeError("Type mismatch %s vs %s" % (
             checked_type, typ))
@@ -20,7 +20,7 @@ def assert_has_type(expr, typ, env=Environment({})):
 
 def assert_decl_has_type(env, name, typ):
     func = env[name]
-    assert func.checked_type() == typ
+    assert func.checked_type == typ
 
 
 def test_monomorphic_let():

From 01b9c9ed888ec3272c30b344b858f192b1cf87b2 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Sat, 6 Oct 2018 14:55:12 -0700
Subject: [PATCH 189/529] [SGX] Add ignored files to sgx example (#1852)

---
 apps/sgx/README.md                  | 15 ++++++++++++
 apps/sgx/enclave/build.rs           |  9 +++++++
 apps/sgx/enclave/src/build_model.py | 38 +++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 apps/sgx/enclave/build.rs
 create mode 100644 apps/sgx/enclave/src/build_model.py

diff --git a/apps/sgx/README.md b/apps/sgx/README.md
index dd21cff02f80..7d642422ec6e 100644
--- a/apps/sgx/README.md
+++ b/apps/sgx/README.md
@@ -19,6 +19,21 @@ Check out the `/tvm/install/ubuntu_install_sgx.sh` for the commands to get these
 
 ## Running the example
 
+If using Docker, start by running
+
+```
+git clone https://github.com/dmlc/tvm.git
+docker run --rm -it -v $(pwd)/tvm:/mnt tvmai/ci-cpu /bin/bash
+```
+then, in the container
+```
+cd /mnt
+mkdir build && cd build
+cmake .. -DUSE_LLVM=ON -DUSE_SGX=/opt/sgxsdk -DRUST_SGX_SDK=/opt/rust-sgx-sdk
+make -j4
+cd ../apps/sgx
+```
+
 `bash run_example.sh`
 
 If everything goes well, you should see a lot of build messages and below them
diff --git a/apps/sgx/enclave/build.rs b/apps/sgx/enclave/build.rs
new file mode 100644
index 000000000000..a3beedaacda6
--- /dev/null
+++ b/apps/sgx/enclave/build.rs
@@ -0,0 +1,9 @@
+use std::env;
+
+fn main() {
+  println!(
+    "cargo:rustc-link-search=native={}",
+    env::var("BUILD_DIR").unwrap()
+  );
+  println!("cargo:rustc-link-lib=static=model");
+}
diff --git a/apps/sgx/enclave/src/build_model.py b/apps/sgx/enclave/src/build_model.py
new file mode 100644
index 000000000000..d1b45cc4a4df
--- /dev/null
+++ b/apps/sgx/enclave/src/build_model.py
@@ -0,0 +1,38 @@
+"""Creates a simple TVM modules."""
+
+import argparse
+import os
+from os import path as osp
+
+import nnvm.compiler
+import nnvm.testing
+import tvm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--out-dir', default='.')
+    opts = parser.parse_args()
+
+    # from tutorials/nnvm_quick_start.py
+    dshape = (1, 3, 224, 224)
+    net, params = nnvm.testing.resnet.get_workload(
+        layers=18, batch_size=dshape[0], image_shape=dshape[1:])
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, 'llvm --system-lib', shape={'data': dshape}, params=params)
+
+    build_dir = osp.abspath(opts.out_dir)
+    if not osp.isdir(build_dir):
+        os.makedirs(build_dir, exist_ok=True)
+
+    lib.save(osp.join(build_dir, 'model.bc'))
+    with open(osp.join(build_dir, 'graph.json'), 'w') as f_graph_json:
+        f_graph_json.write(graph.json())
+        with open(osp.join(build_dir, 'params.bin'), 'wb') as f_params:
+            f_params.write(nnvm.compiler.save_param_dict(params))
+
+
+if __name__ == '__main__':
+    main()

From a7e8046252a8ab3539b06b9e40870db49e92864b Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Sat, 6 Oct 2018 15:16:38 -0700
Subject: [PATCH 190/529] [Rust] Update rust install in dockerfile (#1855)

* Update rust docker

* minor edit for consistency
---
 docker/Dockerfile.ci_cpu              | 1 +
 docker/install/ubuntu_install_rust.sh | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 86a633bf8f3c..f05818721f98 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -30,6 +30,7 @@ RUN bash /install/ubuntu_install_rust.sh
 # SGX deps
 COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
 RUN bash /install/ubuntu_install_sgx.sh
+ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH}
 
 
 ENV PATH $PATH:/root/.cargo/bin:/usr/lib/go-1.10/bin
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 1d17b66164c9..836186e8ff96 100644
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -4,6 +4,7 @@ curl -sSo rustup.sh 'https://sh.rustup.rs'
 # rustc nightly-2018-08-25 is the version supported by the above version of rust-sgx-sdk
 bash rustup.sh -y --no-modify-path --default-toolchain nightly-2018-08-25
 . $HOME/.cargo/env
+rustup toolchain add nightly
 rustup component add rust-src
-cargo install rustfmt-nightly --force
-cargo install xargo
+cargo +nightly install rustfmt-nightly --version 0.99.5 --force
+cargo +nightly install xargo

From 0dbc8a98caed9fc5e35b7dd5946eab7d69751dc5 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Sat, 6 Oct 2018 18:03:14 -0700
Subject: [PATCH 191/529] [RELAY] Add softmax (#1841)

---
 docs/langref/relay_op.rst            |  1 +
 include/tvm/relay/attrs/nn.h         | 10 +++++++
 python/tvm/relay/op/nn/nn.py         | 20 +++++++++++++
 src/relay/op/nn/convolution.cc       |  6 ++--
 src/relay/op/nn/nn.cc                | 43 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level1.py | 14 +++++++++
 6 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100644 src/relay/op/nn/nn.cc

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 8566404561b2..c7db9364a72e 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -28,6 +28,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.sigmoid
    tvm.relay.add
    tvm.relay.expand_dims
+   tvm.relay.nn.softmax
 
 **Level 2: Convolutions**
 
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index b364079f06fc..0de0164a562f 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -67,6 +67,16 @@ struct ConvAttrs : public tvm::AttrsNode<ConvAttrs> {
   }
 };
 
+/*! \brief Attributes used in softmax operators */
+struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(SoftmaxAttrs, "relay.attrs.SoftmaxAttrs") {
+      TVM_ATTR_FIELD(axis).set_default(1)
+          .describe("The axis to sum over when computing softmax.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_NN_H_
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index f2d60d48eaad..3b168c6fce21 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -86,3 +86,23 @@ def conv2d(data,
     return _make.conv2d(data, weight, strides, padding, dilation,
                         groups, channels, kernel_size, data_layout,
                         weight_layout, out_layout, out_dtype)
+
+
+def softmax(data, axis):
+    r"""Computes softmax.
+
+    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+    .. note::
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data: relay.Expr
+        The input data to the operator.
+
+    axis: int
+        The axis to sum over when computing softmax
+    """
+
+    return _make.softmax(data, axis)
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index 920fc68d51e8..ba424128640c 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -49,9 +49,9 @@ bool Conv2DRel(const Array<Type>& types,
     CHECK_EQ(param->dilation.size(), 2);
     std::vector<IndexExpr> wshape(
         {param->channels / param->groups,
-              data->shape[1] / param->groups,
-              param->kernel_size[0],
-              param->kernel_size[1]});
+         data->shape[1] / param->groups,
+         param->kernel_size[0],
+         param->kernel_size[1]});
     wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
     wshape[kernel_layout.indexof('O')] *= param->groups;
     channels = param->channels;
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
new file mode 100644
index 000000000000..b34d248d1704
--- /dev/null
+++ b/src/relay/op/nn/nn.cc
@@ -0,0 +1,43 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nn.cc
+ * \brief Property def of nn operators.
+ */
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+
+TVM_REGISTER_API("relay.op.nn._make.softmax")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  auto make_func = [](Expr data, int axis) {
+    auto attrs = make_node<SoftmaxAttrs>();
+    attrs->axis = axis;
+    static const Op& op = Op::Get("nn.softmax");
+    return CallNode::make(op, {data}, Attrs(attrs), {});
+  };
+
+  runtime::detail::unpack_call<Expr, 2>(make_func, args, rv);
+});
+
+RELAY_REGISTER_OP("nn.softmax")
+    .describe(R"code(Softmax layer.
+
+.. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+.. note::
+    This operator can be optimized away for inference.
+
+- **data**: The input data
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 621d40e79b64..654e184e8e23 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -16,6 +16,19 @@ def test_expand_dims_infer_type():
         (n, t, 1, 100), "float32")
 
 
+def test_softmax():
+    ib = relay.ir_builder.IRBuilder()
+    n, d = tvm.var("n"), tvm.var("d")
+    x = ib.param("x", relay.ty.TensorType((n, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.softmax(x, axis=1))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type()
+    assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
+
+
 def test_unary_op():
     for op in [relay.exp,
                relay.log,
@@ -34,3 +47,4 @@ def test_unary_op():
 if __name__ == "__main__":
     test_expand_dims_infer_type()
     test_unary_op()
+    test_softmax()

From 4f57d55cf61ba184fd65ef705e986df4f6995690 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sat, 6 Oct 2018 21:19:28 -0400
Subject: [PATCH 192/529] [Relay][Op] concatenate, reshape, transpose, copy
 (#1847)

---
 docs/langref/relay_op.rst                     |   4 +
 include/tvm/relay/attrs/transform.h           |  29 +++
 python/tvm/relay/op/tensor.py                 |  53 ++--
 python/tvm/relay/op/transform.py              |  90 +++++++
 src/relay/op/op_common.h                      |  30 +++
 src/relay/op/tensor/transform.cc              | 242 +++++++++++++++++-
 src/relay/op/tensor/unary.cc                  |  17 +-
 tests/python/relay/test_op_level1.py          |  39 +++
 tests/python/relay/test_op_level3.py          |  47 ++++
 .../relay/test_pass_dead_code_elimination.py  |   2 +-
 tests/python/relay/test_type_infer.py         |   6 +-
 topi/include/topi/transform.h                 |  16 +-
 12 files changed, 536 insertions(+), 39 deletions(-)
 create mode 100644 src/relay/op/op_common.h

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index c7db9364a72e..c5b0f4ba4f5c 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -28,6 +28,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.sigmoid
    tvm.relay.add
    tvm.relay.expand_dims
+   tvm.relay.concatenate
    tvm.relay.nn.softmax
 
 **Level 2: Convolutions**
@@ -47,6 +48,9 @@ This level enables typical convnet models.
 
    tvm.relay.zeros_like
    tvm.relay.ones_like
+   tvm.relay.reshape
+   tvm.relay.copy
+   tvm.relay.transpose
 
 **Level 4: Broadcast and Reductions**
 
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index b14e8f22722e..d501e6cb7255 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -30,6 +30,35 @@ struct ExpandDimsAttrs : public tvm::AttrsNode<ExpandDimsAttrs> {
   }
 };  // struct ExpandDimsAttrs
 
+/*! \brief Attributes used in concatenate operators */
+struct ConcatenateAttrs : public tvm::AttrsNode<ConcatenateAttrs> {
+  int axis;
+  TVM_DECLARE_ATTRS(ConcatenateAttrs, "relay.attrs.ConcatenateAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis at which the input arrays are concatenated."
+                  "Should lie in range `[-ndim, ndim)`.")
+        .set_default(0);
+  }
+};  // struct ConcatenateAttrs
+
+/*! \brief Attributes used in transpose operators */
+struct TransposeAttrs : public tvm::AttrsNode<TransposeAttrs> {
+  Array<IndexExpr> axes;
+  TVM_DECLARE_ATTRS(TransposeAttrs, "relay.attrs.TransposeAttrs") {
+    TVM_ATTR_FIELD(axes)
+        .describe("The target axes order, reverse order if not specified.");
+  }
+};  // struct TransposeAttrs
+
+/*! \brief Attributes used in reshape operators */
+struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
+  Array<IndexExpr> newshape;
+  TVM_DECLARE_ATTRS(ReshapeAttrs, "relay.attrs.ReshapeAttrs") {
+    TVM_ATTR_FIELD(newshape)
+        .describe("The new shape. Should be compatible with the original shape.");
+  }
+};  // struct ReshapeAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 859bfdc26799..a576c275b7ed 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -300,21 +300,6 @@ def left_shift(lhs, rhs):
     return _make.left_shift(lhs, rhs)
 
 
-def concat(*args):
-    """Concatenate the input tensors along the zero axis.
-
-    Parameters
-    ----------
-    args: list of Tensor
-
-    Returns
-    -------
-    tensor: The concatenated tensor.
-    """
-    tup = Tuple(list(args))
-    return _make.concat(tup)
-
-
 def zeros_like(data):
     """Returns an array of zeros, with same type and shape as the input.
 
@@ -345,3 +330,41 @@ def ones_like(data):
         The computed result.
     """
     return _make.ones_like(data)
+
+def concatenate(data, axis):
+    """Concatenate the input tensors along the given axis.
+
+    Parameters
+    ----------
+    data : Union(List[relay.Expr], Tuple[relay.Expr])
+        A list of tensors.
+    axis : int
+        The axis along which the tensors are concatenated.
+
+    Returns
+    -------
+    result: relay.Expr
+        The concatenated tensor.
+    """
+    data = list(data)
+    if not data:
+        raise ValueError("relay.concatenate requires data to be non-empty.")
+    if not isinstance(axis, int):
+        raise ValueError("For now, we only support integer axis")
+    return _make.concatenate(Tuple(data), axis)
+
+
+def copy(data):
+    """Copy a tensor.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The tensor to be copied.
+
+    Returns
+    -------
+    result: relay.Expr
+        The copied result.
+    """
+    return _make.copy(data)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 21f61735e58a..b530883d006c 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -26,3 +26,93 @@ def expand_dims(data, axis, num_newaxis=1):
         The reshaped result.
     """
     return _make.expand_dims(data, axis, num_newaxis)
+
+
+def transpose(data, axes=None):
+    """Permutes the dimensions of an array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axes : None or List[int]
+        The target axes order, reverse order if not specified.
+
+    Returns
+    -------
+    result : relay.Expr
+        The reshaped result.
+    """
+    axes = axes or []
+    return _make.transpose(data, list(axes))
+
+
+def reshape(data, newshape):
+    """Reshapes the input array.
+
+    Example::
+
+    To give user more convenience in without doing manual shape inference,
+    some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}.
+    The significance of each is explained below:
+
+    - ``0``  copy this dimension from the input to the output shape.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2)
+    - data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4)
+
+    - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
+    keeping the size of the new array same as that of the input array.
+    At most one dimension of shape can be -1.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4)
+    - data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8)
+    - data.shape = (2,3,4), newshape = (-1,), result.shape = (24,)
+
+    - ``-2`` copy all/remainder of the input dimensions to the output shape.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4)
+    - data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4)
+    - data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1)
+
+    - ``-3`` use the product of two consecutive dimensions of the input shape
+    as the output dimension.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4)
+    - data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20)
+    - data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12)
+    - data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4)
+
+    - ``-4`` split one dimension of the input into two dimensions passed subsequent
+    to -4 in shape (can contain -1).
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape =(1,2,3,4)
+    - data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4)
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    newshape : Union[int, Tuple[int], List[int]]
+        The new shape. Should be compatible with the original shape.
+
+    Returns
+    -------
+    result : relay.Expr
+        The reshaped result.
+    """
+    if isinstance(newshape, int):
+        newshape = [newshape]
+    return _make.reshape(data, list(newshape))
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
new file mode 100644
index 000000000000..5bdc91bfd6de
--- /dev/null
+++ b/src/relay/op/op_common.h
@@ -0,0 +1,30 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file op_common.h
+ * \brief A set of utilities and common functionality
+ * for relay ops.
+ */
+#ifndef TVM_RELAY_OP_OP_COMMON_H_
+#define TVM_RELAY_OP_OP_COMMON_H_
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+template<typename T>
+std::vector<T> AsVector(const Array<T> &array) {
+    std::vector<T> result;
+    result.reserve(array.size());
+    for (const T& ele : array) {
+        result.push_back(ele);
+    }
+    return result;
+}
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_OP_COMMON_H_
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 61db1f90ae39..f85fd706a52f 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -5,25 +5,29 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/transform.h>
+#include <tvm/ir_operator.h>
 #include <vector>
+#include "../op_common.h"
 
 
 namespace tvm {
 namespace relay {
 
+/* relay.expand_dims */
+
 TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
 
 bool ExpandDimsRel(const Array<Type>& types,
                    int num_inputs,
                    const Attrs& attrs,
                    const TypeReporter& reporter) {
-  // `types` contains: [data, output]
+  // `types` contains: [data, result]
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
     return false;
   }
-  const ExpandDimsAttrs* param = attrs.as<ExpandDimsAttrs>();
+  const auto* param = attrs.as<ExpandDimsAttrs>();
   const int ndim = static_cast<int>(data->shape.size());
   const int axis = param->axis;
   const int num_newaxis = param->num_newaxis;
@@ -76,6 +80,240 @@ RELAY_REGISTER_OP("expand_dims")
 .set_support_level(1)
 .add_type_rel("ExpandDims", ExpandDimsRel);
 
+/* relay.concatenate */
+
+TVM_REGISTER_NODE_TYPE(ConcatenateAttrs);
+
+bool ConcatenateRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* tensor_tuple = types[0].as<TupleTypeNode>();
+  if (tensor_tuple == nullptr) {
+    return false;
+  }
+  const auto* param = attrs.as<ConcatenateAttrs>();
+  const auto& first = Downcast<TensorType>(tensor_tuple->fields[0]);
+  // Sanity check: ndim and dtype.
+  const int ndim = static_cast<int>(first->shape.size());
+  const DataType dtype = first->dtype;
+  for (const Type& ele : tensor_tuple->fields) {
+    const auto& e = Downcast<TensorType>(ele);
+    int e_ndim = static_cast<int>(e->shape.size());
+    const DataType& e_dtype = e->dtype;
+    CHECK_EQ(e_ndim, ndim) << "relay.concatenate requires all tensors have the same ndim";
+    CHECK_EQ(e_dtype, dtype) << "relay.concatenate requires all tensors have the same dtype";
+  }
+  // Sanity check: axis
+  int axis = param->axis;
+  CHECK(-ndim <= axis && axis < ndim)
+    << "concatenate only accepts `axis` in [-ndim, ndim)"
+    << ", but got axis = " << axis
+    << ", and ndim = " << ndim;
+  axis = axis < 0 ? ndim + axis : axis;
+  // Calculate shape
+  std::vector<IndexExpr>&& oshape = AsVector(first->shape);
+  IndexExpr &concat_dim = oshape[axis];
+  for (int i = 1; i < static_cast<int>(tensor_tuple->fields.size()); ++i) {
+    const auto& e = Downcast<TensorType>(tensor_tuple->fields[i]);
+    concat_dim += e->shape[axis];
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, dtype));
+  return true;
+}
+
+Expr MakeConcatenate(Expr data,
+                     int axis) {
+  auto attrs = make_node<ConcatenateAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("concatenate");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.concatenate")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeConcatenate, args, rv);
+});
+
+RELAY_REGISTER_OP("concatenate")
+.describe(R"code(Concatenate the input tensors along the given axis.
+
+- **data** : A list of tensors.
+
+- **axis** : The axis along which the tensors are concatenated.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input list of tensors.")
+.set_support_level(1)
+.add_type_rel("Concatenate", ConcatenateRel);
+
+/* relay.transpose */
+
+bool TransposeRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* param = attrs.as<TransposeAttrs>();
+  const int ndim = data->shape.size();
+  const Array<IndexExpr>& axes = param->axes;
+  // check dimension match
+  CHECK(axes.empty() || static_cast<int>(axes.size()) == ndim)
+    << "Dimension mismatch: axes has " << axes.size() << " elements"
+    << ", but data.ndim = " << ndim;
+  // construct int_axes
+  std::vector<int> int_axes;
+  int_axes.reserve(ndim);
+  if (axes.empty()) {
+    for (int i = ndim - 1; i >= 0; --i) {
+      int_axes.push_back(i);
+    }
+  } else {
+    std::vector<int> axis_used(ndim, 0);
+    for (const IndexExpr& e : axes) {
+      const int64_t *axis_ptr = as_const_int(e);
+      CHECK(axis_ptr != nullptr);
+      int axis = *axis_ptr;
+      // sanity check for axis and ndim
+      CHECK(-ndim <= axis && axis < ndim)
+        << "transpose only allows each `axis` in `axes` in range [-data.ndim, data.ndim)"
+        << ", but got axis = " << axis
+        << ", and data.ndim = " << ndim;
+      axis = axis < 0 ? axis + ndim : axis;
+      // sanity check for duplication
+      CHECK(!axis_used[axis]) << "Duplicate axes in transpose: " << axis;
+      axis_used[axis] = 1;
+      int_axes.push_back(axis);
+    }
+  }
+  std::vector<IndexExpr> oshape;
+  oshape.reserve(ndim);
+  for (int axis : int_axes) {
+    oshape.push_back(data->shape[axis]);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeTranspose(Expr data,
+                   Array<IndexExpr> axes) {
+  auto attrs = make_node<TransposeAttrs>();
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("transpose");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.transpose")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeTranspose, args, rv);
+});
+
+RELAY_REGISTER_OP("transpose")
+.describe(R"code(Permutes the dimensions of an array.
+
+- **data**: The input data to the operator.
+
+- **axes**: The target axes order, reverse order if not specified.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Transpose", TransposeRel);
+
+/* relay.reshape */
+
+bool ReshapeRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* param = attrs.as<ReshapeAttrs>();
+  reporter->Assign(types[1], TensorTypeNode::make(param->newshape, data->dtype));
+  return true;
+}
+
+Expr MakeReshape(Expr data,
+                 Array<IndexExpr> newshape) {
+  auto attrs = make_node<ReshapeAttrs>();
+  attrs->newshape = std::move(newshape);
+  static const Op& op = Op::Get("reshape");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.reshape")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeReshape, args, rv);
+});
+
+RELAY_REGISTER_OP("reshape")
+.describe(R"code(Reshapes the input array.
+
+Example::
+
+To give user more convenience in without doing manual shape inference,
+some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}.
+The significance of each is explained below:
+
+- ``0``  copy this dimension from the input to the output shape.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2)
+- data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4)
+
+- ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
+keeping the size of the new array same as that of the input array.
+At most one dimension of shape can be -1.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4)
+- data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8)
+- data.shape = (2,3,4), newshape = (-1,), result.shape = (24,)
+
+- ``-2`` copy all/remainder of the input dimensions to the output shape.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4)
+- data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4)
+- data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1)
+
+- ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4)
+- data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20)
+- data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12)
+- data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4)
+
+- ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1).
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape =(1,2,3,4)
+- data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4)
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Reshape", ReshapeRel);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index cfcc14e4276f..9de4975de790 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -82,18 +82,11 @@ RELAY_REGISTER_UNARY_OP("sigmoid")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-// Concat
-TVM_REGISTER_API("relay.op._make.concat")
-  .set_body_typed<Expr(Expr)>([](Expr tuple) {
-      static const Op& op = Op::Get("concat");
-    return CallNode::make(op, { tuple }, Attrs(), {});
-  });
-
-RELAY_REGISTER_OP("concat")
-.set_num_inputs(1)
-.add_argument("tuple", "Tuple", "The tupled tensor arguments.")
-.set_support_level(1)
-.add_type_rel("Concat", ConcatRel);
+RELAY_REGISTER_UNARY_OP("copy")
+.describe(R"code(Copy a tensor.
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 654e184e8e23..61ac95ed8dc5 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -44,7 +44,46 @@ def test_unary_op():
         assert ftype.ret_type == relay.TensorType((10, 4), "int32")
 
 
+def test_concatenate_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x, y) as func:
+        ib.ret(relay.concatenate((x, y), axis=-1))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t, 200), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x, y) as func:
+        ib.ret(relay.concatenate((x, y), axis=2))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t, 200), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x, y) as func:
+        ib.ret(relay.concatenate((x, y), axis=1))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t + t, 100), "float32")
+
+
 if __name__ == "__main__":
     test_expand_dims_infer_type()
     test_unary_op()
+    test_concatenate_infer_type()
     test_softmax()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index a78685597a59..ecd9d071e671 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1,6 +1,7 @@
 import tvm
 from tvm import relay
 
+
 def test_unary_identity():
     for op in [relay.zeros_like, relay.ones_like]:
         ib = relay.ir_builder.IRBuilder()
@@ -11,3 +12,49 @@ def test_unary_identity():
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")
+
+
+def test_copy_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.copy(x))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t, 100), "float32")
+
+
+def test_transpose_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.transpose(x, axes=(1, 0, 2)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (t, n, 100), "float32")
+
+
+def test_reshape_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d1, d2 = tvm.var("n"), tvm.var("t"), 100, 20
+    x = ib.param("x", relay.ty.TensorType((n, t, d1, d2), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.reshape(x, newshape=(n, t, 2000)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, t, 2000), "float32")
+
+
+if __name__ == "__main__":
+    test_unary_identity()
+    test_copy_infer_type()
+    test_transpose_infer_type()
+    test_reshape_infer_type()
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index 10f60be32f55..db73fb5c585f 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -2,7 +2,7 @@
 from tvm import relay
 from tvm.relay.ir_pass import dead_code_elimination, alpha_equal
 from tvm.relay.ir_builder import convert, IRBuilder
-from tvm.relay.op import log, add, equal, subtract, concat
+from tvm.relay.op import log, add, equal, subtract
 
 class env:
     def __init__(self):
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 8d312108f303..dfed126e6ab1 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -7,7 +7,7 @@
 from tvm.relay.ir_builder import IRBuilder, func_type
 from tvm.relay.ir_builder import scalar_type, convert, tensor_type
 from tvm.relay.env import Environment
-from tvm.relay.op import log, add, equal, subtract, concat
+from tvm.relay.op import log, add, equal, subtract, concatenate
 from tvm.relay.expr import Function
 
 def assert_has_type(expr, typ, env=Environment({})):
@@ -146,7 +146,7 @@ def test_concat():
     """
     Program:
         def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
-            return concat(x, y);
+            return concatenate((x, y), axis=0);
         }
     """
     ib = IRBuilder()
@@ -154,7 +154,7 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     x = ib.param('x', ty=tensor_type(3, 2))
     y = ib.param('y', ty=tensor_type(2, 2))
     with ib.decl(try_concat2, x, y):
-        ib.ret(concat(x, y))
+        ib.ret(concatenate((x, y), axis=0))
     fn_ty = func_type([tensor_type(3, 2), tensor_type(2, 2)], tensor_type(5, 2))
     assert_decl_has_type(ib.env, try_concat2, fn_ty)
 
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index e4e646453cca..756aa2ec3b49 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -38,10 +38,6 @@ inline Tensor expand_dims(const Tensor& x,
                           std::string name = "tensor",
                           std::string tag = kBroadcast) {
   int ndim = static_cast<int>(x->shape.size());
-  if (axis < 0) {
-    // Calculate offset from last dimension
-    axis = ndim + axis + 1;
-  }
   CHECK(-ndim - 1 <= axis && axis <= ndim)
     << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]"
     << ", but got axis = " << axis
@@ -49,7 +45,10 @@ inline Tensor expand_dims(const Tensor& x,
   CHECK(num_newaxis >= 0)
     << "expand_dims only accepts `num_newaxis >= 0`"
     << ", but got num_newaxis = " << num_newaxis;
-
+  if (axis < 0) {
+    // Calculate offset from last dimension
+    axis = ndim + axis + 1;
+  }
   Array<Expr> new_shape;
   for (size_t i = 0; i < static_cast<size_t>(axis); ++i) {
     new_shape.push_back(x->shape[i]);
@@ -265,8 +264,13 @@ inline Tensor concatenate(const Array<Tensor>& inputs,
                           int axis = 0,
                           std::string name = "tensor",
                           std::string tag = kInjective) {
+  int ndim = static_cast<int>(inputs[0]->shape.size());
+  CHECK(-ndim <= axis && axis < ndim)
+    << "concatenate only accepts `axis` in [-ndim, ndim)"
+    << ", but got axis = " << axis
+    << ", and ndim = " << ndim;
   if (axis < 0) {
-    axis += static_cast<int>(inputs[0]->shape.size());
+    axis += ndim;
   }
   CHECK_LT(axis, inputs[0]->shape.size()) <<
     "axis out of bounds";

From ffa5a8287f004ce41c978b4c5649ec5d21a118a4 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 6 Oct 2018 22:17:16 -0700
Subject: [PATCH 193/529] Update test_op_level1.py

---
 tests/python/relay/test_op_level1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 61ac95ed8dc5..c7f8aa5ef63c 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -25,7 +25,7 @@ def test_softmax():
     ib.ret(func)
 
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type()
+    ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
 
 

From 2899d0aa5d748d3ac8c791ddcf8fe739fe64cbdf Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 6 Oct 2018 22:17:32 -0700
Subject: [PATCH 194/529] Enable bool type as storage type (#1853)

---
 include/tvm/expr.h                         |  2 +
 include/tvm/runtime/packed_func.h          | 12 ++++-
 python/tvm/_ffi/runtime_ctypes.py          |  9 ++++
 src/codegen/codegen_cuda.cc                |  2 +
 src/codegen/codegen_metal.cc               |  3 ++
 src/codegen/codegen_opencl.cc              |  3 ++
 src/codegen/spirv/ir_builder.cc            | 21 +++++++-
 src/lang/buffer.cc                         | 27 ++++++++--
 src/pass/storage_flatten.cc                | 10 +++-
 src/runtime/builtin_fp16.cc                |  5 +-
 src/runtime/ndarray.cc                     |  2 +
 tests/python/unittest/test_codegen_bool.py | 58 ++++++++++++++++++++++
 tests/python/unittest/test_lang_basic.py   |  2 +-
 13 files changed, 144 insertions(+), 12 deletions(-)
 create mode 100644 tests/python/unittest/test_codegen_bool.py

diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index e41f5f28d35b..7fdca7f6af8e 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -56,6 +56,8 @@ inline TVMType Type2TVMType(Type t) {
 // Get number of bytes considering vector type.
 inline int GetVectorBytes(Type dtype) {
   int data_bits = dtype.bits() * dtype.lanes();
+  // allow bool to exist
+  if (dtype == Bool()) return 1;
   CHECK_EQ(data_bits % 8, 0U)
       << "Need to load/store by multiple of bytes";
   return data_bits / 8;
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index d204f8624a64..a8fa096e51c4 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -873,6 +873,9 @@ inline const char* TypeCode2Str(int type_code) {
 
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
 inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
+  if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) {
+    os << "bool"; return os;
+  }
   os << TypeCode2Str(t.code);
   if (t.code == kHandle) return os;
   os << static_cast<int>(t.bits);
@@ -890,7 +893,9 @@ inline std::string TVMType2String(TVMType t) {
   os << t;
   return os.str();
 #else
-  std::string repr = "";
+  if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) {
+    return "bool";
+  }
   repr += TypeCode2Str(t.code);
   if (t.code == kHandle) return repr;
   repr += std::to_string(static_cast<int>(t.bits));
@@ -920,6 +925,11 @@ inline TVMType String2TVMType(std::string s) {
     t.code = kHandle;
     t.bits = 64;  // handle uses 64 bit by default.
     scan = s.c_str() + 6;
+  } else if (s == "bool") {
+    t.code = kDLUInt;
+    t.bits = 1;
+    t.lanes = 1;
+    return t;
   } else {
     scan = s.c_str();
     LOG(FATAL) << "unknown type " << s;
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 2aced1aef7d2..b17487559e50 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -48,6 +48,13 @@ def __init__(self, type_str):
         super(TVMType, self).__init__()
         if isinstance(type_str, np.dtype):
             type_str = str(type_str)
+
+        if type_str == "bool":
+            self.bits = 1
+            self.type_code = 1
+            self.lanes = 1
+            return
+
         arr = type_str.split("x")
         head = arr[0]
         self.lanes = int(arr[1]) if len(arr) > 1 else 1
@@ -73,6 +80,8 @@ def __init__(self, type_str):
 
 
     def __repr__(self):
+        if self.bits == 1 and self.lanes == 1:
+            return "bool"
         x = "%s%d" % (TVMType.CODE2STR[self.type_code], self.bits)
         if self.lanes != 1:
             x += "x%d" % self.lanes
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 0960106ae471..2ed8d8e3ff78 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -77,6 +77,8 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes; return;
     }
+  } else if (t == Bool()) {
+    os << "bool"; return;
   } else if (t.is_uint() || t.is_int()) {
     if (t.is_uint()) {
       if (t.lanes() != 1) {
diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc
index 3bbe98289439..031313190370 100644
--- a/src/codegen/codegen_metal.cc
+++ b/src/codegen/codegen_metal.cc
@@ -141,6 +141,9 @@ void CodeGenMetal::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
         << "do not yet support vector types";
     os << "void*"; return;
   }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
   bool fail = false;
   if (t.is_float()) {
     switch (t.bits()) {
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 3d3de5e3bcf4..a0b3c2000a80 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -80,6 +80,9 @@ void CodeGenOpenCL::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
         << "do not yet support vector types";
     os << "void*"; return;
   }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
   bool fail = false;
   if (t.is_float()) {
     switch (t.bits()) {
diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc
index 41cb48c5854b..fdf4b9852430 100644
--- a/src/codegen/spirv/ir_builder.cc
+++ b/src/codegen/spirv/ir_builder.cc
@@ -438,8 +438,25 @@ Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) {
   const tvm::Type& from = value.stype.type;
   const tvm::Type& to = dst_type.type;
   CHECK_EQ(from.lanes(), to.lanes());
-
-  if (from.is_int() && to.is_int()) {
+  if (from == Bool()) {
+    if (to.is_int()) {
+      return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0));
+    } else if (to.is_uint()) {
+      return Select(value, UIntImm(dst_type, 1), UIntImm(dst_type, 0));
+    } else {
+      LOG(FATAL) << "cannot cast from " << from << " to " << to;
+      return Value();
+    }
+  } else if (to == Bool()) {
+    if (from.is_int()) {
+      return NE(value, IntImm(value.stype, 0));
+    } else if (to.is_uint()) {
+      return NE(value, UIntImm(value.stype, 0));
+    } else {
+      LOG(FATAL) << "cannot cast from " << from << " to " << to;
+      return Value();
+    }
+  } else if (from.is_int() && to.is_int()) {
     return MakeValue(spv::OpSConvert, dst_type, value);
   } else if (from.is_uint() && to.is_uint()) {
     return MakeValue(spv::OpUConvert, dst_type, value);
diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc
index 69967c55a7ff..183a52f785bd 100644
--- a/src/lang/buffer.cc
+++ b/src/lang/buffer.cc
@@ -260,25 +260,42 @@ inline Expr BufferOffset(const BufferNode* n, Array<Expr> index, Type dtype) {
 }
 
 Expr Buffer::vload(Array<Expr> begin, Type dtype) const {
+  // specially handle bool, stored as Int(8)
   const BufferNode* n = operator->();
   CHECK(dtype.element_of() == n->dtype.element_of() &&
         dtype.lanes() % n->dtype.lanes() == 0)
       << "Cannot load " << dtype
       << " from buffer of " << n->dtype;
-  return ir::Load::make(
-      dtype, n->data, BufferOffset(n, begin, dtype),
-      const_true(dtype.lanes()));
+  if (dtype == Bool()) {
+    return ir::Cast::make(
+        Bool(),
+        ir::Load::make(
+            Int(8), n->data, BufferOffset(n, begin, Int(8)),
+            const_true()));
+  } else {
+    return ir::Load::make(
+        dtype, n->data, BufferOffset(n, begin, dtype),
+        const_true(dtype.lanes()));
+  }
 }
 
 Stmt Buffer::vstore(Array<Expr> begin, Expr value) const {
+  // specially handle bool, stored as Int(8)
   const BufferNode* n = operator->();
   Type dtype = value.type();
   CHECK(dtype.element_of() == n->dtype.element_of() &&
         dtype.lanes() % n->dtype.lanes() == 0)
       << "Cannot load " << dtype
       << " from buffer of " << n->dtype;
-  return ir::Store::make(n->data, value, BufferOffset(n, begin, dtype),
-                         const_true(dtype.lanes()));
+  if (value.type() == Bool()) {
+    return ir::Store::make(n->data,
+                           ir::Cast::make(Int(8), value),
+                           BufferOffset(n, begin, Int(8)),
+                           const_true());
+  } else {
+    return ir::Store::make(n->data, value, BufferOffset(n, begin, dtype),
+                           const_true(dtype.lanes()));
+  }
 }
 
 Buffer Buffer::MakeStrideView() const {
diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index 28a6ace9bfa6..993f6294e15b 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -191,10 +191,16 @@ class StorageFlattener : public IRMutator {
       buf_map_[key].released = true;
       Stmt ret;
 
+      Type storage_type = e.buffer->dtype;
+      // specially handle bool, lower its storage
+      // type to be Int(8)(byte)
+      if (storage_type == Bool()) {
+        storage_type = Int(8);
+      }
       if (strides.size() != 0) {
         int first_dim = 0;
         ret = Allocate::make(
-            e.buffer->data, e.buffer->dtype,
+            e.buffer->data, storage_type,
             {arith::ComputeExpr<Mul>(e.buffer->strides[first_dim], e.buffer->shape[first_dim])},
             make_const(Bool(e.buffer->dtype.lanes()), true), body);
       } else {
@@ -203,7 +209,7 @@ class StorageFlattener : public IRMutator {
           shape.push_back(make_const(Int(32), 1));
         }
         ret = Allocate::make(
-            e.buffer->data, e.buffer->dtype, shape,
+            e.buffer->data, storage_type, shape,
             make_const(Bool(e.buffer->dtype.lanes()), true), body);
       }
       ret = AttrStmt::make(
diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
index 79c3cc474269..c920c9571f38 100644
--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
@@ -3,12 +3,14 @@
  * \file builtin_fp16.cc
  * \brief Functions for conversion between fp32 and fp16
 */
-
 #include <builtin_fp16.h>
 #include <tvm/runtime/c_runtime_api.h>
 
 extern "C" {
 
+// disable under msvc
+#ifndef _MSC_VER
+
 TVM_WEAK uint16_t __gnu_f2h_ieee(float a) {
   return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
 }
@@ -17,4 +19,5 @@ TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
   return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
 }
 
+#endif
 }
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 574111e39b64..0ffa4c174544 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -20,6 +20,8 @@ inline void VerifyDataType(DLDataType dtype) {
   if (dtype.code == kDLFloat) {
     CHECK_EQ(dtype.bits % 8, 0);
   } else {
+    // allow uint1 as a special flag for bool.
+    if (dtype.bits == 1 && dtype.code == kDLUInt) return;
     CHECK_EQ(dtype.bits % 8, 0);
   }
   CHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
diff --git a/tests/python/unittest/test_codegen_bool.py b/tests/python/unittest/test_codegen_bool.py
new file mode 100644
index 000000000000..e2592c416345
--- /dev/null
+++ b/tests/python/unittest/test_codegen_bool.py
@@ -0,0 +1,58 @@
+"""codegen related to bool types"""
+
+import tvm
+import numpy as np
+
+def test_cmp_load_store():
+    n = 32
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C')
+    D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), A(*i) > 1), name="D")
+
+
+    def check_llvm():
+        if not tvm.module.enabled("llvm"):
+            return
+        s = tvm.create_schedule(D.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        xo1, xo2 = s[C].split(xo, factor=13)
+        s[C].parallel(xo2)
+        # BUILD and invoke the kernel.
+        f = tvm.build(s, [A, B, D], "llvm")
+        ctx = tvm.cpu(0)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
+        f(a, b, d)
+        np.testing.assert_equal(
+            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            return
+        s = tvm.create_schedule(D.op)
+        for stage in [C, D]:
+            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
+            s[stage].bind(xo, tvm.thread_axis("blockIdx.x"))
+            s[stage].bind(xi, tvm.thread_axis("threadIdx.x"))
+        f = tvm.build(s, [A, B, D], device)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
+        f(a, b, d)
+        np.testing.assert_equal(
+            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))
+
+
+    check_llvm()
+    for device in ["vulkan", "opencl", "cuda", "rocm", "metal"]:
+        check_device(device)
+
+
+
+if __name__ == "__main__":
+    test_cmp_load_store()
diff --git a/tests/python/unittest/test_lang_basic.py b/tests/python/unittest/test_lang_basic.py
index bf25ca3dfc85..079123d96ca0 100644
--- a/tests/python/unittest/test_lang_basic.py
+++ b/tests/python/unittest/test_lang_basic.py
@@ -79,7 +79,7 @@ def test_dtype():
     x = tvm.var('x')
     assert x.dtype == 'int32'
     y = tvm.var('y')
-    assert (x > y).dtype == 'uint1'
+    assert (x > y).dtype == 'bool'
 
 
 def test_any():

From 5078e8e7619bd8a866726e50a62ebdabcd628da5 Mon Sep 17 00:00:00 2001
From: nhynes <nhynes@berkeley.edu>
Date: Sat, 6 Oct 2018 22:17:50 -0700
Subject: [PATCH 195/529] Install rust for all users (#1856)

---
 docker/Dockerfile.ci_cpu              |  4 +++-
 docker/install/ubuntu_install_rust.sh | 13 +++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index f05818721f98..2b72b6eea6e5 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -26,6 +26,8 @@ RUN bash /install/ubuntu_install_golang.sh
 # Rust env
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
+ENV RUSTUP_HOME /opt/rust
+ENV CARGO_HOME /opt/rust
 
 # SGX deps
 COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
@@ -33,4 +35,4 @@ RUN bash /install/ubuntu_install_sgx.sh
 ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH}
 
 
-ENV PATH $PATH:/root/.cargo/bin:/usr/lib/go-1.10/bin
+ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 836186e8ff96..6ca56acadf74 100644
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -1,10 +1,15 @@
 apt-get update && apt-get install -y --no-install-recommends --force-yes curl
 
-curl -sSo rustup.sh 'https://sh.rustup.rs'
-# rustc nightly-2018-08-25 is the version supported by the above version of rust-sgx-sdk
-bash rustup.sh -y --no-modify-path --default-toolchain nightly-2018-08-25
-. $HOME/.cargo/env
+export RUSTUP_HOME=/opt/rust
+export CARGO_HOME=/opt/rust
+# rustc nightly-2018-08-25 is the version supported by the rust-sgx-sdk
+curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-08-25
+. $CARGO_HOME/env
 rustup toolchain add nightly
 rustup component add rust-src
 cargo +nightly install rustfmt-nightly --version 0.99.5 --force
 cargo +nightly install xargo
+
+# make rust usable by all users
+chmod a+w /opt/rust
+sudo find /opt/rust -type d -exec chmod a+w {} \;

From 4fd36042d78c60e6a814a485d97047db1a554ba5 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 7 Oct 2018 11:14:46 +0530
Subject: [PATCH 196/529] [RELAY][OP] Operators.  	 pool2d,
 global_pool2d, batch_flatten, tanh, sigmoid, floor, ceil, trunc, abs,
 negative, multiply, mod, pow,  resize (#1813)

---
 docs/langref/relay_op.rst              |  59 ++++++
 include/tvm/relay/attrs/image.h        |  41 ++++
 include/tvm/relay/attrs/nn.h           |  96 +++++++++
 include/tvm/relay/attrs/vision.h       |  17 ++
 python/tvm/relay/__init__.py           |   6 +-
 python/tvm/relay/image.py              |   4 +
 python/tvm/relay/op/__init__.py        |   7 +-
 python/tvm/relay/op/image/__init__.py  |   4 +
 python/tvm/relay/op/image/_make.py     |   4 +
 python/tvm/relay/op/image/image.py     |  42 ++++
 python/tvm/relay/op/nn/nn.py           | 236 +++++++++++++++++++++
 python/tvm/relay/op/tensor.py          | 188 ++++++++++++++++-
 python/tvm/relay/op/vision/__init__.py |   3 +
 python/tvm/relay/op/vision/_make.py    |   4 +
 python/tvm/relay/vision.py             |   4 +
 src/relay/op/image/resize.cc           |  87 ++++++++
 src/relay/op/nn/nn.cc                  |  66 +++++-
 src/relay/op/nn/pooling.cc             | 270 +++++++++++++++++++++++++
 src/relay/op/nn/upsampling.cc          |  87 ++++++++
 src/relay/op/tensor/binary.cc          |  17 +-
 src/relay/op/tensor/unary.cc           |  80 +++++++-
 tests/python/relay/test_ir_op.py       |  11 +-
 tests/python/relay/test_op_level1.py   |  79 +++++++-
 tests/python/relay/test_op_level2.py   | 108 +++++++++-
 tests/python/relay/test_op_level3.py   |  28 ++-
 tests/python/relay/test_op_level4.py   |  91 +++++++++
 tests/python/relay/test_op_level5.py   |  29 +++
 tests/python/relay/test_type_infer.py  |  51 -----
 28 files changed, 1648 insertions(+), 71 deletions(-)
 create mode 100644 include/tvm/relay/attrs/image.h
 create mode 100644 include/tvm/relay/attrs/vision.h
 create mode 100644 python/tvm/relay/image.py
 create mode 100644 python/tvm/relay/op/image/__init__.py
 create mode 100644 python/tvm/relay/op/image/_make.py
 create mode 100644 python/tvm/relay/op/image/image.py
 create mode 100644 python/tvm/relay/op/vision/__init__.py
 create mode 100644 python/tvm/relay/op/vision/_make.py
 create mode 100644 python/tvm/relay/vision.py
 create mode 100644 src/relay/op/image/resize.cc
 create mode 100644 src/relay/op/nn/pooling.cc
 create mode 100644 src/relay/op/nn/upsampling.cc
 create mode 100644 tests/python/relay/test_op_level5.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index c5b0f4ba4f5c..deafaa99d645 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -30,6 +30,13 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.expand_dims
    tvm.relay.concatenate
    tvm.relay.nn.softmax
+   tvm.relay.subtract
+   tvm.relay.multiply
+   tvm.relay.divide
+   tvm.relay.mod
+   tvm.relay.tanh
+   tvm.relay.sigmoid
+
 
 **Level 2: Convolutions**
 
@@ -39,10 +46,18 @@ This level enables typical convnet models.
    :nosignatures:
 
    tvm.relay.nn.conv2d
+   tvm.relay.nn.max_pool2d
+   tvm.relay.nn.avg_pool2d
+   tvm.relay.nn.global_max_pool2d
+   tvm.relay.nn.global_avg_pool2d
+   tvm.relay.nn.upsampling
+   tvm.relay.nn.batch_flatten
 
 
 **Level 3: Additional Math And Transform Operators**
 
+This level enables additional math and transform operators.
+
 .. autosummary::
    :nosignatures:
 
@@ -51,6 +66,13 @@ This level enables typical convnet models.
    tvm.relay.reshape
    tvm.relay.copy
    tvm.relay.transpose
+   tvm.relay.floor
+   tvm.relay.ceil
+   tvm.relay.trunc
+   tvm.relay.round
+   tvm.relay.abs
+   tvm.relay.negative
+
 
 **Level 4: Broadcast and Reductions**
 
@@ -67,9 +89,15 @@ This level enables typical convnet models.
    tvm.relay.less_equal
    tvm.relay.maximum
    tvm.relay.minimum
+   tvm.relay.pow
 
 **Level 5: Vision/Image Operators**
 
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.image.resize
+
 
 Level 1 Definitions
 -------------------
@@ -78,12 +106,38 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.exp
 .. autofunction:: tvm.relay.sigmoid
 .. autofunction:: tvm.relay.add
+.. autofunction:: tvm.relay.subtract
+.. autofunction:: tvm.relay.multiply
+.. autofunction:: tvm.relay.divide
+.. autofunction:: tvm.relay.mod
+.. autofunction:: tvm.relay.tanh
+.. autofunction:: tvm.relay.sigmoid
+.. autofunction:: tvm.relay.concatenate
+.. autofunction:: tvm.relay.nn.softmax
 
 
 Level 2 Definitions
 -------------------
 .. autofunction:: tvm.relay.nn.conv2d
+.. autofunction:: tvm.relay.nn.max_pool2d
+.. autofunction:: tvm.relay.nn.avg_pool2d
+.. autofunction:: tvm.relay.nn.global_max_pool2d
+.. autofunction:: tvm.relay.nn.global_avg_pool2d
+.. autofunction:: tvm.relay.nn.upsampling
+.. autofunction:: tvm.relay.nn.batch_flatten
+
 
+Level 3 Definitions
+-------------------
+.. autofunction:: tvm.relay.floor
+.. autofunction:: tvm.relay.ceil
+.. autofunction:: tvm.relay.trunc
+.. autofunction:: tvm.relay.round
+.. autofunction:: tvm.relay.abs
+.. autofunction:: tvm.relay.negative
+.. autofunction:: tvm.relay.reshape
+.. autofunction:: tvm.relay.copy
+.. autofunction:: tvm.relay.transpose
 
 Level 4 Definitions
 -------------------
@@ -97,3 +151,8 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.less_equal
 .. autofunction:: tvm.relay.maximum
 .. autofunction:: tvm.relay.minimum
+.. autofunction:: tvm.relay.pow
+
+Level 5 Definitions
+-------------------
+.. autofunction:: tvm.relay.image.resize
diff --git a/include/tvm/relay/attrs/image.h b/include/tvm/relay/attrs/image.h
new file mode 100644
index 000000000000..527bb647314f
--- /dev/null
+++ b/include/tvm/relay/attrs/image.h
@@ -0,0 +1,41 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/image.h
+ * \brief Auxiliary attributes for image operators.
+ */
+#ifndef TVM_RELAY_ATTRS_IMAGE_H_
+#define TVM_RELAY_ATTRS_IMAGE_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes used in image resize operator */
+struct ResizeAttrs : public tvm::AttrsNode<ResizeAttrs> {
+  Array<IndexExpr> size;
+  std::string layout;
+  std::string method;
+  bool align_corners;
+
+  TVM_DECLARE_ATTRS(ResizeAttrs, "relay.attrs.ResizeAttrs") {
+    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr> >())
+        .describe("Output Size.");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Resize is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(method).set_default("BILINEAR")
+        .describe("Specify the mode to use for scaling."
+                  "NEAREST_NEIGHBOR -  Nearest Neighbor"
+                  "BILINEAR - Bilinear Interpolation");
+    TVM_ATTR_FIELD(align_corners).set_default(false)
+        .describe("Should be true to preserve the values at the corner pixels");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_IMAGE_H_
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 0de0164a562f..45f1d2d41cfc 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -77,6 +77,102 @@ struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
   }
 };
 
+/*! \brief Attributes for max pool operator */
+struct MaxPool2DAttrs : public tvm::AttrsNode<MaxPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  std::string layout;
+  bool ceil_mode;
+
+  TVM_DECLARE_ATTRS(MaxPool2DAttrs, "relay.attrs.MaxPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size)
+      .describe("Size of the pooling windows.");
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "Padding support both symmetric and asymmetric as"
+                "one int : same padding used on all sides"
+                "two int : bottom, right will use same padding as top, left"
+                "four int : padding width in the order of (top, left, bottom, right)");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(ceil_mode).set_default(false)
+      .describe("When true, will use ceil instead of floor to compute the output shape.");
+  }
+};
+
+/*! \brief Attributes for avg pool operator */
+struct AvgPool2DAttrs : public tvm::AttrsNode<AvgPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  std::string layout;
+  bool ceil_mode;
+  bool count_include_pad;
+
+  TVM_DECLARE_ATTRS(AvgPool2DAttrs, "relay.attrs.AvgPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size)
+      .describe("Size of the pooling windows.");
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "Padding support both symmetric and asymmetric as"
+                "one int : same padding used on all sides"
+                "two int : bottom, right will use same padding as top, left"
+                "four int : padding width in the order of (top, left, bottom, right)");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(ceil_mode).set_default(false)
+      .describe("When true, will use ceil instead of floor to compute the output shape.");
+    TVM_ATTR_FIELD(count_include_pad).set_default(false)
+      .describe("When true, will include padding to compute the average");
+  }
+};
+
+/*! \brief Attributes for global pool operator */
+struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
+  std::string layout;
+
+  TVM_DECLARE_ATTRS(GlobalPool2DAttrs, "relay.attrs.GlobalPool2DAttrs") {
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+  }
+};
+
+/*! \brief Attributes for upsampling operator */
+struct UpSamplingAttrs : public tvm::AttrsNode<UpSamplingAttrs> {
+  int scale;
+  std::string layout;
+  std::string method;
+
+  TVM_DECLARE_ATTRS(UpSamplingAttrs, "relay.attrs.UpSamplingAttrs") {
+    TVM_ATTR_FIELD(scale)
+        .describe("Should be true to preserve the values at the corner pixels");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Upsampling is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(method).set_default("NEAREST_NEIGHBOR")
+        .describe("Specify the mode to use for scaling."
+                  "NEAREST_NEIGHBOR -  Nearest Neighbor"
+                  "BILINEAR - Bilinear Interpolation");
+  }
+};
+
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_NN_H_
diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
new file mode 100644
index 000000000000..a2f7360f1f71
--- /dev/null
+++ b/include/tvm/relay/attrs/vision.h
@@ -0,0 +1,17 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/vision.h
+ * \brief Auxiliary attributes for vision operators.
+ */
+#ifndef TVM_RELAY_ATTRS_VISION_H_
+#define TVM_RELAY_ATTRS_VISION_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_VISION_H_
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 318a4d45d66d..dd48d213f700 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -1,4 +1,4 @@
-# pylint: disable=wildcard-import
+# pylint: disable=wildcard-import, redefined-builtin
 """The Relay IR namespace containing the IR definition and compiler."""
 from . import base
 from . import ty
@@ -10,8 +10,10 @@
 # Root operators
 from .op import Op
 from .op.tensor import *
-from . import nn
 from .op.transform import *
+from . import nn
+from . import vision
+from . import image
 
 # Span
 Span = base.Span
diff --git a/python/tvm/relay/image.py b/python/tvm/relay/image.py
new file mode 100644
index 000000000000..43cee89b3483
--- /dev/null
+++ b/python/tvm/relay/image.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Image nets related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.image import *
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 444dc74a31cb..bfd368356d89 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -1,13 +1,14 @@
-#pylint: disable=wildcard-import
+#pylint: disable=wildcard-import, redefined-builtin
 """Relay core operators."""
 # operator defs
 from .op import get, register, Op
 
 # Operators
 from .tensor import *
-from . import nn
 from .transform import *
-
+from . import nn
+from . import image
+from . import vision
 
 # operator registry
 from . import _tensor
diff --git a/python/tvm/relay/op/image/__init__.py b/python/tvm/relay/op/image/__init__.py
new file mode 100644
index 000000000000..9d1415b1dca4
--- /dev/null
+++ b/python/tvm/relay/op/image/__init__.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import
+"""Image network related operators."""
+from __future__ import absolute_import as _abs
+from .image import *
diff --git a/python/tvm/relay/op/image/_make.py b/python/tvm/relay/op/image/_make.py
new file mode 100644
index 000000000000..1198258553fe
--- /dev/null
+++ b/python/tvm/relay/op/image/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.image._make", __name__)
diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py
new file mode 100644
index 000000000000..36c8dd5fa548
--- /dev/null
+++ b/python/tvm/relay/op/image/image.py
@@ -0,0 +1,42 @@
+"""Image operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def resize(data,
+           size,
+           layout="NCHW",
+           method="BILINEAR",
+           align_corners=False):
+    """Image resize operator.
+
+    This operator takes data as input and does 2D scaling to the given scale factor.
+    In the default case, where the data_layout is `NCHW`
+    with data of shape (n, c, h, w)
+    out will have a shape (n, c, size[0], size[1])
+
+    method indicates the algorithm to be used while calculating ghe out value
+    and method can be one of ("BILINEAR", "NEAREST_NEIGHBOR")
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    size: Tuple of Expr
+        The out size to which the image will be resized.
+
+    layout : str, optional
+        Layout of the input.
+
+    method : str, optional
+        Scale method to used [NEAREST_NEIGHBOR, BILINEAR].
+
+    align_corners : int, optional
+        Should be true to preserve the values at the corner pixels
+
+    Returns
+    -------
+    result: relay.Expr
+        The resized result.
+    """
+    return _make.resize(data, size, layout, method, align_corners)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 3b168c6fce21..681afd5075c9 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -106,3 +106,239 @@ def softmax(data, axis):
     """
 
     return _make.softmax(data, axis)
+
+
+def max_pool2d(data,
+               pool_size=(1, 1),
+               strides=(1, 1),
+               padding=(0, 0),
+               layout="NCHW",
+               ceil_mode=False):
+    r"""2D maximum pooling operator.
+
+    This operator takes data as input and does 2D max value calculation
+    with in pool_size sized window by striding defined by stride
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w) and pool_size (kh, kw)
+
+    .. math::
+
+        \mbox{out}(b, c, y, x)  = \max_{m=0, \ldots, kh-1} \max_{n=0, \ldots, kw-1}
+             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
+
+    Padding is applied to data before the computation.
+    ceil_mode is used to take ceil or floor while computing out shape.
+    This operator accepts data layout specification.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    strides : tuple of int, optional
+        The strides of pooling.
+
+    padding : tuple of int, optional
+        The padding for pooling.
+
+    layout : str, optional
+        Layout of the input.
+
+    ceil_mode : bool, optional
+        To enable or disable ceil while pooling.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.max_pool2d(data, pool_size, strides, padding,
+                            layout, ceil_mode)
+
+def avg_pool2d(data,
+               pool_size=(1, 1),
+               strides=(1, 1),
+               padding=(0, 0),
+               layout="NCHW",
+               ceil_mode=False,
+               count_include_pad=False):
+    r"""2D average pooling operator.
+
+    This operator takes data as input and does 2D average value calculation
+    with in pool_size sized window by striding defined by stride
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w), pool_size (kh, kw)
+
+    .. math::
+
+        \mbox{out}(b, c, y, x)  = \frac{1}{kh * kw} \sum_{m=0}^{kh-1} \sum_{n=0}^{kw-1}
+             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
+
+    Padding is applied to data before the computation.
+    ceil_mode is used to take ceil or floor while computing out shape.
+    count_include_pad indicates including or excluding padded input values in computation.
+    This operator accepts data layout specification.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    strides : tuple of int, optional
+        The strides of pooling.
+
+    padding : tuple of int, optional
+        The padding for pooling.
+
+    layout : str, optional
+        Layout of the input.
+
+    ceil_mode : bool, optional
+        To enable or disable ceil while pooling.
+
+    count_include_pad : bool, optional
+        To include padding to compute the average.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.avg_pool2d(data, pool_size, strides, padding,
+                            layout, ceil_mode, count_include_pad)
+
+def global_max_pool2d(data,
+                      layout="NCHW"):
+    r"""2D global maximum pooling operator.
+
+    This operator takes data as input and does 2D max value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w)
+
+    .. math::
+
+        \mbox{out}(b, c, 1, 1)  = \max_{m=0, \ldots, h} \max_{n=0, \ldots, w}
+             \mbox{data}(b, c, m, n)
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.global_max_pool2d(data, layout)
+
+def global_avg_pool2d(data,
+                      layout="NCHW"):
+    r"""2D global average pooling operator.
+
+    This operator takes data as input and does 2D average value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w)
+
+    .. math::
+
+        \mbox{out}(b, c, 1, 1)  = \frac{1}{h * w} \sum_{m=0}^{h-1} \sum_{n=0}^{w-1}
+             \mbox{data}(b, c, m, n)
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.global_avg_pool2d(data, layout)
+
+
+def upsampling(data,
+               scale=1,
+               layout="NCHW",
+               method="NEAREST_NEIGHBOR"):
+    """Upsampling.
+
+    This operator takes data as input and does 2D scaling to the given scale factor.
+    In the default case, where the data_layout is `NCHW`
+    with data of shape (n, c, h, w)
+    out will have a shape (n, c, h*scale, w*scale)
+
+    method indicates the algorithm to be used while calculating ghe out value
+    and method can be one of ("BILINEAR", "NEAREST_NEIGHBOR")
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    scale : relay.Expr
+        The scale factor for upsampling.
+
+    layout : str, optional
+        Layout of the input.
+
+    method : str, optional
+        Scale method to used [NEAREST_NEIGHBOR, BILINEAR].
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.upsampling(data, scale, layout, method)
+
+def batch_flatten(data):
+    """BatchFlatten.
+
+    This operator flattens all the dimensions except for the batch dimension.
+    which results a 2D output.
+
+    For data with shape ``(d1, d2, ..., dk)``
+    batch_flatten(data) returns reshaped output of shape ``(d1, d2*...*dk)``.
+
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    Returns
+    -------
+    result: relay.Expr
+        The Flattened result.
+    """
+    return _make.batch_flatten(data)
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index a576c275b7ed..425a072631a6 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -1,4 +1,5 @@
 """Basic tensor operations."""
+# pylint: disable=redefined-builtin
 from __future__ import absolute_import as _abs
 from . import _make
 from ..expr import Tuple
@@ -59,7 +60,6 @@ def sqrt(data):
     """
     return _make.sqrt(data)
 
-
 def sigmoid(data):
     """Compute elementwise sigmoid of data.
 
@@ -76,6 +76,118 @@ def sigmoid(data):
     return _make.sigmoid(data)
 
 
+def floor(data):
+    """Compute element-wise floor of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.floor(data)
+
+
+def ceil(data):
+    """Compute element-wise ceil of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.ceil(data)
+
+
+def trunc(data):
+    """Compute element-wise trunc of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.trunc(data)
+
+
+def round(data):
+    """Compute element-wise round of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.round(data)
+
+
+def abs(data):
+    """Compute element-wise absolute of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.abs(data)
+
+
+def tanh(data):
+    """Compute element-wise tanh of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.tanh(data)
+
+
+def negative(data):
+    """Compute element-wise negative of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.negative(data)
+
+
 def add(lhs, rhs):
     """Addition with numpy-style broadcasting.
 
@@ -102,8 +214,80 @@ def add(lhs, rhs):
     return _make.add(lhs, rhs)
 
 
+def multiply(lhs, rhs):
+    """Multiplication with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.multiply(lhs, rhs)
+
+
+def divide(lhs, rhs):
+    """Division with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.divide(lhs, rhs)
+
+
+def pow(lhs, rhs):
+    """Power with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.pow(lhs, rhs)
+
+
+def mod(lhs, rhs):
+    """Mod with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.mod(lhs, rhs)
+
+
 def subtract(lhs, rhs):
-    """Elementwise subtraction with broadcasting.
+    """Subtraction with numpy-style broadcasting.
 
     Parameters
     ----------
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
new file mode 100644
index 000000000000..3569093b95e6
--- /dev/null
+++ b/python/tvm/relay/op/vision/__init__.py
@@ -0,0 +1,3 @@
+# pylint: disable=wildcard-import
+"""Vision network related operators."""
+from __future__ import absolute_import as _abs
diff --git a/python/tvm/relay/op/vision/_make.py b/python/tvm/relay/op/vision/_make.py
new file mode 100644
index 000000000000..614d42f47176
--- /dev/null
+++ b/python/tvm/relay/op/vision/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.vision._make", __name__)
diff --git a/python/tvm/relay/vision.py b/python/tvm/relay/vision.py
new file mode 100644
index 000000000000..d2c08bc0cc45
--- /dev/null
+++ b/python/tvm/relay/vision.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Vision network related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.vision import *
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
new file mode 100644
index 000000000000..e6d60f9344a1
--- /dev/null
+++ b/src/relay/op/image/resize.cc
@@ -0,0 +1,87 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file resize.cc
+ * \brief Image operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/image.h>
+#include "../nn/layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(ResizeAttrs);
+
+bool ResizeRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+
+  const ResizeAttrs* param = attrs.as<ResizeAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->layout);
+  CHECK(in_layout.convertible(kNCHW))
+    << "Resize only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+
+  auto oshape = ConvertLayout(data->shape, in_layout, kNCHW);
+  oshape[2] = param->size[0];
+  oshape[3] = param->size[1];
+
+  // assign output type
+  reporter->Assign(types[1],
+                   TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout),
+                                        data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create image operator
+// used by frontend FFI.
+Expr MakeResize(Expr data,
+                Array<IndexExpr> size,
+                std::string layout,
+                std::string method,
+                bool align_corners) {
+  auto attrs = make_node<ResizeAttrs>();
+  attrs->size = std::move(size);
+  attrs->layout = std::move(layout);
+  attrs->method = std::move(method);
+  attrs->align_corners = align_corners;
+  static const Op& op = Op::Get("image.resize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.image._make.resize")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 5>(MakeResize, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("image.resize")
+.describe(R"code(Perform resize to input array with nearest neighbour or bilinear interpolation.
+
+- **data**: data is 4D array of shape
+            (batch_size, channels, in_height, in_width) for NCHW
+            (batch_size, in_height, in_width, channels) for NHWC
+
+- **out**: Output is 4D array of shape
+           for layout NCHW
+           (batch_size, channels, size[0], size[1])
+
+           for layout NHWC
+           (batch_size, size[0], size[1], channels)
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(5)
+.add_type_rel("Resize", ResizeRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index b34d248d1704..1937d610d003 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -6,12 +6,14 @@
 
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/image.h>
+#include <vector>
 #include "../type_relations.h"
+#include "layout.h"
 
 namespace tvm {
 namespace relay {
 
-
 TVM_REGISTER_API("relay.op.nn._make.softmax")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   auto make_func = [](Expr data, int axis) {
@@ -39,5 +41,67 @@ RELAY_REGISTER_OP("nn.softmax")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
+// BatchFlatten
+bool BatchFlattenRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  if (data->shape.size() == 0) return false;
+
+  auto target_dim = make_const(Int(32), 1);
+
+  for (uint32_t i = 1; i < data->shape.size(); ++i) {
+    target_dim = target_dim * data->shape[i];
+  }
+
+  std::vector<IndexExpr> oshape({data->shape[0], target_dim});
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeBatchFlatten(Expr data) {
+  static const Op& op = Op::Get("nn.batch_flatten");
+  return CallNode::make(op, {data}, Attrs(), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.batch_flatten")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 1>(MakeBatchFlatten, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.batch_flatten")
+.describe(R"code(Flattens the input into a 2-D array.
+
+For an input array with shape ``(d1, d2, ..., dk)``, `batch_flatten` operation reshapes
+the input array into an output array of shape ``(d1, d2*...*dk)``.
+
+Example::
+
+    x = [[
+        [1,2,3],
+        [4,5,6],
+        [7,8,9]
+    ],
+    [   [1,2,3],
+        [4,5,6],
+        [7,8,9]
+    ]],
+
+    batch_flatten(x) = [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
+       [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.]]
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("BatchFlatten", BatchFlattenRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
new file mode 100644
index 000000000000..665eaf6de880
--- /dev/null
+++ b/src/relay/op/nn/pooling.cc
@@ -0,0 +1,270 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pooling.cc
+ * \brief Pooling operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <vector>
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs);
+
+template <typename AttrTtype>
+bool Pool2DRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+
+  CHECK(data != nullptr);
+  const auto dshape = data->shape;
+  CHECK_NE(dshape.size(), 0);
+  CHECK_GE(dshape.size(), 2U)
+      << "Pool2D only support input >= 2-D: input must have height and width";
+  const auto param = attrs.as<AttrTtype>();
+  CHECK(param != nullptr);
+
+  Layout layout(param->layout);
+  CHECK(layout.contains('H') && layout.contains('W') &&
+        !layout.contains('h') && !layout.contains('w'))
+    << "Invalid layout " << layout
+    << ". Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.indexof('H');
+  const auto widx = layout.indexof('W');
+
+  IndexExpr pad_h, pad_w;
+  if (param->padding.size() == 1) {
+    pad_h = param->padding[0] * 2;
+    pad_w = param->padding[0] * 2;
+  } else if (param->padding.size() == 2) {
+    // (top, left)
+    pad_h = param->padding[0] * 2;
+    pad_w = param->padding[1] * 2;
+  } else if (param->padding.size() == 4) {
+    // (top, left, bottom, right)
+    pad_h = param->padding[0] + param->padding[2];
+    pad_w = param->padding[1] + param->padding[3];
+  } else {
+    return false;
+  }
+
+  std::vector<IndexExpr> oshape({dshape[0], dshape[1], dshape[2], dshape[3]});
+  if (param->ceil_mode) {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param->pool_size[0] +
+                    param->strides[0] - 1) / param->strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param->pool_size[1] +
+                    param->strides[1] - 1) / param->strides[1]) + 1;
+  } else {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param->pool_size[0]) / param->strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param->pool_size[1]) / param->strides[1]) + 1;
+  }
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+// MaxPool2D
+Expr MakeMaxPool2D(Expr data,
+                   Array<IndexExpr> pool_size,
+                   Array<IndexExpr> strides,
+                   Array<IndexExpr> padding,
+                   std::string layout,
+                   bool ceil_mode) {
+  auto attrs = make_node<MaxPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->layout = std::move(layout);
+  attrs->ceil_mode = ceil_mode;
+  static const Op& op = Op::Get("nn.max_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.max_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 6>(MakeMaxPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.max_pool2d")
+.describe(R"code(Max pooling operation for two dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width)  if `layout` is `NCHW`.
+           out_height and out_width are calculated as::
+
+               out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1
+               out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1
+
+           where padding will be an expanded array based on number of values passed as::
+               one int : all sides same padding used.
+               two int : bottom, right use same as top and left.
+               four int: padding width in the order of (top, left, bottom, right).
+
+           When `ceil_mode` is `True`, ceil will be used instead of floor in this
+           equation.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("MaxPool2D", Pool2DRel<MaxPool2DAttrs>);
+
+
+// AvgPool2D
+Expr MakeAvgPool2D(Expr data,
+                   Array<IndexExpr> pool_size,
+                   Array<IndexExpr> strides,
+                   Array<IndexExpr> padding,
+                   std::string layout,
+                   bool ceil_mode,
+                   bool count_include_pad) {
+  auto attrs = make_node<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->layout = std::move(layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("nn.avg_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.avg_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 7>(MakeAvgPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.avg_pool2d")
+.describe(R"code(
+Average pooling operation for one dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width)  if `layout` is `NCHW`.
+           out_height and out_width are calculated as::
+
+               out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1
+               out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1
+
+           where padding will be an expanded array based on number of values passed as::
+               one int : all sides same padding used.
+               two int : bottom, right use same as top and left.
+               four int: padding width in the order of (top, left, bottom, right).
+
+           When `ceil_mode` is `True`, ceil will be used instead of floor in this
+           equation.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("AvgPool2D", Pool2DRel<AvgPool2DAttrs>);
+
+// Global Pool
+TVM_REGISTER_NODE_TYPE(GlobalPool2DAttrs);
+
+bool GlobalPool2DRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+
+  CHECK(data != nullptr);
+  const auto dshape = data->shape;
+  CHECK_NE(dshape.size(), 0);
+  CHECK_GE(dshape.size(), 2U)
+      << "Pool2D only support input >= 2-D: input must have height and width";
+  const auto param = attrs.as<GlobalPool2DAttrs>();
+  CHECK(param != nullptr);
+
+  Layout layout(param->layout);
+  CHECK(layout.contains('H') && layout.contains('W') &&
+        !layout.contains('h') && !layout.contains('w'))
+    << "Invalid layout " << layout
+    << ". Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.indexof('H');
+  const auto widx = layout.indexof('W');
+  std::vector<IndexExpr> oshape({dshape[0], dshape[1], dshape[2], dshape[3]});
+  oshape[hidx] = oshape[widx] = 1;
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeGlobalAvgPool2D(Expr data,
+                         std::string layout) {
+  auto attrs = make_node<GlobalPool2DAttrs>();
+  attrs->layout = std::move(layout);
+  static const Op& op = Op::Get("nn.global_avg_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.global_avg_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeGlobalAvgPool2D, args, rv);
+  });
+
+// GlobalAvgPool
+RELAY_REGISTER_OP("nn.global_avg_pool2d")
+.describe(R"code(Global average pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("GlobalAvgPool2D", GlobalPool2DRel);
+
+// GlobalMaxPool
+Expr MakeGlobalMaxPool2D(Expr data,
+                         std::string layout) {
+  auto attrs = make_node<GlobalPool2DAttrs>();
+  attrs->layout = std::move(layout);
+  static const Op& op = Op::Get("nn.global_max_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.global_max_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeGlobalMaxPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.global_max_pool2d")
+.describe(R"code(Global max pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("GlobalMaxPool2D", GlobalPool2DRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
new file mode 100644
index 000000000000..a429a7c40e82
--- /dev/null
+++ b/src/relay/op/nn/upsampling.cc
@@ -0,0 +1,87 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file upsampling.cc
+ * \brief upsampling operator
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(UpSamplingAttrs);
+
+bool UpSamplingRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+
+  const UpSamplingAttrs* param = attrs.as<UpSamplingAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->layout);
+  CHECK(in_layout.convertible(kNCHW))
+    << "UpSampling only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+
+  auto oshape = ConvertLayout(data->shape, in_layout, kNCHW);
+
+  oshape[2] = oshape[2] * param->scale;
+  oshape[3] = oshape[3] * param->scale;
+
+  // assign output type
+  reporter->Assign(types[1],
+                   TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout),
+                                        data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create upsampling operator
+// used by frontend FFI.
+Expr MakeUpSampling(Expr data,
+                    int scale,
+                    std::string layout,
+                    std::string method) {
+  auto attrs = make_node<UpSamplingAttrs>();
+  attrs->layout = std::move(layout);
+  attrs->method = std::move(method);
+  attrs->scale = scale;
+  static const Op& op = Op::Get("nn.upsampling");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.upsampling")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 4>(MakeUpSampling, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.upsampling")
+.describe(R"code(Perform upsampling on input array with nearest neighbour or bilinear interpolation.
+
+- **data**: data is 4D array of shape
+            (batch_size, channels, in_height, in_width) for NCHW
+            (batch_size, in_height, in_width, channels) for NHWC
+
+- **out**: Output is 4D array of shape
+           for layout NCHW
+           (batch_size, channels, in_height*scale, in_width*scale)
+
+           for layout NHWC
+           (batch_size, in_height*scale, in_width*scale, channels)
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("UpSampling", UpSamplingRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 11175f21573d..677ae654c9cf 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -22,7 +22,6 @@ namespace relay {
   .add_argument("rhs", "Tensor", "The right hand side tensor.")        \
   .add_type_rel("Broadcast", BroadcastRel)
 
-// Addition
 RELAY_REGISTER_BINARY_OP("add")
 .describe("Elementwise add with with broadcasting")
 .set_support_level(1);
@@ -49,6 +48,22 @@ RELAY_REGISTER_BINARY_OP("minimum")
 .describe("Elementwise minimum of two tensors with broadcasting")
 .set_support_level(4);
 
+RELAY_REGISTER_BINARY_OP("divide")
+.describe("Elementwise divide with broadcasting")
+.set_support_level(1);
+
+RELAY_REGISTER_BINARY_OP("multiply")
+.describe("Elementwise multiply with broadcasting")
+.set_support_level(1);
+
+RELAY_REGISTER_BINARY_OP("pow")
+.describe("Elementwise power with broadcasting")
+.set_support_level(4);
+
+RELAY_REGISTER_BINARY_OP("mod")
+.describe("Elementwise mod with broadcasting")
+.set_support_level(1);
+
 // Comparisons
 #define RELAY_REGISTER_CMP_OP(OpName)                               \
   TVM_REGISTER_API("relay.op._make." OpName)                        \
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 9de4975de790..0ebb5f721d34 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -28,9 +28,8 @@ namespace relay {
   .set_num_inputs(1)                                  \
   .add_argument("data", "Tensor", "The input tensor.")
 
-
 RELAY_REGISTER_UNARY_OP("log")
-.describe(R"code(Returns the log input array, computed element-wise.
+.describe(R"code(Returns the log of input array, computed element-wise.
 
 .. math::
    log(x)
@@ -39,12 +38,8 @@ RELAY_REGISTER_UNARY_OP("log")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-// data : Tensor[shape, dtype]
-// result: Tensor[shape, dtype]
-
-
 RELAY_REGISTER_UNARY_OP("exp")
-.describe(R"code(Returns the exp input array, computed element-wise.
+.describe(R"code(Returns the exp of input array, computed element-wise.
 
 .. math::
    \exp(x)
@@ -56,6 +51,10 @@ RELAY_REGISTER_UNARY_OP("exp")
 
 RELAY_REGISTER_UNARY_OP("sqrt")
 .describe(R"code(Returns the sqrt input array, computed element-wise.
+
+.. math::
+   sqrt(x)
+
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
@@ -88,5 +87,72 @@ RELAY_REGISTER_UNARY_OP("copy")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
+RELAY_REGISTER_UNARY_OP("floor")
+.describe(R"code(Returns the floor of input array, computed element-wise.
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("ceil")
+.describe(R"code(Returns the ceil of input array, computed element-wise.
+
+.. math::
+   ceil(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("trunc")
+.describe(R"code(Returns the trunc of input array, computed element-wise.
+
+.. math::
+   trunc(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("round")
+.describe(R"code(Returns the round of input array, computed element-wise.
+
+.. math::
+   round(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("abs")
+.describe(R"code(Returns the abs of input array, computed element-wise.
+
+.. math::
+   abs(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("tanh")
+.describe(R"code(Returns the tanh of input array, computed element-wise.
+
+.. math::
+   Y = sinh(X) / cosh(X)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+RELAY_REGISTER_UNARY_OP("negative")
+.describe(R"code(Returns the numeric negative of input array, computed element-wise.
+
+.. math::
+   -(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_ir_op.py b/tests/python/relay/test_ir_op.py
index 3b1d914fe02c..f1d835d2b43b 100644
--- a/tests/python/relay/test_ir_op.py
+++ b/tests/python/relay/test_ir_op.py
@@ -14,13 +14,22 @@ def test(x):
 def test_op_level1():
     x = relay.Var("x")
 
-    for op_name in ["log", "exp", "sqrt"]:
+    for op_name in ["log", "exp", "sqrt", "tanh"]:
         y = getattr(relay, op_name)(x)
         assert y.op.name == op_name
         assert y.op.support_level == 1
         assert y.args[0] == x
 
+def test_op_level3():
+    x = relay.Var("x")
+
+    for op_name in ["ceil", "floor", "trunc", "round", "abs", "negative"]:
+        y = getattr(relay, op_name)(x)
+        assert y.op.name == op_name
+        assert y.op.support_level == 3
+        assert y.args[0] == x
 
 if __name__ == "__main__":
     test_op_attr()
     test_op_level1()
+    test_op_level3()
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index c7f8aa5ef63c..78cdc048d438 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -1,6 +1,31 @@
 import tvm
+import numpy as np
 from tvm import relay
+from tvm.relay.ir_pass import infer_type
+from tvm.relay.ir_builder import IRBuilder, func_type
+from tvm.relay.ir_builder import scalar_type, convert, tensor_type
+from tvm.relay.env import Environment
 
+def assert_has_type(expr, typ, env=Environment({})):
+    checked_expr = infer_type(env, expr)
+    checked_type = checked_expr.checked_type
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
+
+def test_single_op():
+    def check_single_op(opfunc):
+        "Program: fn (x : float32) { let t1 = f(x); t1 }"
+        b = IRBuilder()
+        with b.function(('x', 'float32')) as func:
+            x, = func.param_ids()
+            t1 = b.let('t1', opfunc(x))
+            b.ret(t1)
+        assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
+
+    for opfunc in [tvm.relay.log, tvm.relay.exp, tvm.relay.sqrt,
+                   tvm.relay.sigmoid, tvm.relay.tanh]:
+        check_single_op(opfunc)
 
 def test_expand_dims_infer_type():
     ib = relay.ir_builder.IRBuilder()
@@ -43,6 +68,55 @@ def test_unary_op():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((10, 4), "int32")
 
+def test_binary_op():
+    def check_binary_op(opfunc):
+        """
+        Program:
+            fn (x, y) {
+                return x <op> y;
+            }
+        """
+        b = IRBuilder()
+
+        x = b.param('x', tensor_type(5, 5, 5))
+        y = b.param('y', tensor_type(5, 5, 5))
+        with b.function(x, y) as func:
+            b.ret(opfunc(x.var, y.var))
+        b.ret(func)
+        prog, env = b.get()
+        ttype = tensor_type(5, 5, 5)
+        expected_ty = func_type([ttype, ttype], ttype)
+        assert_has_type(func.to_func(), expected_ty)
+
+    for opfunc in [relay.add, relay.subtract, relay.mod,
+                   relay.multiply, relay.divide]:
+        check_binary_op(opfunc)
+
+
+def test_binary_broadcast_op():
+    def check_binary_broadcast_op(opfunc):
+        """
+        Program:
+            fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
+                return x <op> y;
+            }
+        """
+        b = IRBuilder()
+        x = b.param('x', tensor_type(10, 4))
+        y = b.param('y', tensor_type(5, 10, 1))
+        with b.function(x, y) as func:
+            b.ret(opfunc(x.var, y.var))
+        b.ret(func)
+        prog, env = b.get()
+
+        expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
+                                tensor_type(5, 10, 4))
+        assert_has_type(func.to_func(), expected_ty)
+
+    for opfunc in [relay.add, relay.subtract, relay.mod,
+                   relay.multiply, relay.divide]:
+        check_binary_broadcast_op(opfunc)
+
 
 def test_concatenate_infer_type():
     ib = relay.ir_builder.IRBuilder()
@@ -83,7 +157,10 @@ def test_concatenate_infer_type():
 
 
 if __name__ == "__main__":
-    test_expand_dims_infer_type()
     test_unary_op()
+    test_single_op()
+    test_expand_dims_infer_type()
     test_concatenate_infer_type()
     test_softmax()
+    test_binary_op()
+    test_binary_broadcast_op()
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 7182c641248e..b9599982aa93 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -1,7 +1,8 @@
+""" Support level2 operator test cases.
+"""
 import tvm
 from tvm import relay
 
-
 def test_conv2d_infer_type():
     # symbolic in batch dimension
     ib = relay.ir_builder.IRBuilder()
@@ -56,7 +57,112 @@ def test_conv2d_infer_type():
     assert ftype.arg_types[1] == relay.ty.TensorType(
         (4, 8, 3, 3, 4, 4), "int8")
 
+def test_upsampling_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.upsampling(x.var, scale=2, layout="NCHW", method="BILINEAR"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, h*2, w*2), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c = tvm.var("n"), tvm.var("c")
+    x = ib.param("x", relay.ty.TensorType((n, c, 100, 200), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.upsampling(x.var, scale=2, layout="NCHW", method="BILINEAR"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, 200, 400), "float32")
+
+def _test_pool2d_infer_type(opfunc):
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(opfunc(x.var, pool_size=(1, 1)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, 10, 224, 224), "float32")
+
+    ph, pw = tvm.var("ph"), tvm.var("pw")
+    sh, sw = tvm.var("sh"), tvm.var("sw")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(opfunc(x.var, pool_size=(ph, pw), strides=(sh, sw)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, 10, (((224 - ph)/sh) + 1), (((224 - pw)/sw) + 1)), "float32")
+
+def _test_global_pool2d_infer_type(opfunc):
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), tvm.var("c"), 224, 224
+    x = ib.param("x", relay.ty.TensorType((n, h, w, c), "float32"))
+    with ib.function(x) as func:
+        ib.ret(opfunc(x.var, layout="NHWC"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, 1, 1, c), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(opfunc(x.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, 1, 1), "float32")
+
+def test_pool2d_infer_type():
+    _test_pool2d_infer_type(relay.nn.max_pool2d)
+    _test_pool2d_infer_type(relay.nn.avg_pool2d)
+    _test_global_pool2d_infer_type(relay.nn.global_avg_pool2d)
+    _test_global_pool2d_infer_type(relay.nn.global_avg_pool2d)
+
+def test_flatten_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    x = ib.param("x", relay.ty.TensorType((d1, d2, d3, d4), "float32"))
+
+    with ib.function(x) as func:
+        ib.ret(relay.nn.batch_flatten(x.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((d1, ((d2*d3)*d4)), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    x = ib.param("x", relay.ty.TensorType((3, 2, 4, 3), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.batch_flatten(x.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((3, 24), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    x = ib.param("x", relay.ty.TensorType((d1, 2, d3, 3), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.batch_flatten(x.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((d1, ((2*d3)*3)), "float32")
 
 
 if __name__ == "__main__":
     test_conv2d_infer_type()
+    test_pool2d_infer_type()
+    test_upsampling_infer_type()
+    test_flatten_infer_type()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index ecd9d071e671..4dfa7b563b82 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1,5 +1,11 @@
+""" Support level3 operator test cases.
+"""
 import tvm
+import numpy as np
 from tvm import relay
+from tvm.relay.ir_pass import infer_type
+from tvm.relay.ir_builder import IRBuilder, func_type
+from tvm.relay.env import Environment
 
 
 def test_unary_identity():
@@ -13,7 +19,6 @@ def test_unary_identity():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")
 
-
 def test_copy_infer_type():
     ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
@@ -52,8 +57,29 @@ def test_reshape_infer_type():
     assert ftype.ret_type == relay.ty.TensorType(
         (n, t, 2000), "float32")
 
+def assert_has_type(expr, typ, env=Environment({})):
+    checked_expr = infer_type(env, expr)
+    checked_type = checked_expr.checked_type
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
+
+def test_single_op():
+    def check_single_op(opfunc):
+        "Program: fn (x : float32) { let t1 = f(x); t1 }"
+        b = IRBuilder()
+        with b.function(('x', 'float32')) as func:
+            x, = func.param_ids()
+            t1 = b.let('t1', opfunc(x))
+            b.ret(t1)
+        assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
+
+    for opfunc in [tvm.relay.ceil, tvm.relay.floor, tvm.relay.trunc,
+                   tvm.relay.round, tvm.relay.abs, tvm.relay.negative]:
+        check_single_op(opfunc)
 
 if __name__ == "__main__":
+    test_single_op()
     test_unary_identity()
     test_copy_infer_type()
     test_transpose_infer_type()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index dddbf40bd878..a855b0f2caaa 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -1,6 +1,17 @@
 import tvm
+import numpy as np
 from tvm import relay
+from tvm.relay.ir_pass import infer_type
+from tvm.relay.ir_builder import IRBuilder, func_type
+from tvm.relay.ir_builder import scalar_type, convert, tensor_type
+from tvm.relay.env import Environment
 
+def assert_has_type(expr, typ, env=Environment({})):
+    checked_expr = infer_type(env, expr)
+    checked_type = checked_expr.checked_type
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
 
 def test_cmp_type():
     for op in (relay.greater,
@@ -20,6 +31,84 @@ def test_cmp_type():
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
 
 
+def test_binary_broadcast():
+    for op in [relay.right_shift,
+               relay.left_shift,
+               relay.maximum]:
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((10, 4), "int32"))
+        y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
+        with ib.function(x, y) as func:
+            ib.ret(op(x.var, y.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type
+        assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
+
+def test_binary_op():
+    def check_binary_op(opfunc):
+        """
+        Program:
+            fn (x, y) {
+                return x <op> y;
+            }
+        """
+        b = IRBuilder()
+
+        x = b.param('x', tensor_type(5, 5, 5))
+        y = b.param('y', tensor_type(5, 5, 5))
+        with b.function(x, y) as func:
+            b.ret(opfunc(x.var, y.var))
+        b.ret(func)
+        prog, env = b.get()
+        ttype = tensor_type(5, 5, 5)
+        expected_ty = func_type([ttype, ttype], ttype)
+        assert_has_type(func.to_func(), expected_ty)
+
+    for opfunc in [relay.pow]:
+        check_binary_op(opfunc)
+
+
+def test_binary_broadcast_op():
+    def check_binary_broadcast_op(opfunc):
+        """
+        Program:
+            fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
+                return x <op> y;
+            }
+        """
+        b = IRBuilder()
+        x = b.param('x', tensor_type(10, 4))
+        y = b.param('y', tensor_type(5, 10, 1))
+        with b.function(x, y) as func:
+            b.ret(opfunc(x.var, y.var))
+        b.ret(func)
+        prog, env = b.get()
+
+        expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
+                                tensor_type(5, 10, 4))
+        assert_has_type(func.to_func(), expected_ty)
+
+    for opfunc in [relay.pow]:
+        check_binary_broadcast_op(opfunc)
+
+def test_cmp_type():
+    for op in (relay.greater,
+               relay.greater_equal,
+               relay.less,
+               relay.less_equal,
+               relay.equal,
+               relay.not_equal):
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.TensorType((10, 4), "float32"))
+        y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
+        with ib.function(x, y) as func:
+            ib.ret(op(x.var, y.var))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type
+        assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
+
 def test_binary_broadcast():
     for op in [relay.right_shift,
                relay.left_shift,
@@ -39,3 +128,5 @@ def test_binary_broadcast():
 if __name__ == "__main__":
     test_cmp_type()
     test_binary_broadcast()
+    test_binary_op()
+    test_binary_broadcast_op()
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
new file mode 100644
index 000000000000..62da592e8249
--- /dev/null
+++ b/tests/python/relay/test_op_level5.py
@@ -0,0 +1,29 @@
+""" Support level5 operator test cases.
+"""
+import tvm
+from tvm import relay
+
+def test_resize_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
+    th, tw = tvm.var("th"), tvm.var("tw")
+
+    with ib.function(x) as func:
+        ib.ret(relay.image.resize(x.var, (th, tw)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, th, tw), "int8")
+
+    ib = relay.ir_builder.IRBuilder()
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
+    with ib.function(x) as func:
+        ib.ret(relay.image.resize(x.var, (100, 200), "NCHW", "BILINEAR", False))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, 100, 200), "int8")
+
+if __name__ == "__main__":
+    test_resize_infer_type()
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index dfed126e6ab1..6629932921f8 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -32,54 +32,6 @@ def test_monomorphic_let():
     prog, env = b.get()
     assert_has_type(prog, scalar_type('float64'))
 
-
-def test_single_op():
-    "Program: fn (x : float32) { let t1 = f(x); t1 }"
-    b = IRBuilder()
-    with b.function(('x', 'float32')) as func:
-        x, = func.param_ids()
-        t1 = b.let('t1', log(x))
-        b.ret(t1)
-    assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
-
-def test_add_op():
-    """
-    Program:
-        fn (x, y) {
-            return x + y;
-        }
-    """
-    b = IRBuilder()
-
-    x = b.param('x', tensor_type(5, 5, 5))
-    y = b.param('y', tensor_type(5, 5, 5))
-    with b.function(x, y) as func:
-        b.ret(add(x.var, y.var))
-    b.ret(func)
-    prog, env = b.get()
-    ttype = tensor_type(5, 5, 5)
-    expected_ty = func_type([ttype, ttype], ttype)
-    assert_has_type(func.to_func(), expected_ty)
-
-def test_add_broadcast_op():
-    """
-    Program:
-        fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
-            return x + y;
-        }
-    """
-    b = IRBuilder()
-    x = b.param('x', tensor_type(10, 4))
-    y = b.param('y', tensor_type(5, 10, 1))
-    with b.function(x, y) as func:
-        b.ret(add(x.var, y.var))
-    b.ret(func)
-    prog, env = b.get()
-
-    expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
-                            tensor_type(5, 10, 4))
-    assert_has_type(func.to_func(), expected_ty)
-
 def test_dual_op():
     """Program:
        fn (x : Tensor[f32, (10, 10)]) {
@@ -162,9 +114,6 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     test_dual_op()
     test_recursion()
     test_monomorphic_let()
-    test_single_op()
-    test_add_op()
-    test_add_broadcast_op()
     test_decl()
     test_recursion()
     test_concat()

From fe035db464e523ffe0d3027b15b0ac69fbb6fd83 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Mon, 8 Oct 2018 09:40:36 +0530
Subject: [PATCH 197/529] [RELAY][OP]log_softmax op (#1857)

---
 docs/langref/relay_op.rst            |  2 ++
 python/tvm/relay/op/nn/nn.py         | 22 +++++++++++++++++++++
 src/relay/op/nn/nn.cc                | 29 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level1.py | 13 +++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index deafaa99d645..f4c39261ba1f 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -30,6 +30,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.expand_dims
    tvm.relay.concatenate
    tvm.relay.nn.softmax
+   tvm.relay.nn.log_softmax
    tvm.relay.subtract
    tvm.relay.multiply
    tvm.relay.divide
@@ -114,6 +115,7 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.sigmoid
 .. autofunction:: tvm.relay.concatenate
 .. autofunction:: tvm.relay.nn.softmax
+.. autofunction:: tvm.relay.nn.log_softmax
 
 
 Level 2 Definitions
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 681afd5075c9..5a1bc1068f4b 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -108,6 +108,28 @@ def softmax(data, axis):
     return _make.softmax(data, axis)
 
 
+def log_softmax(data, axis):
+    r"""Computes log softmax.
+
+    .. math::
+
+        \text{log_softmax}(x)_i = \log \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+    .. note::
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data: relay.Expr
+        The input data to the operator.
+
+    axis: int
+        The axis to sum over when computing softmax
+    """
+
+    return _make.log_softmax(data, axis)
+
+
 def max_pool2d(data,
                pool_size=(1, 1),
                strides=(1, 1),
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 1937d610d003..dfbeceb45cc0 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -41,6 +41,35 @@ RELAY_REGISTER_OP("nn.softmax")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
+
+TVM_REGISTER_API("relay.op.nn._make.log_softmax")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  auto make_func = [](Expr data, int axis) {
+    auto attrs = make_node<SoftmaxAttrs>();
+    attrs->axis = axis;
+    static const Op& op = Op::Get("nn.log_softmax");
+    return CallNode::make(op, {data}, Attrs(attrs), {});
+  };
+
+  runtime::detail::unpack_call<Expr, 2>(make_func, args, rv);
+});
+
+RELAY_REGISTER_OP("nn.log_softmax")
+    .describe(R"code(Computes log softmax.
+
+.. math:: \text{log_softmax}(x)_i = \log \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+.. note::
+    This operator can be optimized away for inference.
+
+- **data**: The input data
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
+
 // BatchFlatten
 bool BatchFlattenRel(const Array<Type>& types,
                int num_inputs,
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 78cdc048d438..e8c5b5fc87f2 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -54,6 +54,18 @@ def test_softmax():
     assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
 
 
+def test_log_softmax():
+    ib = relay.ir_builder.IRBuilder()
+    n, d = tvm.var("n"), tvm.var("d")
+    x = ib.param("x", relay.ty.TensorType((n, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.log_softmax(x, axis=1))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
+
 def test_unary_op():
     for op in [relay.exp,
                relay.log,
@@ -162,5 +174,6 @@ def test_concatenate_infer_type():
     test_expand_dims_infer_type()
     test_concatenate_infer_type()
     test_softmax()
+    test_log_softmax()
     test_binary_op()
     test_binary_broadcast_op()

From b3c03ff84e0ce01e22655debf1f7e654e5ccd894 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Mon, 8 Oct 2018 08:44:54 -0700
Subject: [PATCH 198/529] [Relay][Op] Clip (#1844)

---
 python/tvm/relay/op/tensor.py        | 29 +++++++++++++++++++++++++
 src/relay/op/tensor/unary.cc         | 32 +++++++++++++++++++++++++++-
 tests/python/relay/test_op_level3.py | 15 +++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 425a072631a6..316514801fd6 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -515,6 +515,35 @@ def ones_like(data):
     """
     return _make.ones_like(data)
 
+
+def clip(a, a_min, a_max):
+    """Clip the elements in `a` between `a_min` and `a_max`.
+    `a_min` and `a_max` are cast to `a`'s dtype.
+
+    Parameters
+    ----------
+    a : relay.Expr
+        The input tensor.
+    a_min : float
+        The clip minimum.
+    a_max : float
+        The clip maximum.
+
+    Returns
+    -------
+    result : relay.Expr
+        `a` with elements clipped between `a_min` and `a_max`.
+
+    Examples
+    --------
+    .. code:: python
+      x = relay.Constant(tvm.nd.array([0, 1, 5, 3, 4, 2]))
+      relay.clip(x, 1., 4.)
+      # [1, 1, 4, 3, 4, 2]
+    """
+    return _make.clip(a, a_min, a_max)
+
+
 def concatenate(data, axis):
     """Concatenate the input tensors along the given axis.
 
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 0ebb5f721d34..ef051e964538 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -87,6 +87,37 @@ RELAY_REGISTER_UNARY_OP("copy")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
+// Clip
+struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
+  double a_min;
+  double a_max;
+
+  TVM_DECLARE_ATTRS(ClipAttrs, "relay.attrs.ClipAttrs") {
+  TVM_ATTR_FIELD(a_min)
+    .describe("The minimum clip value.");
+  TVM_ATTR_FIELD(a_max)
+    .describe("The maximum clip value.");
+  }
+};
+
+TVM_REGISTER_API("relay.op._make.clip")
+  .set_body_typed<Expr(Expr, double, double)>([](Expr a, double a_min, double a_max) {
+      auto attrs = make_node<ClipAttrs>();
+      attrs->a_min = a_min;
+      attrs->a_max = a_max;
+      static const Op& op = Op::Get("clip");
+    return CallNode::make(op, {a}, Attrs(attrs), {});
+  });
+
+RELAY_REGISTER_OP("clip")
+  .describe(R"code(Clip tensor values.
+  This function takes a tensor, a minimum value `a_min`, and a maximum value `a_max`, and returns a clipped tensor where all values below `a_min` are set to `a_min` and all values above `a_max` are set to `a_max`. `a_min` and `a_max` are cast to the tensor's dtype.
+  )code" TVM_ADD_FILELINE)
+  .set_num_inputs(1)
+  .add_argument("tensor", "Tensor", "The input tensor.")
+  .set_support_level(3)
+  .add_type_rel("Clip", IdentityRel);
+
 RELAY_REGISTER_UNARY_OP("floor")
 .describe(R"code(Returns the floor of input array, computed element-wise.
 )code" TVM_ADD_FILELINE)
@@ -153,6 +184,5 @@ RELAY_REGISTER_UNARY_OP("negative")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 4dfa7b563b82..c6b83b39c276 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -19,6 +19,18 @@ def test_unary_identity():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")
 
+
+def test_clip_type():
+    ib = relay.ir_builder.IRBuilder()
+    a = ib.param("a", relay.TensorType((10, 4), "float32"))
+    with ib.function(a) as func:
+        ib.ret(relay.clip(a.var, 1., 4.))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((10, 4), "float32")
+
+
 def test_copy_infer_type():
     ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
@@ -57,6 +69,7 @@ def test_reshape_infer_type():
     assert ftype.ret_type == relay.ty.TensorType(
         (n, t, 2000), "float32")
 
+
 def assert_has_type(expr, typ, env=Environment({})):
     checked_expr = infer_type(env, expr)
     checked_type = checked_expr.checked_type
@@ -78,9 +91,11 @@ def check_single_op(opfunc):
                    tvm.relay.round, tvm.relay.abs, tvm.relay.negative]:
         check_single_op(opfunc)
 
+
 if __name__ == "__main__":
     test_single_op()
     test_unary_identity()
+    test_clip_type()
     test_copy_infer_type()
     test_transpose_infer_type()
     test_reshape_infer_type()

From 6ffdd28e4ac01cbbd5d4e9a2fec3e7b41cd9bbdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Mon, 8 Oct 2018 22:39:03 -0700
Subject: [PATCH 199/529] add relu (#1849)

---
 docs/langref/relay_op.rst            | 11 +++++-
 python/tvm/relay/op/nn/nn.py         | 19 ++++++++++
 src/relay/op/nn/nn.cc                | 11 ++++++
 src/relay/op/op_common.h             | 46 ++++++++++++++++++++++++
 src/relay/op/tensor/binary.cc        | 36 +++++++------------
 src/relay/op/tensor/unary.cc         | 53 ++++++++++------------------
 tests/python/relay/test_op_level1.py |  3 +-
 7 files changed, 120 insertions(+), 59 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index f4c39261ba1f..0ac6851ba9de 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -37,7 +37,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.mod
    tvm.relay.tanh
    tvm.relay.sigmoid
-
+   tvm.relay.nn.relu
 
 **Level 2: Convolutions**
 
@@ -75,6 +75,7 @@ This level enables additional math and transform operators.
    tvm.relay.negative
 
 
+
 **Level 4: Broadcast and Reductions**
 
 .. autosummary::
@@ -92,6 +93,7 @@ This level enables additional math and transform operators.
    tvm.relay.minimum
    tvm.relay.pow
 
+
 **Level 5: Vision/Image Operators**
 
 .. autosummary::
@@ -116,6 +118,7 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.concatenate
 .. autofunction:: tvm.relay.nn.softmax
 .. autofunction:: tvm.relay.nn.log_softmax
+.. autofunction:: tvm.relay.nn.relu
 
 
 Level 2 Definitions
@@ -141,6 +144,12 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.copy
 .. autofunction:: tvm.relay.transpose
 
+Level 3 Definitions
+-------------------
+.. autofunction:: tvm.relay.zeros_like
+.. autofunction:: tvm.relay.ones_like
+
+
 Level 4 Definitions
 -------------------
 .. autofunction:: tvm.relay.right_shift
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 5a1bc1068f4b..61ed6f64a91b 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -364,3 +364,22 @@ def batch_flatten(data):
         The Flattened result.
     """
     return _make.batch_flatten(data)
+
+
+def relu(data):
+    """Rectified linear unit.
+
+    .. math::
+       out = max(x, 0)
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.relu(data)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index dfbeceb45cc0..7f6b0ee79e3f 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -9,6 +9,7 @@
 #include <tvm/relay/attrs/image.h>
 #include <vector>
 #include "../type_relations.h"
+#include "../op_common.h"
 #include "layout.h"
 
 namespace tvm {
@@ -132,5 +133,15 @@ Example::
 .set_support_level(2)
 .add_type_rel("BatchFlatten", BatchFlattenRel);
 
+RELAY_REGISTER_UNARY_OP("relay.op.nn._make.", "relu")
+.describe(R"code(Returns the relu input array, computed element-wise.
+
+.. math::
+   max(x, 0)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
index 5bdc91bfd6de..d07b7f02cd67 100644
--- a/src/relay/op/op_common.h
+++ b/src/relay/op/op_common.h
@@ -24,6 +24,52 @@ std::vector<T> AsVector(const Array<T> &array) {
     return result;
 }
 
+/*! Quick helper macro
+ * - Expose a positional make function to construct the node.
+ * - Register op to the registry.
+ *
+ * We make the decision to always only expose positional argument.
+ * We will do rewrapping in the frontend to support language
+ * sugars such as keyword arguments and default value.
+ *
+ * \param Prefix the prefix of the registry, for example, "relay.op._make.".
+ *
+ * \param OpName the name of registry.
+ */
+#define RELAY_REGISTER_UNARY_OP(Prefix, OpName)           \
+  TVM_REGISTER_API(Prefix OpName)                         \
+    .set_body_typed<Expr(Expr)>([](Expr data) {           \
+        static const Op& op = Op::Get(OpName);            \
+        return CallNode::make(op, {data}, Attrs(), {});   \
+      });                                                 \
+  RELAY_REGISTER_OP(OpName)                               \
+    .set_num_inputs(1)                                    \
+    .add_argument("data", "Tensor", "The input tensor.")
+
+/*! Quick helper macro
+ * - Expose a positional make function to construct the node.
+ * - Register op to the registry.
+ *
+ * We make the decision to always only expose positional argument.
+ * We will do rewrapping in the frontend to support language
+ * sugars such as keyword arguments and default value.
+ *
+ * \param Prefix the prefix of the registry, for example, "relay.op._make.".
+ *
+ * \param OpName the name of registry.
+ */
+#define RELAY_REGISTER_BINARY_OP(Prefix, OpName)                  \
+  TVM_REGISTER_API(Prefix OpName)                                 \
+    .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {    \
+        static const Op& op = Op::Get(OpName);                    \
+        return CallNode::make(op, {lhs, rhs}, Attrs(), {});       \
+      });                                                         \
+  RELAY_REGISTER_OP(OpName)                                       \
+    .set_num_inputs(2)                                            \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.") \
+    .add_type_rel("Broadcast", BroadcastRel)
+
 }  // namespace relay
 }  // namespace tvm
 
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 677ae654c9cf..fe614aa4ea1c 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -6,61 +6,51 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
 #include "../type_relations.h"
+#include "../op_common.h"
 
 namespace tvm {
 namespace relay {
 
-#define RELAY_REGISTER_BINARY_OP(OpName)                               \
-  TVM_REGISTER_API("relay.op._make." OpName)                           \
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {           \
-      static const Op& op = Op::Get(OpName);                           \
-      return CallNode::make(op, {lhs, rhs}, Attrs(), {});              \
-    });                                                                \
-  RELAY_REGISTER_OP(OpName)                                            \
-  .set_num_inputs(2)                                                   \
-  .add_argument("lhs", "Tensor", "The left hand side tensor.")         \
-  .add_argument("rhs", "Tensor", "The right hand side tensor.")        \
-  .add_type_rel("Broadcast", BroadcastRel)
-
-RELAY_REGISTER_BINARY_OP("add")
+// Addition
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "add")
 .describe("Elementwise add with with broadcasting")
 .set_support_level(1);
 
 // Subtraction
-RELAY_REGISTER_BINARY_OP("subtract")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "subtract")
 .describe("Elementwise substract with broadcasting")
 .set_support_level(1);
 
 // Right shift
-RELAY_REGISTER_BINARY_OP("right_shift")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "right_shift")
 .describe("Elementwise right shift with broadcasting")
 .set_support_level(4);
 
-RELAY_REGISTER_BINARY_OP("left_shift")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "left_shift")
 .describe("Elementwise left shift with broadcasting")
 .set_support_level(4);
 
-RELAY_REGISTER_BINARY_OP("maximum")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "maximum")
 .describe("Elementwise maximum of two tensors with broadcasting")
 .set_support_level(4);
 
-RELAY_REGISTER_BINARY_OP("minimum")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "minimum")
 .describe("Elementwise minimum of two tensors with broadcasting")
 .set_support_level(4);
 
-RELAY_REGISTER_BINARY_OP("divide")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "divide")
 .describe("Elementwise divide with broadcasting")
 .set_support_level(1);
 
-RELAY_REGISTER_BINARY_OP("multiply")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "multiply")
 .describe("Elementwise multiply with broadcasting")
 .set_support_level(1);
 
-RELAY_REGISTER_BINARY_OP("pow")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "pow")
 .describe("Elementwise power with broadcasting")
 .set_support_level(4);
 
-RELAY_REGISTER_BINARY_OP("mod")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "mod")
 .describe("Elementwise mod with broadcasting")
 .set_support_level(1);
 
@@ -68,7 +58,7 @@ RELAY_REGISTER_BINARY_OP("mod")
 #define RELAY_REGISTER_CMP_OP(OpName)                               \
   TVM_REGISTER_API("relay.op._make." OpName)                        \
   .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
-      static const Op& op = Op::Get(OpName);                        \
+    static const Op& op = Op::Get(OpName);                          \
     return CallNode::make(op, {lhs, rhs}, Attrs(), {});             \
   });                                                               \
   RELAY_REGISTER_OP(OpName)                                         \
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index ef051e964538..22f97e8f0d54 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -6,30 +6,14 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
 #include "../type_relations.h"
+#include "../op_common.h"
 
 namespace tvm {
 namespace relay {
 
-// Quick helper macro
-// - Expose a positional make function to construct the node.
-// - Register op to the registry.
-//
-// We make the decision to always only expose positional argument.
-// We will do rewrapping in the frontend to support language
-// sugars such as keyword arguments and default value.
-//
-#define RELAY_REGISTER_UNARY_OP(OpName)               \
-  TVM_REGISTER_API("relay.op._make." OpName)          \
-  .set_body_typed<Expr(Expr)>([](Expr data) {         \
-      static const Op& op = Op::Get(OpName);          \
-    return CallNode::make(op, {data}, Attrs(), {});   \
-    });                                               \
-  RELAY_REGISTER_OP(OpName)                           \
-  .set_num_inputs(1)                                  \
-  .add_argument("data", "Tensor", "The input tensor.")
-
-RELAY_REGISTER_UNARY_OP("log")
-.describe(R"code(Returns the log of input array, computed element-wise.
+
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "log")
+.describe(R"code(Returns the log input array, computed element-wise.
 
 .. math::
    log(x)
@@ -38,8 +22,8 @@ RELAY_REGISTER_UNARY_OP("log")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("exp")
-.describe(R"code(Returns the exp of input array, computed element-wise.
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "exp")
+.describe(R"code(Returns the exp input array, computed element-wise.
 
 .. math::
    \exp(x)
@@ -49,7 +33,7 @@ RELAY_REGISTER_UNARY_OP("exp")
 .add_type_rel("Identity", IdentityRel);
 
 
-RELAY_REGISTER_UNARY_OP("sqrt")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "sqrt")
 .describe(R"code(Returns the sqrt input array, computed element-wise.
 
 .. math::
@@ -59,19 +43,19 @@ RELAY_REGISTER_UNARY_OP("sqrt")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("zeros_like")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "zeros_like")
 .describe(R"code(Returns an array of zeros, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("ones_like")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "ones_like")
 .describe(R"code(Returns an array of ones, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("sigmoid")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "sigmoid")
 .describe(R"code(Returns the sigmoid input array, computed element-wise.
 
 .. math::
@@ -81,7 +65,7 @@ RELAY_REGISTER_UNARY_OP("sigmoid")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("copy")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "copy")
 .describe(R"code(Copy a tensor.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
@@ -118,13 +102,14 @@ RELAY_REGISTER_OP("clip")
   .set_support_level(3)
   .add_type_rel("Clip", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("floor")
+
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "floor")
 .describe(R"code(Returns the floor of input array, computed element-wise.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("ceil")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "ceil")
 .describe(R"code(Returns the ceil of input array, computed element-wise.
 
 .. math::
@@ -134,7 +119,7 @@ RELAY_REGISTER_UNARY_OP("ceil")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("trunc")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "trunc")
 .describe(R"code(Returns the trunc of input array, computed element-wise.
 
 .. math::
@@ -144,7 +129,7 @@ RELAY_REGISTER_UNARY_OP("trunc")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("round")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "round")
 .describe(R"code(Returns the round of input array, computed element-wise.
 
 .. math::
@@ -154,7 +139,7 @@ RELAY_REGISTER_UNARY_OP("round")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("abs")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "abs")
 .describe(R"code(Returns the abs of input array, computed element-wise.
 
 .. math::
@@ -164,7 +149,7 @@ RELAY_REGISTER_UNARY_OP("abs")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("tanh")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "tanh")
 .describe(R"code(Returns the tanh of input array, computed element-wise.
 
 .. math::
@@ -174,7 +159,7 @@ RELAY_REGISTER_UNARY_OP("tanh")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
-RELAY_REGISTER_UNARY_OP("negative")
+RELAY_REGISTER_UNARY_OP("relay.op._make.", "negative")
 .describe(R"code(Returns the numeric negative of input array, computed element-wise.
 
 .. math::
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index e8c5b5fc87f2..cc4f1662e915 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -70,7 +70,8 @@ def test_unary_op():
     for op in [relay.exp,
                relay.log,
                relay.sqrt,
-               relay.sigmoid]:
+               relay.sigmoid,
+               relay.nn.relu]:
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         with ib.function(x) as func:

From 614793b08efc10bebf05cf189c6f0a030e9a5c6c Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 9 Oct 2018 11:14:52 +0530
Subject: [PATCH 200/529] [RELAY][OP] take (#1863)

---
 docs/langref/relay_op.rst            |  2 +
 include/tvm/relay/attrs/transform.h  |  9 +++
 nnvm/src/top/tensor/transform.cc     |  2 +-
 python/tvm/relay/op/transform.py     | 23 +++++++
 src/relay/op/tensor/transform.cc     | 89 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 22 +++++++
 6 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 0ac6851ba9de..d5f92f567b17 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -73,6 +73,7 @@ This level enables additional math and transform operators.
    tvm.relay.round
    tvm.relay.abs
    tvm.relay.negative
+   tvm.relay.take
 
 
 
@@ -143,6 +144,7 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.reshape
 .. autofunction:: tvm.relay.copy
 .. autofunction:: tvm.relay.transpose
+.. autofunction:: tvm.relay.take
 
 Level 3 Definitions
 -------------------
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index d501e6cb7255..5c4cbca4a4a8 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -59,6 +59,15 @@ struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
   }
 };  // struct ReshapeAttrs
 
+struct TakeAttrs : public tvm::AttrsNode<TakeAttrs> {
+  IndexExpr axis;
+
+  TVM_DECLARE_ATTRS(TakeAttrs, "relay.attrs.TakeAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(NullValue<IndexExpr>())
+        .describe("The axis over which to select values.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 40c8c930a029..270172856a75 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -1135,7 +1135,7 @@ Examples::
 .set_attr<FCorrectLayout>("FCorrectLayout", TakeCorrectLayout)
 .set_num_inputs(2)
 .set_num_outputs(1)
-.set_support_level(1)
+.set_support_level(3)
 .set_attr<FTVMCompute>(
     "FTVMCompute", [](const NodeAttrs& attrs,
                       const Array<Tensor>& inputs,
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index b530883d006c..830c1b18e42c 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -116,3 +116,26 @@ def reshape(data, newshape):
     if isinstance(newshape, int):
         newshape = [newshape]
     return _make.reshape(data, list(newshape))
+
+
+def take(data, indices, axis=None):
+    """Take elements from an array along an axis.
+
+    Parameters
+    ----------
+    a : relay.Expr
+        The source array.
+
+    indices : rely.Expr
+        The indices of the values to extract.
+
+    axis : int, optional
+        The axis over which to select values. By default,
+        the flattened input array is used.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    return _make.take(data, indices, axis)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index f85fd706a52f..ac9763a0f562 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -315,5 +315,94 @@ Example::
 .set_support_level(3)
 .add_type_rel("Reshape", ReshapeRel);
 
+// Take
+TVM_REGISTER_NODE_TYPE(TakeAttrs);
+
+bool TakeRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  // `types` contains: [data, indices, result]
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  const auto* indices = types[1].as<TensorTypeNode>();
+  CHECK(indices != nullptr);
+  const auto param = attrs.as<TakeAttrs>();
+  CHECK(param != nullptr);
+
+  if (!param->axis.defined()) {
+    std::vector<IndexExpr>&& oshape = AsVector(indices->shape);
+    reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+    return true;
+  }
+
+  std::vector<IndexExpr> oshape;
+  const auto ndim_data = static_cast<int>(data->shape.size());
+  const auto ndim_indices = static_cast<int>(indices->shape.size());
+  auto axis = (*as_const_int(param->axis));
+  if (axis < 0) axis += ndim_data;
+  CHECK_LE(axis, ndim_data)
+    << "axis should be with in data shape"
+    << ", but got = " << axis;
+
+  oshape.reserve(ndim_data - 1 + ndim_indices);
+  for (int i = 0; i < axis; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  for (int i = 0; i < ndim_indices; ++i) {
+    oshape.emplace_back(indices->shape[i]);
+  }
+  for (int i = axis+1; i < ndim_data; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeTake(Expr data,
+              Expr indices,
+              IndexExpr axis) {
+  auto attrs = make_node<TakeAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("take");
+  return CallNode::make(op, {data, indices}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.take")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeTake, args, rv);
+});
+
+RELAY_REGISTER_OP("take")
+.describe(R"code(Take elements from an array along an axis.
+
+When axis is not None, this function does the same thing as 'fancy' indexing
+(indexing arrays using arrays); however, it can be easier to use if you need
+elements along a given axis.
+
+**Note** that when axis is none the flattened input array is used.
+
+Examples::
+
+  a = [[ 1, 2],
+       [ 3, 4]]
+  indices = [3, 0, 2]
+  take(a, indices) = [ 4, 1, 3]
+
+  a = [[ 1., 2.],
+       [ 3., 4.]]
+  indices = [1, 0]
+  take(a, indices, axis=1) = [[ 2., 1.],
+                              [ 4., 3.]]
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("indices", "Tensor", "The indices tensor.")
+.set_support_level(2)
+.add_type_rel("Take", TakeRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index c6b83b39c276..55717bbe23df 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -91,6 +91,27 @@ def check_single_op(opfunc):
                    tvm.relay.round, tvm.relay.abs, tvm.relay.negative]:
         check_single_op(opfunc)
 
+def test_take_infer_type():
+    def verify_take(dshape, indices_shape, oshape, axis=None):
+        ib = relay.ir_builder.IRBuilder()
+        x = ib.param("x", relay.ty.TensorType(dshape, "float32"))
+        indices = ib.param("indices", relay.ty.TensorType(indices_shape, "int32"))
+        with ib.function(x, indices) as func:
+            ib.ret(relay.take(x.var, indices.var, axis=axis))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type
+        assert ftype.ret_type == relay.ty.TensorType(oshape, "float32")
+
+    d1, d2, d3 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3")
+    d4, d5, d6 = tvm.var("d4"), tvm.var("d5"), tvm.var("d6")
+    verify_take((d1,), (1,), (1,), 0)
+    verify_take((4,), (d1, d2), (d1, d2))
+    verify_take((3, 3, 3), (1, d2), (1, d2))
+    verify_take((d1, d2), (d3, d4, d5), (d3, d4, d5, d2), 0)
+    verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1)
+    verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2)
+
 
 if __name__ == "__main__":
     test_single_op()
@@ -99,3 +120,4 @@ def check_single_op(opfunc):
     test_copy_infer_type()
     test_transpose_infer_type()
     test_reshape_infer_type()
+    test_take_infer_type()

From 34ad0940a3a3977d541499b2b13ebce028256bd2 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Tue, 9 Oct 2018 09:01:33 -0700
Subject: [PATCH 201/529] Allow override gtest library search path (#1867)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a47fe1f8b889..7bd76bbd7906 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,7 +213,7 @@ target_include_directories(
 # Tests
 set(TEST_EXECS "")
 file(GLOB TEST_SRCS tests/cpp/*.cc)
-find_library(GTEST_LIB gtest)
+find_library(GTEST_LIB gtest "$ENV{GTEST_LIB}")
 
 if(GTEST_LIB)
   foreach(__srcpath ${TEST_SRCS})

From 491875d1717bf59a991d3f573639a80b521f13dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Wed, 10 Oct 2018 14:25:38 -0700
Subject: [PATCH 202/529] [Relay] GetItem (#1861)

---
 include/tvm/relay/expr.h                      | 24 +++++++++++++++++--
 include/tvm/relay/expr_functor.h              |  4 ++++
 python/tvm/relay/__init__.py                  |  1 +
 python/tvm/relay/expr.py                      |  8 +++++++
 src/relay/ir/debug_printer.cc                 |  6 +++--
 src/relay/ir/expr.cc                          | 16 +++++++++++++
 src/relay/ir/expr_functor.cc                  | 15 ++++++++++--
 src/relay/pass/alpha_eq.cc                    |  9 +++++++
 src/relay/pass/type_functor.h                 |  9 ++++---
 src/relay/pass/type_infer.cc                  | 18 ++++++++++++++
 tests/python/relay/test_ir_debug_printer.py   |  7 +++++-
 tests/python/relay/test_ir_nodes.py           |  8 +++++++
 tests/python/relay/test_ir_well_formed.py     | 18 +++++++++++++-
 tests/python/relay/test_pass_alpha_equal.py   |  8 +++++++
 .../relay/test_pass_dead_code_elimination.py  | 16 +++++++++++++
 tests/python/relay/test_pass_free_vars.py     | 11 +++++++++
 tests/python/relay/test_type_infer.py         | 12 ++++++++++
 17 files changed, 177 insertions(+), 13 deletions(-)

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 909b702bc1a1..c6e5573d9413 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -360,8 +360,6 @@ class IfNode : public ExprNode {
   /*! \brief The expression evaluated when condition is false */
   Expr false_branch;
 
-  IfNode() {}
-
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("cond", &cond);
     v->Visit("true_branch", &true_branch);
@@ -378,6 +376,28 @@ class IfNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(If, IfNode, Expr);
 
+/*! \brief Get a field out of a tuple. */
+class TupleGetItem;
+class TupleGetItemNode : public ExprNode {
+ public:
+  /*! \brief The tuple */
+  Expr tuple;
+  /*! \brief which value to get */
+  int index;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("tuple", &tuple);
+    v->Visit("index", &index);
+  }
+
+  TVM_DLL static TupleGetItem make(Expr tuple, int index);
+
+  static constexpr const char * _type_key = "relay.GetItem";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleGetItemNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleGetItem, TupleGetItemNode, Expr);
+
 /*! \brief Print a debug representation of the expression to the stream.
  *  \param env The environment.
  *  \param e The expression
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 1da66bc95f57..be174d33b4c8 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -89,6 +89,7 @@ class ExprFunctor<R(const Expr& n, Args...)> {
                        Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const OpNode* op,
                        Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const TupleGetItemNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Node* op, Args...) {
     throw Error(std::string("Do not have a default for ") + op->type_key());
   }
@@ -108,6 +109,7 @@ class ExprFunctor<R(const Expr& n, Args...)> {
     RELAY_EXPR_FUNCTOR_DISPATCH(LetNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(IfNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(OpNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(TupleGetItemNode);
     return vtable;
   }
 };
@@ -131,6 +133,7 @@ class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
   void VisitExpr_(const LetNode* op) override;
   void VisitExpr_(const IfNode* op) override;
   void VisitExpr_(const OpNode* op) override;
+  void VisitExpr_(const TupleGetItemNode* op) override;
   virtual void VisitType(const Type& t);
 };
 
@@ -153,6 +156,7 @@ class ExprMutator
   Expr VisitExpr_(const CallNode* call_node) override;
   Expr VisitExpr_(const LetNode* op) override;
   Expr VisitExpr_(const IfNode* op) override;
+  Expr VisitExpr_(const TupleGetItemNode* op) override;
   /*! \brief Used to visit the types inside of expressions.
    *
    * Can be overloaded to transform the types in arbitrary
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index dd48d213f700..18c02a416d6b 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -39,3 +39,4 @@
 Call = expr.Call
 Let = expr.Let
 If = expr.If
+TupleGetItem = expr.TupleGetItem
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 9b292a74eccd..05214ca095d1 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -125,4 +125,12 @@ def __init__(self, cond, true_value, false_value):
         self.__init_handle_by_constructor__(
             _make.If, cond, true_value, false_value)
 
+@register_relay_node
+class TupleGetItem(Expr):
+    """An expression that get field from tuple in Relay, see tvm/relay/expr.h for more details."""
+
+    def __init__(self, tuple_, index):
+        self.__init_handle_by_constructor__(
+            _make.TupleGetItem, tuple_, index)
+
 debug_print = _expr._debug_print
diff --git a/src/relay/ir/debug_printer.cc b/src/relay/ir/debug_printer.cc
index e216faa0f195..90e82d3b2dd7 100644
--- a/src/relay/ir/debug_printer.cc
+++ b/src/relay/ir/debug_printer.cc
@@ -223,7 +223,6 @@ class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
   }
 
   Doc VisitExpr_(const CallNode* c) final {
-    auto args = DocifyExprArray(c->args);
     return Docify(c->op) + Seq("<", DocifyExprArray(c->args), ">");
   }
 
@@ -244,6 +243,10 @@ class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
     return DocOfStr(o->name);
   }
 
+  Doc VisitExpr_(const TupleGetItemNode* g) final {
+    return Docify(g->tuple) + DocOfStr(std::string(".") + std::to_string(g->index));
+  }
+
  public:
   ExprDocifier(const Environment& env) : env(env), td(env) { }
 
@@ -291,7 +294,6 @@ std::string PrintType(const Environment& env, const Type& t) {
 TVM_REGISTER_API("relay._expr._debug_print")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     NodeRef x = args[1];
-    std::cout << x << std::endl;
     if (x.as<TypeNode>()) {
       *ret = PrintType(args[0], Downcast<Type>(x));
     } else {
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index dbbb5b84fc8b..6b56cb4e844f 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -193,5 +193,21 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
             << ", " << node->false_branch << ")";
 });
 
+TupleGetItem TupleGetItemNode::make(Expr tuple, int index) {
+  NodePtr<TupleGetItemNode> n = make_node<TupleGetItemNode>();
+  n->tuple = std::move(tuple);
+  n->index = index;
+  return TupleGetItem(n);
+}
+
+TVM_REGISTER_API("relay._make.TupleGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = TupleGetItemNode::make(args[0], args[1]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleGetItemNode>([](const TupleGetItemNode* node, tvm::IRPrinter* p) {
+  p->stream << "TupleGetItemNode(" << node->tuple << ", " << node->index << ")";
+});
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index e3393bdb039b..792f99d699dd 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -150,10 +150,17 @@ Expr ExprMutator::VisitExpr_(const IfNode* op) {
   }
 }
 
-Type ExprMutator::VisitType(const Type& t) {
-  return t;
+Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) {
+  auto t = this->Mutate(g->tuple);
+  if (g->tuple == t) {
+    return GetRef<Expr>(g);
+  } else {
+    return TupleGetItemNode::make(t, g->index);
+  }
 }
 
+Type ExprMutator::VisitType(const Type& t) { return t; }
+
 void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
 }
 
@@ -206,6 +213,10 @@ void ExprVisitor::VisitExpr_(const IfNode* op) {
 
 void ExprVisitor::VisitExpr_(const OpNode* op) { return; }
 
+void ExprVisitor::VisitExpr_(const TupleGetItemNode* op) {
+  this->VisitExpr(op->tuple);
+}
+
 void ExprVisitor::VisitType(const Type& t) { return; }
 
 }  // namespace relay
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 3c4c3d78063f..0e13a598ca3a 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -335,6 +335,15 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
       equal = false;
     }
   }
+
+  void VisitExpr_(const TupleGetItemNode* op, const Expr& e2) final {
+    if (const TupleGetItemNode* proj = e2.as<TupleGetItemNode>()) {
+      this->VisitExpr(op->tuple, proj->tuple);
+      equal = equal && (op->index == proj->index);
+    } else {
+      equal = false;
+    }
+  }
 };
 
 bool AlphaEqual(const Expr& e1, const Expr& e2) {
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index a451fbe16984..70a2d9347eab 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -8,7 +8,6 @@
 
 #include <tvm/node/ir_functor.h>
 #include <tvm/relay/expr.h>
-#include <tvm/relay/error.h>
 #include <string>
 
 namespace tvm {
@@ -21,11 +20,11 @@ class TypeFunctor;
 #define TYPE_FUNCTOR_DEFAULT \
   { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
 
-#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                       \
-  vtable.template set_dispatch<OP>(                           \
-      [](const NodeRef& n, TSelf* self, Args... args) {       \
+#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                                   \
+  vtable.template set_dispatch<OP>(                                       \
+      [](const NodeRef& n, TSelf* self, Args... args) {                   \
         return self->VisitType_(static_cast<const OP*>(n.node_.get()),    \
-                                std::forward<Args>(args)...); \
+                                std::forward<Args>(args)...);             \
       });
 
 template <typename R, typename... Args>
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 1e2100fa902e..72bdaf69f061 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -119,6 +119,20 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     return TupleTypeNode::make(fields);
   }
 
+  Type VisitExpr_(const TupleGetItemNode* op) final {
+    // TODO(M.K.)
+    // handle case where field type is not known
+    Type tuple_type = GetType(op->tuple);
+    auto tuple_ty_node = tuple_type.as<TupleTypeNode>();
+    if (!tuple_ty_node) {
+      LOG(FATAL) << "only expressions with tuple types is accepted" << GetRef<TupleGetItem>(op);
+    }
+    if (static_cast<int>(tuple_ty_node->fields.size()) <= op->index) {
+      LOG(FATAL) << "tuple not big enough" << GetRef<TupleGetItem>(op);
+    }
+    return tuple_ty_node->fields[op->index];
+  }
+
   Type VisitExpr_(const OpNode* op) final {
     return op->op_type;
   }
@@ -293,6 +307,10 @@ class TypeInferencer::Resolver : public ExprMutator {
     return AttachCheckedType(op);
   }
 
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    return AttachCheckedType(op);
+  }
+
   Expr VisitExpr_(const ParamNode* op) final {
     return ExprMutator::VisitExpr_(op);
   }
diff --git a/tests/python/relay/test_ir_debug_printer.py b/tests/python/relay/test_ir_debug_printer.py
index 2ea0b7575ff8..e5f9ad2e69cd 100644
--- a/tests/python/relay/test_ir_debug_printer.py
+++ b/tests/python/relay/test_ir_debug_printer.py
@@ -77,7 +77,7 @@ def test_call():
 
 def test_let():
     lv = relay.Var('x')
-    ty = relay.ty.TensorType((10, 20), "float32")
+    ty = relay.ty.TensorType((10, 20), 'float32')
     arr = tvm.nd.array(10)
     value = relay.Constant(arr)
     let = relay.Let(lv, value, lv, ty)
@@ -90,3 +90,8 @@ def test_if():
     right = relay.Var('right')
     ife = relay.If(cond, left, right)
     show(ife)
+
+def test_tuple_get_item():
+    t = relay.Var('t')
+    g = relay.TupleGetItem(t, 0)
+    show(g)
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index d3dae9b2c3f8..79883ed225e0 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -175,6 +175,13 @@ def test_if():
     str(ife)
 
 
+def test_tuple_get_item():
+    tup = relay.Var("tuple")
+    get = relay.TupleGetItem(tup, 1)
+    assert get.tuple == tup
+    assert get.index == 1
+    str(get)
+
 if __name__ == "__main__":
     test_bad_constructor()
     test_span()
@@ -192,3 +199,4 @@ def test_if():
     test_call()
     test_let()
     test_if()
+    test_tuple_get_item()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
index 8bdef4d0edb5..c6cb99662bb5 100644
--- a/tests/python/relay/test_ir_well_formed.py
+++ b/tests/python/relay/test_ir_well_formed.py
@@ -3,7 +3,7 @@
 from tvm.relay.ir_pass import well_formed
 
 def test_well_formed():
-    x = relay.Var("x")
+    x = relay.Var('x')
     assert well_formed(x)
     v = relay.Constant(tvm.nd.array(10))
     ty = None
@@ -16,3 +16,19 @@ def test_well_formed():
     # but we want all binder to be distinct from each other.
     assert not well_formed(relay.Let(relay.Var("y"), f,
                                      relay.Let(relay.Var("z"), f, v, ty), ty))
+
+
+def test_tuple():
+    x = relay.Var('x')
+    assert well_formed(x)
+    v = relay.Constant(tvm.nd.array(10))
+    ty = None
+    let = relay.Let(x, v, x, ty)
+    assert well_formed(let)
+    assert well_formed(relay.Tuple([v, v]))
+    assert not well_formed(relay.Tuple([let, let]))
+
+
+def test_tuple_get_item():
+    t = relay.Var('t')
+    assert well_formed(relay.TupleGetItem(t, 2))
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 93f8a8fbc0b3..9fa1a554a6e2 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -167,11 +167,19 @@ def test_type_relation_alpha_equal():
 
     assert bigger != diff_num_inputs
 
+def test_tuple_get_item_alpha_equal():
+    x = relay.Var('x')
+    y = relay.Var('y')
+    assert not alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(y, 1))
+    assert not alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 2))
+    assert alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1))
 
 if __name__ == "__main__":
     test_tensor_type_alpha_equal()
     test_incomplete_type_alpha_equal()
+    test_constant_alpha_equal()
     test_type_param_alpha_equal()
     test_func_type_alpha_equal()
     test_tuple_type_alpha_equal()
     test_type_relation_alpha_equal()
+    test_tuple_get_item_alpha_equal()
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index db73fb5c585f..ce9bda3d254f 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -4,6 +4,7 @@
 from tvm.relay.ir_builder import convert, IRBuilder
 from tvm.relay.op import log, add, equal, subtract
 
+
 class env:
     def __init__(self):
         self.a = relay.Var("a")
@@ -22,20 +23,25 @@ def __init__(self):
         self.two = convert(2.0)
         self.three = convert(3.0)
 
+
 e = env()
 
+
 def test_let():
     orig = relay.Let(e.x, e.y, e.z, e.tt)
     assert alpha_equal(dead_code_elimination(orig), e.z)
 
+
 def test_used_let():
     orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c, e.tt), e.tt)
     assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.d, e.c, e.tt))
 
+
 def test_chain_unused_let():
     orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e, e.tt), e.tt)
     assert alpha_equal(dead_code_elimination(orig), e.e)
 
+
 # make sure we dont infinite loop
 def test_recursion():
     """
@@ -60,14 +66,23 @@ def test_recursion():
     assert alpha_equal(dead_code_elimination(orig), orig)
     assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three, e.float32)), e.three)
 
+
 def test_op_let():
     assert alpha_equal(dead_code_elimination(add(relay.Let(e.a, e.one, e.three, e.float32), e.two)), add(e.three, e.two))
 
+
 def test_if():
     orig = relay.If(convert(True), e.a, e.b)
     assert alpha_equal(dead_code_elimination(orig), e.a)
 
 
+def test_tuple_get_item():
+    t = relay.Var('t')
+    g = relay.TupleGetItem(t, 0)
+    assert alpha_equal(dead_code_elimination(g), g)
+    assert alpha_equal(dead_code_elimination(relay.TupleGetItem(relay.Let(e.a, e.one, t, e.float32), 0)), g)
+
+
 if __name__ == "__main__":
     test_let()
     test_used_let()
@@ -75,3 +90,4 @@ def test_if():
     test_recursion()
     test_op_let()
     test_if()
+    test_tuple_get_item()
diff --git a/tests/python/relay/test_pass_free_vars.py b/tests/python/relay/test_pass_free_vars.py
index 002646ada582..989c9f8d25db 100644
--- a/tests/python/relay/test_pass_free_vars.py
+++ b/tests/python/relay/test_pass_free_vars.py
@@ -15,6 +15,17 @@ def test_free_vars():
     f = relay.Function([relay.Param(x, ty)], ty, x)
     assert len(free_vars(f)) == 0
 
+
+def test_tuple():
+    t = relay.Var('t')
+    fv = free_vars(relay.Tuple([t, t]))
+    assert len(fv) == 1
+    assert fv[0] == t
+    fv = free_vars(relay.TupleGetItem(t, 123))
+    assert len(fv) == 1
+    assert fv[0] == t
+
+
 def test_free_type_vars():
     tp = relay.TypeParam("")
     ty = relay.TupleType([tp, relay.TensorType([], "int32")])
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 6629932921f8..77b04590df59 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -9,6 +9,7 @@
 from tvm.relay.env import Environment
 from tvm.relay.op import log, add, equal, subtract, concatenate
 from tvm.relay.expr import Function
+from tvm import relay
 
 def assert_has_type(expr, typ, env=Environment({})):
     checked_expr = infer_type(env, expr)
@@ -110,6 +111,16 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     fn_ty = func_type([tensor_type(3, 2), tensor_type(2, 2)], tensor_type(5, 2))
     assert_decl_has_type(ib.env, try_concat2, fn_ty)
 
+def test_tuple():
+    ib = IRBuilder()
+    dup = ib.global_var('dup')
+    x = ib.param('x')
+    with ib.decl(dup, x):
+        ib.ret(relay.Tuple([x, x]))
+    # todo: why is this not generalized?
+    fn_ty = func_type([tensor_type()], relay.TupleType([tensor_type(), tensor_type()]))
+    assert_decl_has_type(ib.env, dup, fn_ty)
+
 if __name__ == "__main__":
     test_dual_op()
     test_recursion()
@@ -117,3 +128,4 @@ def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
     test_decl()
     test_recursion()
     test_concat()
+    test_tuple()

From 35161c2b441cbb611aa0ed69fa04082acb34f9c6 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Wed, 10 Oct 2018 17:14:06 -0700
Subject: [PATCH 203/529] Add instructions to run tests locally (#1868)

---
 docs/contribute/pull_request.rst | 47 ++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
index 80a0448c08dd..c83edc6cf7d1 100644
--- a/docs/contribute/pull_request.rst
+++ b/docs/contribute/pull_request.rst
@@ -24,3 +24,50 @@ This is a quick guide to submit a pull request, please also refer to the detaile
   - The detailed guidelines and summarizes useful lessons.
 
 - The patch can be merged after the reviewers approve the pull request.
+
+Testing
+-------
+Even though we have hooks to run unit tests automatically for each pull request, It's always recommended to run unit tests
+locally beforehand to reduce reviewers' burden and speedup review process.
+
+C++
+^^^
+.. code:: bash
+
+  # assume you are in tvm source root
+  TVM_ROOT=`pwd`
+
+  # you need to install google test first, gtest will be installed to $TVM_ROOT/lib
+  CACHE_PREFIX=. make -f 3rdparty/dmlc-core/scripts/packages.mk gtest
+
+  mkdir build
+  cd build
+  GTEST_LIB=$TVM_ROOT/lib cmake ..
+  make cpptest -j
+  for test in *_test; do
+    ./$test || exit -1
+  done
+
+Python
+^^^^^^
+If you want to run all tests:
+
+.. code:: bash
+
+  # build tvm
+  make
+
+  ./tests/scripts/task_python_unittest.sh
+
+If you want to run a single test:
+
+.. code:: bash
+
+  # build tvm
+  make
+
+  # let python know where to find tvm related libraries
+  export PYTHONPATH=python:topi/python
+  rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
+
+  TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py
\ No newline at end of file

From e114959d1a2600f577615ab14e02f4dfc2e20b2e Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 11 Oct 2018 05:48:05 +0530
Subject: [PATCH 204/529]  [RELAY][OPS]LRN and L2_Normalize (#1860)

---
 docs/langref/relay_op.rst            |  5 ++
 include/tvm/relay/attrs/nn.h         | 38 ++++++++++++++
 python/tvm/relay/op/nn/nn.py         | 63 +++++++++++++++++++++++
 src/relay/op/nn/nn.cc                | 74 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level1.py | 26 ++++++++++
 5 files changed, 206 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index d5f92f567b17..ff560be340a2 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -39,6 +39,7 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.sigmoid
    tvm.relay.nn.relu
 
+
 **Level 2: Convolutions**
 
 This level enables typical convnet models.
@@ -53,6 +54,8 @@ This level enables typical convnet models.
    tvm.relay.nn.global_avg_pool2d
    tvm.relay.nn.upsampling
    tvm.relay.nn.batch_flatten
+   tvm.relay.nn.lrn
+   tvm.relay.nn.l2_normalize
 
 
 **Level 3: Additional Math And Transform Operators**
@@ -131,6 +134,8 @@ Level 2 Definitions
 .. autofunction:: tvm.relay.nn.global_avg_pool2d
 .. autofunction:: tvm.relay.nn.upsampling
 .. autofunction:: tvm.relay.nn.batch_flatten
+.. autofunction:: tvm.relay.nn.lrn
+.. autofunction:: tvm.relay.nn.l2_normalize
 
 
 Level 3 Definitions
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 45f1d2d41cfc..ce80407f15c6 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -173,6 +173,44 @@ struct UpSamplingAttrs : public tvm::AttrsNode<UpSamplingAttrs> {
 };
 
 
+
+
+/*! \brief Attributes for LRN operator */
+struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
+  IndexExpr size;
+  IndexExpr axis;
+  double bias;
+  double alpha;
+  double beta;
+
+  TVM_DECLARE_ATTRS(LRNAttrs, "relay.attrs.LRNAttrs") {
+    TVM_ATTR_FIELD(size).set_default(5)
+      .describe("The size of the local region to be considered for normalization.");
+    TVM_ATTR_FIELD(axis).set_default(1)
+      .describe("Axis of input data layout channel.");
+    TVM_ATTR_FIELD(bias).set_default(2)
+      .describe("The offset parameter to avoid division by 0.");
+    TVM_ATTR_FIELD(alpha).set_default(0.0001)
+      .describe("The scaling parameter.");
+    TVM_ATTR_FIELD(beta).set_default(0.75)
+      .describe("The exponent parameter.");
+  }
+};
+
+
+/*! \brief Attributes for L2Normalize operator */
+struct L2NormalizeAttrs : public tvm::AttrsNode<L2NormalizeAttrs> {
+  double eps;
+  Array<IndexExpr> axis;
+
+  TVM_DECLARE_ATTRS(L2NormalizeAttrs, "relay.attrs.L2NormalizeAttrs") {
+    TVM_ATTR_FIELD(eps)
+      .describe("A lower bound value for the norm, to avoid division by 0.");
+    TVM_ATTR_FIELD(axis)
+      .describe("Axis over the normalization applied.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_NN_H_
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 61ed6f64a91b..7985d57c9edb 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -383,3 +383,66 @@ def relu(data):
         The computed result.
     """
     return _make.relu(data)
+
+
+def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
+    """This operator takes data as input and does local response normalization.
+
+    Normalize the input in a local region across or within feature maps.
+    Each input value is divided by (data / (bias + (alpha * sum_data ^2 /size))^beta)
+    where n is the size of each local region, and the sum is taken over the region
+    centered at that value (zero padding is added where necessary).
+
+    .. math::
+        (data / (bias + (alpha * sum_data ^2 /size))^beta)
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    size : int, optional
+        The size of the local region to be considered for normalization.
+
+    axis : int, optional
+        Input data layout channel axis. Default value is 1 for NCHW format
+
+    bias : float, optional
+        The offset parameter to avoid dividing by 0.
+
+    alpha : float, optional
+        The scaling parameter.
+
+    beta : float, optional
+        The exponent parameter.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+
+    return _make.lrn(data, size, axis, alpha, beta, bias)
+
+def l2_normalize(data, eps, axis=None):
+    """Perform L2 normalization on the input data
+
+    .. math::
+        y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    eps : float
+        epsilon value
+
+    axis : list of int, optional
+        axis over the normalization applied
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.l2_normalize(data, eps, axis)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 7f6b0ee79e3f..f2439b9fb7ca 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -143,5 +143,79 @@ RELAY_REGISTER_UNARY_OP("relay.op.nn._make.", "relu")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
+
+// Positional relay function to create LRN operator used by frontend FFI.
+Expr MakeLRN(Expr data,
+             IndexExpr size,
+             IndexExpr axis,
+             double alpha,
+             double beta,
+             double bias) {
+  auto attrs = make_node<LRNAttrs>();
+  attrs->size = size;
+  attrs->axis = axis;
+  attrs->alpha = alpha;
+  attrs->beta = beta;
+  attrs->bias = bias;
+  static const Op& op = Op::Get("nn.lrn");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.lrn")
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
+      runtime::detail::unpack_call<Expr, 6>(MakeLRN, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.lrn")
+    .describe(R"code(LRN layer.
+
+Normalize the input in a local region across or within feature maps.
+Each input value is divided by (1 + (\alpha/n) \sum_i x_i^2)^\beta,
+where n is the size of each local region, and the sum is taken over the region
+centered at that value (zero padding is added where necessary).
+
+.. math::
+
+    data / (bias + (alpha * sum_data ^2 /size))^beta
+
+- **data**: The input tensor.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("Identity", IdentityRel);
+
+
+// Positional relay function to create L2Normalize operator used by frontend FFI.
+Expr MakeL2Normalize(Expr data,
+                     double eps,
+                     Array<IndexExpr> axis) {
+  auto attrs = make_node<L2NormalizeAttrs>();
+  attrs->eps = eps;
+  attrs->axis = std::move(axis);
+  static const Op& op = Op::Get("nn.l2_normalize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.l2_normalize")
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
+      runtime::detail::unpack_call<Expr, 3>(MakeL2Normalize, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.l2_normalize")
+    .describe(R"code(L2 Normalization layer.
+
+Normalizes along dimension axis using an L2 norm
+
+.. math::
+    output = x / sqrt(max(sum(x^2), epsilon))
+
+- **data**: The input tensor.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("Identity", IdentityRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index cc4f1662e915..a90f6eb55ae1 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -168,6 +168,30 @@ def test_concatenate_infer_type():
     assert ftype.ret_type == relay.ty.TensorType(
         (n, t + t, 100), "float32")
 
+def test_lrn():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c , h, w), "float32")
+
+
+def test_l2_normalize():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.nn.l2_normalize(x, eps=0.001, axis=[1]))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c , h, w), "float32")
 
 if __name__ == "__main__":
     test_unary_op()
@@ -178,3 +202,5 @@ def test_concatenate_infer_type():
     test_log_softmax()
     test_binary_op()
     test_binary_broadcast_op()
+    test_lrn()
+    test_l2_normalize()

From 15e994d99fa8eca3bae442e3a5a2216ef934ef5e Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Wed, 10 Oct 2018 17:27:41 -0700
Subject: [PATCH 205/529] [Relay][Op] Add operators full and full_like (#1845)

---
 docs/langref/relay_op.rst            |  3 +-
 include/tvm/relay/attrs/transform.h  | 14 +++++
 python/tvm/relay/op/transform.py     | 43 ++++++++++++-
 src/relay/op/tensor/transform.cc     | 94 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 49 +++++++++++++++
 5 files changed, 201 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index ff560be340a2..97f2d4cb9fea 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -77,7 +77,8 @@ This level enables additional math and transform operators.
    tvm.relay.abs
    tvm.relay.negative
    tvm.relay.take
-
+   tvm.relay.full
+   tvm.relay.full_like
 
 
 **Level 4: Broadcast and Reductions**
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 5c4cbca4a4a8..080a375cf1e2 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -68,6 +68,20 @@ struct TakeAttrs : public tvm::AttrsNode<TakeAttrs> {
   }
 };
 
+/*! \brief Attributes used in full operator */
+struct FullAttrs : public tvm::AttrsNode<FullAttrs> {
+  Array<IndexExpr> shape;
+  DataType dtype;
+
+  TVM_DECLARE_ATTRS(FullAttrs, "relay.attrs.FullAttrs") {
+    TVM_ATTR_FIELD(shape)
+      .describe("Target shape.");
+    TVM_ATTR_FIELD(dtype)
+      .describe("Target data type.")
+      .set_default(Int(0));
+  }
+};  // struct FullAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 830c1b18e42c..757297db9109 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -18,7 +18,7 @@ def expand_dims(data, axis, num_newaxis=1):
         If `axis >= 0`, it is the last axis inserted in Python's negative indexing.
 
     num_newaxis : int
-        Number of axises to be inserted. Should be >= 0.
+        Number of axes to be inserted. Should be >= 0.
 
     Returns
     -------
@@ -139,3 +139,44 @@ def take(data, indices, axis=None):
         The computed result.
     """
     return _make.take(data, indices, axis)
+
+
+def full(fill_value, shape=(), dtype=""):
+    """Fill array with scalar value.
+
+    Parameters
+    ----------
+    fill_value : relay.Expr
+        The value to fill. Must be a scalar.
+
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type, optional (defaults to data type of the fill value)
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.full(fill_value, shape, dtype)
+
+
+def full_like(data, fill_value):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    fill_value : relay.Expr
+        The scalar value to fill.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.full_like(data, fill_value)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index ac9763a0f562..663dd5c38ec5 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -404,5 +404,99 @@ Examples::
 .set_support_level(2)
 .add_type_rel("Take", TakeRel);
 
+TVM_REGISTER_NODE_TYPE(FullAttrs);
+
+bool FullRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const FullAttrs* param = attrs.as<FullAttrs>();
+  const auto* fill_value = types[0].as<TensorTypeNode>();
+  if (fill_value == nullptr) {
+    return false;
+  }
+
+  DataType out_dtype = param->dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = fill_value->dtype;
+  }
+
+  CHECK_EQ(fill_value->shape.size(), 0)
+    << "Fill value should be a scalar but has dimension "
+    << fill_value->shape.size() << ".";
+
+  reporter->Assign(types[1], TensorTypeNode::make(param->shape, out_dtype));
+  return true;
+}
+
+Expr MakeFull(Expr fill_value,
+              Array<IndexExpr> shape,
+              DataType dtype) {
+  auto attrs = make_node<FullAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("full");
+  return CallNode::make(op, {fill_value}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.full")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeFull, args, rv);
+});
+
+RELAY_REGISTER_OP("full")
+.describe(R"code(Fill array with scalar value.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("fill_value", "double", "The value to fill.")
+.set_support_level(3)
+.add_type_rel("Full", FullRel);
+
+bool FullLikeRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* fill_value = types[1].as<TensorTypeNode>();
+  if (fill_value == nullptr) {
+    return false;
+  }
+
+  CHECK_EQ(fill_value->shape.size(), 0)
+    << "The fill value should be a scalar but here it has dimension "
+    << fill_value->shape.size() << ".";
+
+  reporter->Assign(types[2], TensorTypeNode::make(data->shape, data->dtype));
+  return true;
+}
+
+Expr MakeFullLike(Expr data,
+                  Expr fill_value) {
+  static const Op& op = Op::Get("full_like");
+  return CallNode::make(op, {data, fill_value}, Attrs(), {});
+}
+
+TVM_REGISTER_API("relay.op._make.full_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeFullLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("full_like")
+.describe(R"code(Return an scalar value array with the same shape
+and type as the input array.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("fill_value", "double", "Scalar value to fill.")
+.set_support_level(3)
+.add_type_rel("FullLike", FullLikeRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 55717bbe23df..cc8973c38384 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -113,6 +113,53 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
     verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2)
 
 
+def test_full():
+    # default settings: match input dtype
+    ib = relay.ir_builder.IRBuilder()
+    x = ib.param("x", relay.TensorType((), "int8"))
+    with ib.function(x) as func:
+        ib.ret(relay.full(x.var, ()))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((), "int8")
+
+    # change the shape and dtype
+    ib = relay.ir_builder.IRBuilder()
+    x = ib.param("x", relay.TensorType((), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.full(x.var, (1, 2), "int8"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((1, 2), "int8")
+
+
+def test_full_like():
+    # concrete shape
+    ib = relay.ir_builder.IRBuilder()
+    base = ib.param("base", relay.TensorType((1, 2, 3), "float32"))
+    fill = ib.param("fill", relay.TensorType((), "float32"))
+    with ib.function(base, fill) as func:
+        ib.ret(relay.full_like(base.var, fill.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((1, 2, 3), "float32")
+
+    # symbolic shape
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    base = ib.param("base", relay.TensorType((n, c, h, w), "float32"))
+    fill = ib.param("fill", relay.TensorType((), "float32"))
+    with ib.function(base, fill) as func:
+        ib.ret(relay.full_like(base.var, fill.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((n, c, h, w), "float32")
+
+
 if __name__ == "__main__":
     test_single_op()
     test_unary_identity()
@@ -121,3 +168,5 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
     test_transpose_infer_type()
     test_reshape_infer_type()
     test_take_infer_type()
+    test_full()
+    test_full_like()

From d27a534b89c01893bd13da1ff835b6327e6f694b Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Wed, 10 Oct 2018 17:28:23 -0700
Subject: [PATCH 206/529] Use new onnx API to load model from file (#1874)

---
 nnvm/tests/python/frontend/onnx/test_forward.py | 2 +-
 nnvm/tests/python/frontend/onnx/test_graph.py   | 2 +-
 tutorials/nnvm/from_onnx.py                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 187e6c175cd4..7ca520a88b12 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -66,7 +66,7 @@ def get_caffe2_output(model, x, dtype='float32'):
 def verify_onnx_forward_impl(graph_file, data_shape, out_shape):
     dtype = 'float32'
     x = np.random.uniform(size=data_shape)
-    model = onnx.load(graph_file)
+    model = onnx.load_model(graph_file)
     c2_out = get_caffe2_output(model, x, dtype)
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype)
diff --git a/nnvm/tests/python/frontend/onnx/test_graph.py b/nnvm/tests/python/frontend/onnx/test_graph.py
index 0aad9d22f1be..b3961c1a38fd 100755
--- a/nnvm/tests/python/frontend/onnx/test_graph.py
+++ b/nnvm/tests/python/frontend/onnx/test_graph.py
@@ -6,7 +6,7 @@
 from model_zoo import squeezenet as squeezenet
 
 def compare_graph(onnx_file, nnvm_sym, ishape):
-    onnx_model = onnx.load(onnx_file)
+    onnx_model = onnx.load_model(onnx_file)
     onnx_sym, params = nnvm.frontend.from_onnx(onnx_model)
     g1 = nnvm.graph.create(onnx_sym)
     g2 = nnvm.graph.create(nnvm_sym)
diff --git a/tutorials/nnvm/from_onnx.py b/tutorials/nnvm/from_onnx.py
index df8dee8272ce..0fdef8afa98c 100644
--- a/tutorials/nnvm/from_onnx.py
+++ b/tutorials/nnvm/from_onnx.py
@@ -46,7 +46,7 @@ def download(url, path, overwrite=False):
                      'super_resolution_0.2.onnx'])
 download(model_url, 'super_resolution.onnx', True)
 # now you have super_resolution.onnx on disk
-onnx_model = onnx.load('super_resolution.onnx')
+onnx_model = onnx.load_model('super_resolution.onnx')
 # we can load the graph as NNVM compatible model
 sym, params = nnvm.frontend.from_onnx(onnx_model)
 

From 85d26af43325cd0f113a14a49e2a19c5c95bd5a1 Mon Sep 17 00:00:00 2001
From: Nick Hynes <nhynes@berkeley.edu>
Date: Wed, 10 Oct 2018 21:53:16 -0700
Subject: [PATCH 207/529] Update SGX example (#1879)

---
 apps/sgx/Makefile                             |  2 +-
 apps/sgx/README.md                            | 10 ++++--
 apps/sgx/enclave/Cargo.toml                   |  3 +-
 apps/sgx/enclave/Makefile                     | 13 ++++++--
 apps/sgx/enclave/Xargo.toml                   |  8 ++---
 apps/sgx/enclave/enclave_config.xml.in        |  4 +--
 apps/sgx/enclave/sgx-deps.diff                | 13 ++++++++
 .../sgx/enclave/x86_64-unknown-linux-sgx.json | 31 +++++++++++++++++++
 apps/sgx/run_example.sh                       | 10 ++++--
 docker/install/ubuntu_install_rust.sh         |  4 +--
 docker/install/ubuntu_install_sgx.sh          | 10 +++---
 rust/src/lib.rs                               |  1 -
 12 files changed, 82 insertions(+), 27 deletions(-)
 create mode 100644 apps/sgx/enclave/sgx-deps.diff
 create mode 100644 apps/sgx/enclave/x86_64-unknown-linux-sgx.json

diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
index 875897b82d23..422d3e4f03ab 100644
--- a/apps/sgx/Makefile
+++ b/apps/sgx/Makefile
@@ -4,7 +4,7 @@ SGX_MODE ?= SIM
 DEBUG ?= true
 NUM_THREADS ?= 4
 
-TVM_DIR ?= ../..
+TVM_DIR ?= $(shell git rev-parse --show-toplevel)
 
 export
 
diff --git a/apps/sgx/README.md b/apps/sgx/README.md
index 7d642422ec6e..10989ba4b90d 100644
--- a/apps/sgx/README.md
+++ b/apps/sgx/README.md
@@ -22,7 +22,7 @@ Check out the `/tvm/install/ubuntu_install_sgx.sh` for the commands to get these
 If using Docker, start by running
 
 ```
-git clone https://github.com/dmlc/tvm.git
+git clone --recursive https://github.com/dmlc/tvm.git
 docker run --rm -it -v $(pwd)/tvm:/mnt tvmai/ci-cpu /bin/bash
 ```
 then, in the container
@@ -31,10 +31,14 @@ cd /mnt
 mkdir build && cd build
 cmake .. -DUSE_LLVM=ON -DUSE_SGX=/opt/sgxsdk -DRUST_SGX_SDK=/opt/rust-sgx-sdk
 make -j4
-cd ../apps/sgx
+cd ..
+pip install -e python -e topi/python -e nnvm/python
+cd apps/sgx
 ```
 
-`bash run_example.sh`
+Once TVM is build and installed, just
+
+`./run_example.sh`
 
 If everything goes well, you should see a lot of build messages and below them
 the text `It works!`.
diff --git a/apps/sgx/enclave/Cargo.toml b/apps/sgx/enclave/Cargo.toml
index 9a14c76c5897..cb128f3fbf94 100644
--- a/apps/sgx/enclave/Cargo.toml
+++ b/apps/sgx/enclave/Cargo.toml
@@ -8,8 +8,7 @@ crate-type = ["staticlib"]
 
 [dependencies]
 lazy_static = "1.1.0"
-# tvm = { path = "../../../rust", default-features = false, features = ["sgx"] }
-tvm = { path = "/home/nhynes/myelin/deps/tvm-rs", default-features = false, features = ["sgx"] }
+tvm = { path = "../../../rust", default-features = false, features = ["sgx"] }
 
 [profile.release]
 lto = true
diff --git a/apps/sgx/enclave/Makefile b/apps/sgx/enclave/Makefile
index e8515356238a..a28e05e03b13 100644
--- a/apps/sgx/enclave/Makefile
+++ b/apps/sgx/enclave/Makefile
@@ -13,7 +13,9 @@ else
 	debug := debug
 endif
 
-target/x86_64-unknown-linux-sgx/$(debug)/libmodel-enclave.a: $(build_dir)/libmodel.a **/*
+target=target/x86_64-unknown-linux-sgx/$(debug)/libmodel-enclave.a
+
+$(target): $(build_dir)/libmodel.a **/* $(TVM_DIR)/rust/patched.txt
 	RUST_TARGET_PATH=$(shell pwd) \
 		RUST_TARGET_DIR=$(shell pwd)/target \
 		RUSTFLAGS="-Z force-unstable-if-unmarked" \
@@ -21,11 +23,16 @@ target/x86_64-unknown-linux-sgx/$(debug)/libmodel-enclave.a: $(build_dir)/libmod
 		BUILD_DIR=../build \
 		xargo build --target x86_64-unknown-linux-sgx $(xargo_args) -q
 
+$(TVM_DIR)/rust/patched.txt: $(shell pwd)/sgx-deps.diff
+	echo $(TVM_DIR)
+	cd $(TVM_DIR) && git apply $<
+	touch $@
+
 $(build_dir)/libmodel.a: $(build_dir)/model.o
-	llvm-ar cr $@ $^
+	$(AR) cr $@ $^
 
 $(build_dir)/model.o: $(build_dir)/model.bc
-	clang -c $< -o $@ -fPIC -O3
+	$(CC) -c $< -o $@ -fPIC -O3
 	objcopy --globalize-symbol __tvm_module_startup $@
 
 $(build_dir)/model.bc: src/build_model.py
diff --git a/apps/sgx/enclave/Xargo.toml b/apps/sgx/enclave/Xargo.toml
index 1fd50d699264..57acf092b4d6 100644
--- a/apps/sgx/enclave/Xargo.toml
+++ b/apps/sgx/enclave/Xargo.toml
@@ -4,14 +4,10 @@ panic_unwind = {}
 panic_abort = {}
 
 [dependencies.std]
+path = "/opt/rust-sgx-sdk/xargo/sgx_tstd"
 features = ["backtrace", "stdio", "untrusted_time"]
-path = "/home/nhynes/myelin/deps/rust-sgx-sdk/xargo/sgx_tstd"
-# git = "https://github.com/oasislabs/rust-sgx-sdk"
-# rev = "7334c30d85cb1752577998705110b7b27c69b570"
 stage = 2
 
 [dependencies.xargo_sgx_rand]
-# git = "https://github.com/oasislabs/rust-sgx-sdk"
-path = "/home/nhynes/myelin/deps/rust-sgx-sdk/xargo/sgx_rand"
-# rev = "7334c30d85cb1752577998705110b7b27c69b570"
+path = "/opt/rust-sgx-sdk/xargo/sgx_rand"
 stage = 3
diff --git a/apps/sgx/enclave/enclave_config.xml.in b/apps/sgx/enclave/enclave_config.xml.in
index d49b6693f231..2423f93086b8 100644
--- a/apps/sgx/enclave/enclave_config.xml.in
+++ b/apps/sgx/enclave/enclave_config.xml.in
@@ -1,8 +1,8 @@
 <EnclaveConfiguration>
   <ProdID>0</ProdID>
   <ISVSVN>0</ISVSVN>
-  <StackMaxSize>0x100000</StackMaxSize>
-  <HeapMaxSize>0xf0000000</HeapMaxSize>
+  <StackMaxSize>0x20000</StackMaxSize>
+  <HeapMaxSize>0x5000000</HeapMaxSize>
   <TCSNum>NUM_THREADS</TCSNum>
   <TCSPolicy>0</TCSPolicy> <!-- must be "bound" to use thread_local -->
   <DisableDebug>0</DisableDebug>
diff --git a/apps/sgx/enclave/sgx-deps.diff b/apps/sgx/enclave/sgx-deps.diff
new file mode 100644
index 000000000000..1c67e7957f38
--- /dev/null
+++ b/apps/sgx/enclave/sgx-deps.diff
@@ -0,0 +1,13 @@
+diff --git a/rust/Cargo.toml b/rust/Cargo.toml
+index 0819e0c7..e56f4ef2 100644
+--- a/rust/Cargo.toml
++++ b/rust/Cargo.toml
+@@ -14,7 +14,7 @@ default = ["nom/std"]
+ sgx = ["nom/alloc"]
+ 
+ [dependencies]
+-bounded-spsc-queue = "0.4.0"
++bounded-spsc-queue = { git = "https://github.com/nhynes/bounded-spsc-queue", branch = "sgx" }
+ error-chain = { version = "0.12.0", default-features = false }
+ itertools = "0.7.8"
+ lazy_static = "1.1.0"
diff --git a/apps/sgx/enclave/x86_64-unknown-linux-sgx.json b/apps/sgx/enclave/x86_64-unknown-linux-sgx.json
new file mode 100644
index 000000000000..6cbb524f4439
--- /dev/null
+++ b/apps/sgx/enclave/x86_64-unknown-linux-sgx.json
@@ -0,0 +1,31 @@
+{
+  "arch": "x86_64",
+  "cpu": "x86-64",
+  "data-layout": "e-m:e-i64:64-f80:128-n8:16:32:64-S128",
+  "dynamic-linking": true,
+  "env": "sgx",
+  "exe-allocation-crate": "alloc_system",
+  "executables": true,
+  "has-elf-tls": true,
+  "has-rpath": true,
+  "linker-flavor": "gcc",
+  "linker-is-gnu": true,
+  "llvm-target": "x86_64-unknown-linux-gnu",
+  "max-atomic-width": 64,
+  "os": "linux",
+  "position-independent-executables": true,
+  "pre-link-args": {
+    "gcc": [
+      "-Wl,--as-needed",
+      "-Wl,-z,noexecstack",
+      "-m64"
+    ]
+  },
+  "relro-level": "full",
+  "stack-probes": true,
+  "target-c-int-width": "32",
+  "target-endian": "little",
+  "target-family": "unix",
+  "target-pointer-width": "64",
+  "vendor": "unknown"
+}
diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh
index cc6f22f24e00..811da3938dd6 100755
--- a/apps/sgx/run_example.sh
+++ b/apps/sgx/run_example.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 
 sgx_sdk=${SGX_SDK:=/opt/sgxsdk}
-LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} make
-printf "\n"
-LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python3 run_model.py
+
+export LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH}
+export CC=clang-6.0
+export AR=llvm-ar-6.0
+export TVM_CACHE_DIR=/tmp
+
+make && printf "\n" && python3 run_model.py
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 6ca56acadf74..bd7fcb9a4ded 100644
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -2,8 +2,8 @@ apt-get update && apt-get install -y --no-install-recommends --force-yes curl
 
 export RUSTUP_HOME=/opt/rust
 export CARGO_HOME=/opt/rust
-# rustc nightly-2018-08-25 is the version supported by the rust-sgx-sdk
-curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-08-25
+# this rustc is one supported by the installed version of rust-sgx-sdk
+curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-09-25
 . $CARGO_HOME/env
 rustup toolchain add nightly
 rustup component add rust-src
diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh
index 917fd4b55954..a8201ac74a97 100644
--- a/docker/install/ubuntu_install_sgx.sh
+++ b/docker/install/ubuntu_install_sgx.sh
@@ -2,18 +2,20 @@ apt-get update && apt-get install -y --no-install-recommends --force-yes \
     build-essential git cmake \
     wget python pkg-config software-properties-common \
     autoconf automake libtool ocaml \
+    protobuf-compiler libprotobuf-dev \
     libssl-dev libcurl4-openssl-dev curl
 
 git clone https://github.com/intel/linux-sgx.git
 cd linux-sgx
 git checkout sgx_2.2
-curl 'https://gist.github.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb' | git am
+curl 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply
 ./download_prebuilt.sh
-make -j sdk && make -j sdk_install_pkg
-./linux/installer/bin/sgx_linux_x64_sdk_2.2.100.45311.bin --prefix /opt
+make -j4 sdk && make -j4 sdk_install_pkg
+./linux/installer/bin/sgx_linux_x64_sdk*.bin --prefix /opt
 cd -
 
 git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk
 cd /opt/rust-sgx-sdk
-git checkout bdd75ca05f66d1f5df637182ec335970f769b03a
+git checkout v1.0.4
+curl 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/5b7fc24d4faa0bd6efce19f8324f79d5562991e0/rust-sgx-sdk.diff' | git apply
 cd -
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 4a70e428d37a..e17c66911b18 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -13,7 +13,6 @@
   alloc,
   allocator_api,
   box_syntax,
-  extern_prelude,
   fn_traits,
   try_from,
   unboxed_closures,

From 74ebcfeb6c2f48275e1ea0f91a210b24765d1aa2 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Thu, 11 Oct 2018 10:26:02 +0530
Subject: [PATCH 208/529] [RELAY][OP] conv2d_transpose (#1862)

---
 docs/langref/relay_op.rst            |   2 +
 include/tvm/relay/attrs/nn.h         |  51 +++++++++
 python/tvm/relay/op/nn/nn.py         |  68 +++++++++++-
 src/relay/op/nn/convolution.cc       | 148 +++++++++++++++++++++++++++
 tests/python/relay/test_op_level2.py |  37 +++++++
 5 files changed, 304 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 97f2d4cb9fea..fe5356557e55 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -48,6 +48,7 @@ This level enables typical convnet models.
    :nosignatures:
 
    tvm.relay.nn.conv2d
+   tvm.relay.nn.conv2d_transpose
    tvm.relay.nn.max_pool2d
    tvm.relay.nn.avg_pool2d
    tvm.relay.nn.global_max_pool2d
@@ -129,6 +130,7 @@ Level 1 Definitions
 Level 2 Definitions
 -------------------
 .. autofunction:: tvm.relay.nn.conv2d
+.. autofunction:: tvm.relay.nn.conv2d_transpose
 .. autofunction:: tvm.relay.nn.max_pool2d
 .. autofunction:: tvm.relay.nn.avg_pool2d
 .. autofunction:: tvm.relay.nn.global_max_pool2d
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index ce80407f15c6..7eb7a83605ac 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -77,6 +77,57 @@ struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
   }
 };
 
+/*! \brief Attributes used in transposed convolution operator */
+struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> output_padding;
+  Array<IndexExpr> dilation;
+  int groups;
+  std::string data_layout;
+  std::string weight_layout;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(Conv2DTransposeAttrs, "relay.attrs.Conv2DTransposeAttrs") {
+    TVM_ATTR_FIELD(channels)
+      .set_default(NullValue<IndexExpr>())
+      .describe("The dimensionality of the output space"
+                "i.e. the number of output channels in the convolution.");
+    TVM_ATTR_FIELD(kernel_size)
+      .describe("The dimensions of the convolution window.")
+      .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("The strides of the convolution.");
+    TVM_ATTR_FIELD(output_padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("Zero-padding added to one side of the output.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "on both sides for padding number of points");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+      .describe("Controls the connections between inputs and outputs."
+                "At groups=1, all inputs are convolved to all outputs."
+                "At groups=2, the operation becomes equivalent to having two convolution"
+                "layers side by side, each seeing half the input channels, and producing"
+                "half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+      .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(weight_layout).set_default("OIHW")
+      .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                "dimensions respectively.");
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(Int(0))
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
+
 /*! \brief Attributes for max pool operator */
 struct MaxPool2DAttrs : public tvm::AttrsNode<MaxPool2DAttrs> {
   Array<IndexExpr> pool_size;
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 7985d57c9edb..52414df8e444 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -88,6 +88,62 @@ def conv2d(data,
                         weight_layout, out_layout, out_dtype)
 
 
+def conv2d_transpose(data,
+                     weight,
+                     strides=(1, 1),
+                     padding=(0, 0),
+                     dilation=(1, 1),
+                     groups=1,
+                     channels=None,
+                     kernel_size=None,
+                     data_layout="NCHW",
+                     weight_layout="OIHW",
+                     output_padding=(0, 0),
+                     out_dtype=""):
+    """Two dimensional trnasposed convolution operator.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    weight : relay.Expr
+        The weight expressions.
+
+    strides : Tuple[int], optional
+        The strides of convoltution.
+
+    padding : Tuple[int], optional
+        The padding of convolution on both sides of inputs.
+
+    dilation : Tuple[int], optional
+        Specifies the dilation rate to be used for dilated convolution.
+
+    groups : int, optional
+        Number of groups for grouped convolution.
+
+    data_layout : str, optional
+        Layout of the input.
+
+    weight_layout : str, optional
+        Layout of the weight.
+
+    output_padding : Tuple[int], optional
+        Additional zero-padding to be added to one side of the output.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.conv2d_transpose(data, weight, strides, padding, dilation,
+                                  groups, channels, kernel_size, data_layout,
+                                  weight_layout, output_padding, out_dtype)
+
+
 def softmax(data, axis):
     r"""Computes softmax.
 
@@ -103,8 +159,12 @@ def softmax(data, axis):
 
     axis: int
         The axis to sum over when computing softmax
-    """
 
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
     return _make.softmax(data, axis)
 
 
@@ -125,8 +185,12 @@ def log_softmax(data, axis):
 
     axis: int
         The axis to sum over when computing softmax
-    """
 
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
     return _make.log_softmax(data, axis)
 
 
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index ba424128640c..4717e3fe0803 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -154,5 +154,153 @@ with the layer input to produce a tensor of outputs.
 .set_support_level(2)
 .add_type_rel("Conv2D", Conv2DRel);
 
+
+// Conv2DTranspose
+TVM_REGISTER_NODE_TYPE(Conv2DTransposeAttrs);
+
+bool Conv2DTransposeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+  static const Layout kOIHW("OIHW");
+
+  const Conv2DTransposeAttrs* param = attrs.as<Conv2DTransposeAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->data_layout);
+  const Layout kernel_layout(param->weight_layout);
+  CHECK(in_layout.convertible(kNCHW))
+    << "Conv only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+  CHECK(kernel_layout.convertible(kOIHW))
+    << "Conv only support kernel layouts that are convertible from OIHW."
+    << " But got "<< kernel_layout;
+
+  IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
+  const auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW);
+  // infer weight if the kernel_size and channels are defined
+  if (param->kernel_size.defined() && param->channels.defined()) {
+    CHECK_EQ(param->kernel_size.size(), 2);
+    CHECK_EQ(param->dilation.size(), 2);
+
+    std::vector<IndexExpr> wshape({dshape_nchw[1],
+                                   param->channels / param->groups,
+                                   param->kernel_size[0],
+                                   param->kernel_size[1]});
+
+    wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
+    dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
+    channels = param->channels;
+
+    // assign result to reporter
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+  } else {
+    // use weight to infer the conv shape.
+    if (weight == nullptr) return false;
+    auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW);
+    if (param->kernel_size.defined()) {
+      CHECK_EQ(param->kernel_size.size(), 2);
+      // check the size
+      CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) &&
+            reporter->AssertEQ(param->kernel_size[1], wshape[3]))
+          << "Conv2D: shape of weight is inconsistent with kernel_size, "
+          << " kernel_size=" << param->kernel_size
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    if (param->channels.defined()) {
+      CHECK(reporter->AssertEQ(param->channels, wshape[1]))
+          << "Conv2D: shape of weight is inconsistent with channels, "
+          << " channels=" << param->channels
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[0]));
+    channels = wshape[1];
+    dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1];
+  }
+  // dilation
+  std::vector<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0});
+  oshape[2] = (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y -
+               2 * param->padding[0] + param->output_padding[0]);
+  oshape[3] = (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x -
+               2 * param->padding[1] + param->output_padding[1]);
+
+  DataType out_dtype = param->out_dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = data->dtype;
+  }
+  oshape = ConvertLayout(oshape, kNCHW, in_layout);
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+
+Expr MakeConv2DTranspose(Expr data,
+                         Expr weight,
+                         Array<IndexExpr> strides,
+                         Array<IndexExpr> padding,
+                         Array<IndexExpr> dilation,
+                         int groups,
+                         IndexExpr channels,
+                         Array<IndexExpr> kernel_size,
+                         std::string data_layout,
+                         std::string weight_layout,
+                         Array<IndexExpr> output_padding,
+                         DataType out_dtype) {
+  auto attrs = make_node<Conv2DTransposeAttrs>();
+  attrs->channels = channels;
+  attrs->kernel_size = kernel_size;
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->output_padding = std::move(output_padding);
+  attrs->dilation = std::move(dilation);
+  attrs->groups = groups;
+  attrs->data_layout = std::move(data_layout);
+  attrs->weight_layout = std::move(weight_layout);
+  attrs->out_dtype = std::move(out_dtype);
+  static const Op& op = Op::Get("nn.conv2d_transpose");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.conv2d_transpose")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 12>(MakeConv2DTranspose, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.conv2d_transpose")
+.describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution).
+
+The need for transposed convolutions generally arises
+from the desire to use a transformation going in the opposite direction
+of a normal convolution, i.e., from something that has the shape of the
+output of some convolution to something that has the shape of its input
+while maintaining a connectivity pattern that is compatible with
+said convolution.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, in_channels, height, width) if `layout` is `NCHW`.
+- **weight**: (in_channels, channels, kernel_size[0], kernel_size[1])
+- **bias**: (channels,)
+- **out**:  This depends on the `layout` parameter. Output is 4D array of shape
+v            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+
+            out_height and out_width are calculated as::
+                out_height = (height-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0]
+                out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1]
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("weight", "Tensor", "The weight tensor.")
+.set_support_level(2)
+.add_type_rel("Conv2DTranspose", Conv2DTransposeRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index b9599982aa93..1d6d00277358 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -57,6 +57,42 @@ def test_conv2d_infer_type():
     assert ftype.arg_types[1] == relay.ty.TensorType(
         (4, 8, 3, 3, 4, 4), "int8")
 
+def test_conv2d_transpose_infer_type():
+    # symbolic in batch dimension
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 10, 12
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    w = ib.param("w", relay.ty.IncompleteType())
+
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.conv2d_transpose(x.var, w.var,
+                                         kernel_size=(3, 3),
+                                         padding=(1, 1),
+                                         channels=15))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, 15, 10, 12), "float32")
+    assert ftype.arg_types[1] == relay.ty.TensorType(
+        (10, 15, 3, 3), "float32")
+
+    # infer by shape of w, mixed precision
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 10, 10, 12
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    w = ib.param("w", relay.ty.TensorType((12, 11, 5, 5), "float32"))
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.conv2d_transpose(x.var, w.var,
+                                         output_padding=(1, 1),
+                                         channels=11,
+                                         data_layout="NHWC"))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (n, 15, 15, 11), "float32")
+
 def test_upsampling_infer_type():
     ib = relay.ir_builder.IRBuilder()
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
@@ -166,3 +202,4 @@ def test_flatten_infer_type():
     test_pool2d_infer_type()
     test_upsampling_infer_type()
     test_flatten_infer_type()
+    test_conv2d_transpose_infer_type()

From b423672fd84a1a92a5271995194516d4093f5d5f Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Thu, 11 Oct 2018 10:27:03 +0530
Subject: [PATCH 209/529] Update frontend.rst (#1881)

---
 docs/api/python/nnvm/frontend.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/api/python/nnvm/frontend.rst b/docs/api/python/nnvm/frontend.rst
index f872a6b878e2..eb07a13e8340 100644
--- a/docs/api/python/nnvm/frontend.rst
+++ b/docs/api/python/nnvm/frontend.rst
@@ -10,3 +10,7 @@ nnvm.frontend
 .. autofunction:: nnvm.frontend.from_coreml
 
 .. autofunction:: nnvm.frontend.from_keras
+
+.. autofunction:: nnvm.frontend.from_tensorflow
+
+.. autofunction:: nnvm.frontend.from_darknet

From fd634f3f93433ee844b7fdb956e4655962e30d92 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Thu, 11 Oct 2018 09:24:26 -0700
Subject: [PATCH 210/529] [Relay] Alpha equality tests for Relay exprs (#1871)

---
 python/tvm/relay/expr.py                    |   2 +-
 src/relay/pass/alpha_eq.cc                  |  44 +++
 tests/python/relay/test_pass_alpha_equal.py | 281 +++++++++++++++++++-
 3 files changed, 320 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 05214ca095d1..6ed8df0d736b 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -112,7 +112,7 @@ def __init__(self, op, args, attrs, ty_args=None):
 class Let(Expr):
     """A variable bindings in Relay, see tvm/relay/expr.h for more details."""
 
-    def __init__(self, var, value, body, value_type):
+    def __init__(self, var, value, body, value_type=None):
         self.__init_handle_by_constructor__(
             _make.Let, var, value, body, value_type)
 
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 0e13a598ca3a..0ed0e3df3056 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -268,10 +268,27 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
         return;
       }
 
+      if (func1->type_params.size() != func2->type_params.size()) {
+        equal = false;
+        return;
+      }
+
       for (size_t i = 0U; i < func1->params.size(); i++) {
         this->VisitExpr(func1->params[i], func2->params[i]);
       }
 
+      for (size_t i = 0U; i < func1->type_params.size(); i++) {
+        equal = equal && AlphaEqual(func1->type_params[i], func2->type_params[i]);
+        if (!equal) {
+          return;
+        }
+      }
+
+      equal = equal && AlphaEqual(func1->ret_type, func2->ret_type);
+      if (!equal) {
+        return;
+      }
+
       this->VisitExpr(func1->body, func2->body);
     } else {
       equal = false;
@@ -287,10 +304,27 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
         return;
       }
 
+      if (op->type_args.size() != call->type_args.size()) {
+        equal = false;
+        return;
+      }
+
+      // checking attrs by pointer equality for now
+      equal = equal && (op->attrs == call->attrs);
+      if (!equal) {
+        return;
+      }
+
       for (size_t i = 0U; i < op->args.size(); i++) {
         this->VisitExpr(op->args[i], call->args[i]);
       }
 
+      for (size_t i = 0U; i < op->type_args.size(); i++) {
+        equal = equal && AlphaEqual(op->type_args[i], call->type_args[i]);
+        if (!equal) {
+          return;
+        }
+      }
     } else {
       equal = false;
     }
@@ -301,6 +335,16 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
       eq_map.Set(op->var, let->var);
       this->VisitExpr(op->value, let->value);
       this->VisitExpr(op->body, let->body);
+
+      // value_type should match as well (including nulls)
+      if (op->value_type.defined() != let->value_type.defined()) {
+        equal = false;
+        return;
+      }
+
+      if (op->value_type.defined()) {
+        equal = equal && AlphaEqual(op->value_type, let->value_type);
+      }
     } else {
       equal = false;
     }
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 9fa1a554a6e2..dd722399dac4 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -14,12 +14,6 @@ def test_tensor_type_alpha_equal():
     t2 = relay.TensorType((), "float32")
     assert t1 == t2
 
-def test_constant_alpha_equal():
-    x = convert(1)
-    y = convert(2)
-    assert alpha_equal(x, x)
-    assert not alpha_equal(x, y)
-    assert alpha_equal(x, convert(1))
 
 def test_incomplete_type_alpha_equal():
     t1 = relay.IncompleteType(relay.Kind.Shape)
@@ -167,6 +161,79 @@ def test_type_relation_alpha_equal():
 
     assert bigger != diff_num_inputs
 
+
+def test_constant_alpha_equal():
+    x = convert(1)
+    y = convert(2)
+    assert alpha_equal(x, x)
+    assert not alpha_equal(x, y)
+    assert alpha_equal(x, convert(1))
+
+
+def test_var_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # normally only pointer equality
+    assert alpha_equal(v1, v1)
+    assert not alpha_equal(v1, v2)
+
+    # let node allows for setting the eq_map
+    l1 = relay.Let(v1, convert(1), v1, None)
+    l2 = relay.Let(v2, convert(1), v2, None)
+    l3 = relay.Let(v1, convert(1), v2, None)
+
+    assert alpha_equal(l1, l2)
+    assert not alpha_equal(l1, l3)
+
+
+def test_global_var_alpha_equal():
+    v1 = relay.GlobalVar("v1")
+    v2 = relay.GlobalVar("v2")
+
+    # only pointer equality suffices (smoke test)
+    assert alpha_equal(v1, v1)
+    assert not alpha_equal(v1, v2)
+
+
+def test_tuple_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # unit value is a valid tuple
+    assert alpha_equal(relay.Tuple([]), relay.Tuple([]))
+
+    tup = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)])])
+    same = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)])])
+
+    assert alpha_equal(tup, same)
+
+    # use the eq_map
+    let_tup = relay.Let(v1, tup, v1, None)
+    let_mapped = relay.Let(v2, relay.Tuple([v2, convert(2), convert(3),
+                                            relay.Tuple([convert(4)])]),
+                           v2, None)
+    assert alpha_equal(let_tup, let_mapped)
+
+    more_fields = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)]), v2])
+    assert not alpha_equal(tup, more_fields)
+
+    fewer_fields = relay.Tuple([v1, convert(2), convert(3)])
+    assert not alpha_equal(tup, fewer_fields)
+
+    different_end = relay.Tuple([v1, convert(2), convert(3),
+                           relay.Tuple([convert(5)])])
+    assert not alpha_equal(tup, different_end)
+
+    different_start = relay.Tuple([v2, convert(2), convert(3),
+                                 relay.Tuple([convert(4)])])
+    assert not alpha_equal(tup, different_start)
+
+    longer_at_end = relay.Tuple([v1, convert(2), convert(3),
+                                 relay.Tuple([convert(4), convert(5)])])
+    assert not alpha_equal(tup, longer_at_end)
+
+
 def test_tuple_get_item_alpha_equal():
     x = relay.Var('x')
     y = relay.Var('y')
@@ -174,6 +241,198 @@ def test_tuple_get_item_alpha_equal():
     assert not alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 2))
     assert alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1))
 
+
+def test_param_alpha_equal():
+    # only checks equality of the types
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    p1 = relay.Param(v1, relay.TensorType((1, 2, 3), "float32"))
+    p2 = relay.Param(v2, relay.TensorType((1, 2, 3), "float32"))
+    assert alpha_equal(p1, p2)
+
+    p3 = relay.Param(v1, relay.TensorType((4, 5, 6), "int8"))
+    assert not alpha_equal(p1, p3)
+
+    p4 = relay.Param(v1, relay.TupleType([relay.TensorType((1, 2, 3),
+                                                           "float32")]))
+    assert not alpha_equal(p1, p4)
+
+
+def test_function_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+    v3 = relay.Var("v3")
+    v4 = relay.Var("v4")
+
+    tt1 = relay.TensorType((1, 2, 3), "float32")
+    tt2 = relay.TensorType((4, 5, 6), "int8")
+    tt3 = relay.TupleType([tt1, tt2])
+
+    tp1 = relay.TypeParam("tp1", relay.Kind.Type)
+    tp2 = relay.TypeParam("tp2", relay.Kind.Type)
+    tp3 = relay.TypeParam("tp3", relay.Kind.Shape)
+    tp4 = relay.TypeParam("tp4", relay.Kind.Shape)
+
+    basic_args = [relay.Param(v3, tt1), relay.Param(v4, tt2)]
+    basic_tps = [tp1, tp2]
+
+    func = relay.Function([relay.Param(v1, tt1), relay.Param(v2, tt2)],
+                          tt2, v2, basic_tps)
+    mapped = relay.Function(basic_args, tt2, v4, basic_tps)
+    assert alpha_equal(func, mapped)
+
+    fewer_params = relay.Function([relay.Param(v4, tt2)], tt2, v4, basic_tps)
+    assert not alpha_equal(func, fewer_params)
+
+    more_params = relay.Function([relay.Param(v3, tt1), relay.Param(v4, tt2),
+                                  relay.Param(v2, tt2)], tt2, v4, basic_tps)
+    assert not alpha_equal(func, more_params)
+
+    params_unordered = relay.Function([relay.Param(v3, tt2),
+                                       relay.Param(v4, tt1)],
+                                      tt1, v3, basic_tps)
+    assert not alpha_equal(func, params_unordered)
+
+    params_mismatch = relay.Function([relay.Param(v3, tt3),
+                                      relay.Param(v4, tt2)],
+                                     tt2, v4, basic_tps)
+    assert not alpha_equal(func, params_mismatch)
+
+    # also would not typecheck
+    ret_type_mismatch = relay.Function(basic_args, tt1, v4, basic_tps)
+    assert not alpha_equal(func, ret_type_mismatch)
+
+    # also mis-typed
+    different_body = relay.Function(basic_args, tt2, v3, basic_tps)
+    assert not alpha_equal(func, different_body)
+
+    fewer_type_params = relay.Function(basic_args, tt2, v4, [tp1])
+    assert not alpha_equal(func, fewer_type_params)
+
+    more_type_params = relay.Function(basic_args, tt2, v4, [tp1, tp2, tp3])
+    assert not alpha_equal(func, more_type_params)
+
+    type_params_unordered = relay.Function(basic_args, tt2, v4, [tp2, tp1])
+    assert not alpha_equal(func, type_params_unordered)
+
+    different_type_params = relay.Function(basic_args, tt2, v4, [tp3, tp4])
+    assert not alpha_equal(func, different_type_params)
+
+    # a well-typed example that also differs in body, ret type, and type params
+    tupled_example = relay.Function(basic_args, tt3, relay.Tuple([v3, v4]))
+    assert not alpha_equal(func, tupled_example)
+
+
+def test_call_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # attrs are compared only by pointer equality
+    attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+
+    tt1 = relay.TensorType((1, 2, 3), "float32")
+    tt2 = relay.TensorType((), "int8")
+
+    basic_args = [convert(1), convert(2), v2, relay.Tuple([])]
+
+    # manually writing out args to ensure that args does not rely on
+    # pointer equality
+    call = relay.Call(v1, [convert(1), convert(2), v2, relay.Tuple([])],
+                      attr1, [tt1])
+    same = relay.Call(v1, basic_args, attr1, [tt1])
+    assert alpha_equal(call, same)
+
+    different_fn = relay.Call(v2, basic_args, attr1, [tt1])
+    assert not alpha_equal(call, different_fn)
+
+    fewer_args = relay.Call(v1, [convert(1), convert(2), v2], attr1, [tt1])
+    assert not alpha_equal(call, fewer_args)
+
+    reordered_args = relay.Call(v1, [convert(2), convert(1),
+                                     relay.Tuple([]), v2], attr1, [tt1])
+    assert not alpha_equal(call, reordered_args)
+
+    different_args = relay.Call(v1, [convert(1), convert(2), convert(3)],
+                                attr1, [tt1])
+    assert not alpha_equal(call, different_args)
+
+    more_args = relay.Call(v1, [convert(1), convert(2), v2, relay.Tuple([]),
+                                convert(3), convert(4)], attr1, [tt1])
+    assert not alpha_equal(call, more_args)
+
+    different_attrs = relay.Call(v1, basic_args, attr2, [tt1])
+    assert not alpha_equal(call, different_attrs)
+
+    no_type_args = relay.Call(v1, basic_args, attr1)
+    assert not alpha_equal(call, no_type_args)
+
+    more_type_args = relay.Call(v1, basic_args, attr1, [tt1, tt2])
+    assert not alpha_equal(call, more_type_args)
+
+    different_type_arg = relay.Call(v1, basic_args, attr1, [tt2])
+    assert not alpha_equal(call, different_type_arg)
+
+
+def test_let_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+    v3 = relay.Var("v3")
+
+    let = relay.Let(v1, convert(2), v1)
+    mapped = relay.Let(v2, convert(2), v2)
+    assert alpha_equal(let, mapped)
+
+    mismatched_var = relay.Let(v2, convert(2), v3)
+    assert not alpha_equal(let, mismatched_var)
+
+    different_value = relay.Let(v2, convert(3), v2)
+    assert not alpha_equal(let, different_value)
+
+    different_body = relay.Let(v2, convert(3), convert(12))
+    assert not alpha_equal(let, different_body)
+
+    # specified types must match
+    tt1 = relay.TensorType((), "float32")
+    tt2 = relay.TensorType((), "int8")
+    let_with_type = relay.Let(v1, convert(2), v1, tt1)
+    same_type = relay.Let(v1, convert(2), v1, tt1)
+    assert alpha_equal(let_with_type, same_type)
+    assert not alpha_equal(let, let_with_type)
+
+    different_type = relay.Let(v1, convert(2), v1, tt2)
+    assert not alpha_equal(let_with_type, different_type)
+
+
+def test_if_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    if_sample = relay.If(v1, convert(1), relay.Tuple([convert(2), convert(3)]))
+    same = relay.If(v1, convert(1), relay.Tuple([convert(2), convert(3)]))
+    assert alpha_equal(if_sample, same)
+
+    different_cond = relay.If(v2, convert(1), relay.Tuple([convert(2), convert(3)]))
+    assert not alpha_equal(if_sample, different_cond)
+
+    different_true = relay.If(v1, convert(2), relay.Tuple([convert(2), convert(3)]))
+    assert not alpha_equal(if_sample, different_true)
+
+    different_false = relay.If(v1, convert(1), relay.Tuple([]))
+    assert not alpha_equal(if_sample, different_false)
+
+
+def test_op_alpha_equal():
+    # only checks names
+    op1 = relay.op.get("add")
+    op2 = relay.op.get("add")
+    assert alpha_equal(op1, op2)
+
+    op3 = relay.op.get("take")
+    assert not alpha_equal(op1, op3)
+
+
 if __name__ == "__main__":
     test_tensor_type_alpha_equal()
     test_incomplete_type_alpha_equal()
@@ -182,4 +441,14 @@ def test_tuple_get_item_alpha_equal():
     test_func_type_alpha_equal()
     test_tuple_type_alpha_equal()
     test_type_relation_alpha_equal()
+    test_constant_alpha_equal()
+    test_var_alpha_equal()
+    test_global_var_alpha_equal()
+    test_tuple_alpha_equal()
     test_tuple_get_item_alpha_equal()
+    test_param_alpha_equal()
+    test_function_alpha_equal()
+    test_call_alpha_equal()
+    test_let_alpha_equal()
+    test_if_alpha_equal()
+    test_op_alpha_equal()

From 25396ada2909e3f703b8bc84639b420f340531ba Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 11 Oct 2018 11:12:13 -0700
Subject: [PATCH 211/529] Add relay.where (#1869)

---
 docs/langref/relay_op.rst            |  2 +
 python/tvm/relay/op/transform.py     | 39 ++++++++++++++
 src/relay/op/tensor/transform.cc     | 80 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level4.py | 14 +++++
 4 files changed, 135 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index fe5356557e55..3add2dce76a3 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -98,6 +98,7 @@ This level enables additional math and transform operators.
    tvm.relay.maximum
    tvm.relay.minimum
    tvm.relay.pow
+   tvm.relay.where
 
 
 **Level 5: Vision/Image Operators**
@@ -173,6 +174,7 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.maximum
 .. autofunction:: tvm.relay.minimum
 .. autofunction:: tvm.relay.pow
+.. autofunction:: tvm.relay.where
 
 Level 5 Definitions
 -------------------
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 757297db9109..75fbba8461e3 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -180,3 +180,42 @@ def full_like(data, fill_value):
         The resulting tensor.
     """
     return _make.full_like(data, fill_value)
+
+
+def where(condition, x, y):
+    """Selecting elements from either x or y depending on the value of the
+    condition.
+
+    Parameters
+    ----------
+    condition : relay.Expr
+        The condition array. The n-th element in `y` is selected when the n-th
+        value in the `condition` array is zero. Otherwise, the corresponding
+        element from `x` will be picked.
+
+    x : relay.Expr
+        The first array to be selected.
+
+    y : relay.Expr
+        The second array to be selected.
+
+    Returns
+    -------
+    result : relay.Expr
+		The selected array.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        x = [[1, 2], [3, 4]]
+        y = [[5, 6], [7, 8]]
+        condition = [[0, 1], [-1, 0]]
+        relay.where(conditon, x, y) = [[5, 2], [3, 8]]
+
+        condition = [1, 0]
+        relay.where(conditon, x, y) = [[1, 2], [7, 8]]
+
+    Note that the shape of condition, x, and y needs to be the same.
+    """
+    return _make.where(condition, x, y)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 663dd5c38ec5..23f804013809 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -498,5 +498,85 @@ and type as the input array.
 .set_support_level(3)
 .add_type_rel("FullLike", FullLikeRel);
 
+// where operator
+bool WhereRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 4U);
+  const auto* condition = types[0].as<TensorTypeNode>();
+  const auto* x = types[1].as<TensorTypeNode>();
+  const auto* y = types[2].as<TensorTypeNode>();
+  CHECK(condition != nullptr && x != nullptr && y != nullptr);
+
+  const auto& cond_shape = condition->shape;
+  const auto& x_shape = x->shape;
+  const auto& y_shape = y->shape;
+  CHECK(x_shape.size() == y_shape.size()) << "x and y must have the same size";
+
+  if (cond_shape.size() != x_shape.size()) {
+    CHECK_EQ(cond_shape.size(), 1)
+        << "Shape of condition " << condition->shape
+        << " must be either equal to x or has dimension of 1.";
+  }
+  for (size_t i = 0; i < x_shape.size(); i++) {
+    CHECK(reporter->AssertEQ(x_shape[i], y_shape[i]))
+        << "x and y must have the same shape: " << x_shape << " vs " << y_shape;
+
+    CHECK(reporter->AssertEQ(cond_shape[i], x_shape[i]))
+        << "Shape of condition " << condition->shape
+        << " must be either equal to x or has dimension of 1.";
+  }
+  reporter->Assign(types[3], TensorTypeNode::make(x_shape, x->dtype));
+  return true;
+}
+
+// Positional relay function to create where operator.
+Expr MakeWhere(const Expr& condition, const Expr& x, const Expr& y) {
+  static const Op& op = Op::Get("where");
+  return CallNode::make(op, {condition, x, y});
+}
+
+TVM_REGISTER_API("relay.op._make.where")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 3>(MakeWhere, args, rv);
+});
+
+RELAY_REGISTER_OP("where")
+.describe(R"code(
+Return the elements, either from x or y, depending on the condition.
+
+Given three ndarrays, condition, x, and y, return an ndarray with the elements
+from x or y, depending on the elements from condition are true or false.
+x and y must have the same shape. If condition has the same shape as x,
+each element in the output array is from x if the corresponding element
+in the condition is true, and from y if false.
+
+If condition does not have the same shape as x, it must be a 1D array whose
+size is the same as x’s first dimension size. Each row of the output array
+is from x’s row if the corresponding element from condition is true, and
+from y’s row if false.
+
+Note that all non-zero values are interpreted as True in condition.
+
+Examples::
+
+  x = [[1, 2], [3, 4]]
+  y = [[5, 6], [7, 8]]
+  cond = [[0, 1], [-1, 0]]
+  where(cond, x, y) = [[5, 2], [3, 8]]
+
+
+  cond = [1, 0]
+  where(cond, x, y) = [[1, 2], [7, 8]]
+
+)code" TVM_ADD_FILELINE)
+.add_argument("condition", "Tensor", "Condition array")
+.add_argument("x", "Tensor", "First array to be selected")
+.add_argument("y", "Tensor", "Second array to be selected")
+.set_num_inputs(3)
+.set_support_level(4)
+.add_type_rel("Where", WhereRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index a855b0f2caaa..807d3a3a964e 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -125,8 +125,22 @@ def test_binary_broadcast():
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
 
+def test_where():
+    ib = relay.ir_builder.IRBuilder()
+    cond = ib.param("cond", relay.TensorType((3, 4), "float32"))
+    x = ib.param("x", relay.TensorType((3, 4), "float32"))
+    y = ib.param("y", relay.TensorType((3, 4), "float32"))
+    with ib.function(cond, x, y) as func:
+        ib.ret(relay.where(cond.var, x.var, y.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((3, 4), "float32")
+
+
 if __name__ == "__main__":
     test_cmp_type()
     test_binary_broadcast()
     test_binary_op()
     test_binary_broadcast_op()
+    test_where()

From da1cef8d4eade2017780caf7a5f7d174a7284bc9 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Thu, 11 Oct 2018 21:57:51 -0700
Subject: [PATCH 212/529] [Relay][Op] Pad operator (#1843)

---
 docs/langref/relay_op.rst            |  1 +
 include/tvm/relay/attrs/nn.h         | 13 ++++-
 python/tvm/relay/op/nn/nn.py         | 29 +++++++++-
 src/relay/op/nn/pad.cc               | 86 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level2.py | 25 ++++++++
 5 files changed, 151 insertions(+), 3 deletions(-)
 create mode 100644 src/relay/op/nn/pad.cc

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 3add2dce76a3..b8cd20709a17 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -55,6 +55,7 @@ This level enables typical convnet models.
    tvm.relay.nn.global_avg_pool2d
    tvm.relay.nn.upsampling
    tvm.relay.nn.batch_flatten
+   tvm.relay.nn.pad
    tvm.relay.nn.lrn
    tvm.relay.nn.l2_normalize
 
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 7eb7a83605ac..de0da7477a35 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -223,8 +223,19 @@ struct UpSamplingAttrs : public tvm::AttrsNode<UpSamplingAttrs> {
   }
 };
 
+/*! \brief Attributes used for the padding operator */
+struct PadAttrs : public tvm::AttrsNode<PadAttrs> {
+  double pad_value;
+  Array<Array<IndexExpr> > pad_width;
 
-
+  TVM_DECLARE_ATTRS(PadAttrs, "relay.attrs.PadAttrs") {
+    TVM_ATTR_FIELD(pad_value).set_default(0.0)
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(pad_width)
+      .describe("Number of values padded to the edges of each axis, "
+                "in the format of ((before_1, after_1), ..., (before_N, after_N))");
+  }
+};
 
 /*! \brief Attributes for LRN operator */
 struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 52414df8e444..e95e3e9b715d 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -429,7 +429,6 @@ def batch_flatten(data):
     """
     return _make.batch_flatten(data)
 
-
 def relu(data):
     """Rectified linear unit.
 
@@ -449,6 +448,32 @@ def relu(data):
     return _make.relu(data)
 
 
+def pad(data,
+        pad_width,
+        pad_value=0.0):
+    r"""Padding
+
+    This operator takes in a tensor and pads each axis by the specified
+    widths using the specified value.
+
+    Parameters
+    ----------
+    data: relay.Expr
+        The input data to the operator
+    pad_width: tuple of <tuple of <int>>, required
+        Number of values padded to the edges of each axis, in the format
+        of ((before_1, after_1), ..., (before_N, after_N))
+    pad_value: float, optional, default=0.0
+        The value used for padding
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.pad(data, pad_width, pad_value)
+
+
 def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
     """This operator takes data as input and does local response normalization.
 
@@ -484,9 +509,9 @@ def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
     result : relay.Expr
         The computed result.
     """
-
     return _make.lrn(data, size, axis, alpha, beta, bias)
 
+
 def l2_normalize(data, eps, axis=None):
     """Perform L2 normalization on the input data
 
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
new file mode 100644
index 000000000000..667a5be90fc8
--- /dev/null
+++ b/src/relay/op/nn/pad.cc
@@ -0,0 +1,86 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pad.cc
+ * \brief Implementation of operator pad
+ */
+#include <tvm/ir_operator.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <vector>
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(PadAttrs);
+
+bool PadRel(const Array<Type>& types,
+            int num_inputs,
+            const Attrs& attrs,
+            const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const PadAttrs* param = attrs.as<PadAttrs>();
+  CHECK(param != nullptr);
+
+  // check that pad widths match lengths
+  CHECK(data->shape.size() == param->pad_width.size())
+    << "There should be as many pad width pairs as shape dimensions "
+    << "but the shape has " << data->shape.size() << " dimensions "
+    << "and there are " << param->pad_width.size() << " pad width pairs.";
+
+  // each pad width element should be a pair of positive integers
+  std::vector<IndexExpr> oshape;
+  for (size_t i = 0; i < param->pad_width.size(); i++) {
+    CHECK(param->pad_width[i].size() == 2)
+      << "Each pad width element should be a pair but at index " << i
+      << " there are " << param->pad_width[i].size() << " elements.";
+
+    auto width1 = as_const_int(param->pad_width[i][0]);
+    auto width2 = as_const_int(param->pad_width[i][1]);
+    CHECK(width1 != nullptr);
+    CHECK(width2 != nullptr);
+
+    CHECK(*width1 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width1 << ".";
+    CHECK(*width2 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width2 << ".";
+
+    auto padding = make_const(data->shape[i].type(), *width1 + *width2);
+    oshape.push_back(data->shape[i] + padding);
+  }
+
+  reporter->Assign(types[1], TensorTypeNode::make(Array<IndexExpr>(oshape),
+                                                  data->dtype));
+  return true;
+}
+
+// Handler to create a call to the padding op used by front-end FFI
+  Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
+  auto attrs = make_node<PadAttrs>();
+  attrs->pad_value = pad_value;
+  attrs->pad_width = std::move(pad_width);
+  static const Op& op = Op::Get("nn.pad");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.pad")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakePad, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.pad")
+.describe(R"code(Pad for n-D tensor.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("Pad", PadRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 1d6d00277358..f67faea19be1 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -196,10 +196,35 @@ def test_flatten_infer_type():
     ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType((d1, ((2*d3)*3)), "float32")
 
+def test_pad_infer_type():
+    # entirely concrete case
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = 1, 2, 3, 4
+    t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
+    with ib.function(t) as func:
+        ib.ret(relay.nn.pad(t.var, ((1, 1), (2, 2), (3, 3), (4, 4))))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((3, 6, 9, 12), "float32")
+
+    # some symbolic values
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
+    with ib.function(t) as func:
+        ib.ret(relay.nn.pad(t.var, ((1, 1), (2, 2), (3, 3), (4, 4))))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
+
 
 if __name__ == "__main__":
     test_conv2d_infer_type()
     test_pool2d_infer_type()
     test_upsampling_infer_type()
     test_flatten_infer_type()
+    test_pad_infer_type()
     test_conv2d_transpose_infer_type()
+

From 36123fb9e04e3e8da6ba201e10c980648670f805 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Thu, 11 Oct 2018 22:01:38 -0700
Subject: [PATCH 213/529] [Relay] add python doc for function in ir_pass
 (#1877)

---
 include/tvm/relay/pass.h     |  7 ++--
 python/tvm/relay/ir_pass.py  | 81 ++++++++++++++++++++++++++++++++----
 src/relay/pass/kind_check.cc |  4 +-
 3 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 3678aee32850..1043e4aaaa4c 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -29,7 +29,7 @@ Expr InferType(const Environment& env, const Expr& e);
 Expr InferType(const Environment& env, const GlobalVar& var, const Function& f);
 
 /*!
- * \brief Check that types are well formed by applying "kinding rules".
+ * \brief Check that types are well kinded by applying "kinding rules".
  *
  * This pass ensures we do not do things that violate the design of the
  * type system when writing down types.
@@ -39,11 +39,12 @@ Expr InferType(const Environment& env, const GlobalVar& var, const Function& f);
  * We check this by ensuring the `dtype` field of a Tensor always contains
  * a data type such as `int`, `float`, `uint`.
  *
- * \param env The global environment.
  * \param t The type to check.
+ * \param env The global environment.
+ *
  * \return true if the rules are satisified otherwise false
  */
-bool KindCheck(const Environment& env, const Type& t);
+bool KindCheck(const Type& t, const Environment& env);
 
 /*! \brief Compare two expressions for structural equivalence.
  *
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 6de6437b9eb9..cbb7095e2f17 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -15,26 +15,91 @@ def infer_type(env, expr):
     Parameters
     ----------
     env : relay.Environment
-        The global environment.
+      The global environment.
 
     expr : relay.Expr
-        The input expression.
+      The input expression.
 
     Returns
     -------
     checked_expr : relay.Expr
-         The checked expression.
+      The checked expression.
     """
     return _ir_pass.infer_type(env, expr)
 
+def well_formed(e):
+    """Check that each Var is only bound once (well formed).
 
-well_formed = _ir_pass.well_formed
+    Parameters
+    ----------
+    e: relay.Expr
+      The input expression
+
+    Returns
+    -------
+    well_form : bool
+      whether the input expression is well formed
+    """
+    return _ir_pass.well_formed(e)
+
+def check_kind(t, env=None):
+    """Check that the type is well kinded.
+    For example, this mean type cannot has tensor of tensor, or is a tuple type of 2 shapes.
+
+    Parameters
+    ----------
+    t: relay.Type
+      The type to check
+
+    env: relay.Environment, optional
+      The global environment
+
+    Returns
+    -------
+    well_kinded : bool
+      whether the input type is well kinded.
+
+    Examples
+    --------
+    .. code:: python
+
+        assert not check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Shape)]))
+        assert check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Type)]))
+    """
+    if env is not None:
+        return _ir_pass.check_kind(t, env)
+    else:
+        return _ir_pass.check_kind(t)
 
-check_kind = _ir_pass.check_kind
+def free_vars(e):
+    """Get free variables from expression e.
 
-free_vars = _ir_pass.free_vars
+    Parameters
+    ----------
+    e: relay.Expr
+      The input expression
 
-free_type_vars = _ir_pass.free_type_vars
+    Returns
+    -------
+    free : List[relay.Var]
+      the list of free variables
+    """
+    return _ir_pass.free_vars(e)
+
+def free_type_vars(e):
+    """Get free type variables from expression/type e
+
+    Parameters
+    ----------
+    e: relay.Expr/relay.Type
+      The input expression/type
+
+    Returns
+    -------
+    free : List[relay.TypeParam]
+      the list of free type variables
+    """
+    return _ir_pass.free_type_vars(e)
 
 def dead_code_elimination(e):
     """ Remove expressions which does not effect the program result (dead code).
@@ -59,10 +124,10 @@ def alpha_equal(lhs, rhs):
     ----------
     lhs: relay.Expr
       One of the input Expression.
+
     rhs: relay.Expr
       One of the input Expression.
 
-
     Returns
     -------
     result: bool
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index f649243dbfec..72807985ced4 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -99,7 +99,7 @@ struct KindChecker : TypeVisitor<> {
   }
 };
 
-bool KindCheck(const Environment& env, const Type &t) {
+bool KindCheck(const Type& t, const Environment& env) {
   KindChecker kc;
   return kc.Check(t);
 }
@@ -107,7 +107,7 @@ bool KindCheck(const Environment& env, const Type &t) {
 TVM_REGISTER_API("relay._ir_pass.check_kind")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       if (args.size() == 1) {
-        *ret = KindCheck(EnvironmentNode::make({}), args[0]);
+        *ret = KindCheck(args[0], EnvironmentNode::make({}));
       } else {
         *ret = KindCheck(args[0], args[1]);
       }

From f960d8950094a4c821841ddf9b2b7acaac032108 Mon Sep 17 00:00:00 2001
From: Nick Hynes <nhynes@berkeley.edu>
Date: Thu, 11 Oct 2018 22:19:17 -0700
Subject: [PATCH 214/529] [Rust] Add rust runtime to CI (#1851)

---
 apps/sgx/enclave/.rustfmt.toml        | 60 +--------------------------
 docker/Dockerfile.ci_cpu              | 24 +++++------
 docker/install/ubuntu_install_rust.sh |  4 +-
 rust/.rustfmt.toml                    |  4 +-
 rust/src/runtime/array.rs             |  3 +-
 rust/src/runtime/graph.rs             | 18 +++++---
 rust/src/runtime/threading.rs         |  9 ++--
 rust/tests/test_graph_serde.rs        |  3 +-
 tests/scripts/task_rust.sh            | 25 +++++++++++
 9 files changed, 64 insertions(+), 86 deletions(-)
 mode change 100644 => 120000 apps/sgx/enclave/.rustfmt.toml
 create mode 100755 tests/scripts/task_rust.sh

diff --git a/apps/sgx/enclave/.rustfmt.toml b/apps/sgx/enclave/.rustfmt.toml
deleted file mode 100644
index 9ae87cc6bfcf..000000000000
--- a/apps/sgx/enclave/.rustfmt.toml
+++ /dev/null
@@ -1,59 +0,0 @@
-max_width = 100
-hard_tabs = false
-tab_spaces = 2
-newline_style = "Auto"
-use_small_heuristics = "Default"
-indent_style = "Block"
-wrap_comments = false
-comment_width = 80
-normalize_comments = false
-format_strings = false
-format_macro_matchers = false
-format_macro_bodies = true
-empty_item_single_line = true
-struct_lit_single_line = true
-fn_single_line = false
-where_single_line = false
-imports_indent = "Block"
-imports_layout = "Mixed"
-merge_imports = true
-reorder_imports = true
-reorder_modules = true
-reorder_impl_items = false
-type_punctuation_density = "Wide"
-space_before_colon = false
-space_after_colon = true
-spaces_around_ranges = false
-binop_separator = "Front"
-remove_nested_parens = true
-combine_control_expr = true
-struct_field_align_threshold = 0
-match_arm_blocks = true
-force_multiline_blocks = false
-fn_args_density = "Tall"
-brace_style = "SameLineWhere"
-control_brace_style = "AlwaysSameLine"
-trailing_semicolon = true
-trailing_comma = "Vertical"
-match_block_trailing_comma = true
-blank_lines_upper_bound = 1
-blank_lines_lower_bound = 0
-edition = "2015"
-merge_derives = true
-use_try_shorthand = true
-use_field_init_shorthand = false
-force_explicit_abi = true
-condense_wildcard_suffixes = true
-color = "Auto"
-required_version = "0.99.5"
-unstable_features = false
-disable_all_formatting = false
-skip_children = false
-hide_parse_errors = false
-error_on_line_overflow = false
-error_on_unformatted = false
-report_todo = "Never"
-report_fixme = "Never"
-ignore = []
-emit_mode = "Files"
-make_backup = false
diff --git a/apps/sgx/enclave/.rustfmt.toml b/apps/sgx/enclave/.rustfmt.toml
new file mode 120000
index 000000000000..ec1baa2f89be
--- /dev/null
+++ b/apps/sgx/enclave/.rustfmt.toml
@@ -0,0 +1 @@
+../../../rust/.rustfmt.toml
\ No newline at end of file
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 2b72b6eea6e5..b2bebea0b892 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -15,24 +15,24 @@ RUN bash /install/ubuntu_install_python_package.sh
 COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
 RUN bash /install/ubuntu_install_llvm.sh
 
-# AutoTVM deps
-COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
-RUN bash /install/ubuntu_install_redis.sh
-
-# Golang environment
-COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
-RUN bash /install/ubuntu_install_golang.sh
+# SGX deps (build early; changes infrequently)
+COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
+RUN bash /install/ubuntu_install_sgx.sh
+ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH}
 
-# Rust env
+# Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
 ENV RUSTUP_HOME /opt/rust
 ENV CARGO_HOME /opt/rust
+ENV RUSTC_WRAPPER sccache
 
-# SGX deps
-COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
-RUN bash /install/ubuntu_install_sgx.sh
-ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH}
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
 
+# Golang environment
+COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
+RUN bash /install/ubuntu_install_golang.sh
 
 ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index bd7fcb9a4ded..9a51afeea79b 100644
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -7,9 +7,9 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchai
 . $CARGO_HOME/env
 rustup toolchain add nightly
 rustup component add rust-src
+cargo +nightly install sccache
 cargo +nightly install rustfmt-nightly --version 0.99.5 --force
 cargo +nightly install xargo
 
 # make rust usable by all users
-chmod a+w /opt/rust
-sudo find /opt/rust -type d -exec chmod a+w {} \;
+chmod -R a+w /opt/rust
diff --git a/rust/.rustfmt.toml b/rust/.rustfmt.toml
index df9a65dacfaa..dbf3347a32bd 100644
--- a/rust/.rustfmt.toml
+++ b/rust/.rustfmt.toml
@@ -38,14 +38,14 @@ trailing_comma = "Vertical"
 match_block_trailing_comma = false
 blank_lines_upper_bound = 1
 blank_lines_lower_bound = 0
-edition = "Edition2015"
+edition = "2015"
 merge_derives = true
 use_try_shorthand = true
 use_field_init_shorthand = false
 force_explicit_abi = true
 condense_wildcard_suffixes = false
 color = "Auto"
-required_version = "0.99.4"
+required_version = "0.99.5"
 unstable_features = false
 disable_all_formatting = false
 skip_children = false
diff --git a/rust/src/runtime/array.rs b/rust/src/runtime/array.rs
index 79d22e400cff..9d0941811758 100644
--- a/rust/src/runtime/array.rs
+++ b/rust/src/runtime/array.rs
@@ -173,7 +173,8 @@ impl<'a> Tensor<'a> {
                 expected_stride * (*shape as usize),
               )
             },
-          ).0
+          )
+          .0
       }
     }
   }
diff --git a/rust/src/runtime/graph.rs b/rust/src/runtime/graph.rs
index 6c53aeb9f6e9..08fbd5938380 100644
--- a/rust/src/runtime/graph.rs
+++ b/rust/src/runtime/graph.rs
@@ -56,11 +56,13 @@ impl Graph {
         .as_ref()
         .ok_or(ErrorKind::GraphFormatError(
           "Missing graph attrs".to_string(),
-        ))?.get(attr)
+        ))?
+        .get(attr)
         .ok_or(ErrorKind::GraphFormatError(format!(
           "Missing {} attr",
           attr
-        )))?.to_owned(),
+        )))?
+        .to_owned(),
     )?)
   }
 }
@@ -99,7 +101,8 @@ impl Node {
       .ok_or(format!(
         "Node `{}` is missing attrs.flatten_data",
         self.name
-      ))?.parse::<u8>()?
+      ))?
+      .parse::<u8>()?
       == 1;
     Ok(NodeAttrs {
       func_name,
@@ -189,7 +192,8 @@ impl<'m, 't> GraphExecutor<'m, 't> {
         } else {
           Err(ErrorKind::GraphFormatError(format!("Invalid dltype: {}", dltype).to_string()).into())
         }
-      }).collect::<Result<Vec<DataType>>>()?;
+      })
+      .collect::<Result<Vec<DataType>>>()?;
 
     let align = dtypes.iter().map(|dtype| dtype.bits as usize).max();
     let mut storage_num_bytes = vec![0usize; *storage_ids.iter().max().unwrap_or(&1) + 1];
@@ -216,7 +220,8 @@ impl<'m, 't> GraphExecutor<'m, 't> {
           strides: None,
           byte_offset: 0,
         }
-      }).collect();
+      })
+      .collect();
 
     Ok(tensors)
   }
@@ -261,7 +266,8 @@ impl<'m, 't> GraphExecutor<'m, 't> {
           } else {
             DLTensor::from(tensor)
           })
-        }).collect::<Result<Vec<DLTensor>>>()
+        })
+        .collect::<Result<Vec<DLTensor>>>()
         .unwrap();
       let op: Box<Fn()> = box move || {
         let args = dl_tensors
diff --git a/rust/src/runtime/threading.rs b/rust/src/runtime/threading.rs
index c0d6221c91b7..693ebf7c4a33 100644
--- a/rust/src/runtime/threading.rs
+++ b/rust/src/runtime/threading.rs
@@ -58,7 +58,8 @@ impl Job {
         },
         cdata: self.cdata,
         pending: Arc::clone(&self.pending),
-      }).collect()
+      })
+      .collect()
   }
 
   /// Waits for all tasks in this `Job` to be completed.
@@ -110,7 +111,8 @@ impl<'a> Threads {
         let (p, c) = bounded_spsc_queue::make(2);
         let handle = thread::spawn(move || cb(c.into()));
         (handle, p)
-      }).unzip();
+      })
+      .unzip();
     Threads {
       handles: handles,
       queues: queues,
@@ -128,7 +130,8 @@ impl<'a> Threads {
         let (p, c) = bounded_spsc_queue::make(2);
         consumer_queues.push_back(c.into());
         p
-      }).collect();
+      })
+      .collect();
     ocall_packed!("__sgx_thread_group_launch__", num_threads as u64);
     Threads { queues: queues }
   }
diff --git a/rust/tests/test_graph_serde.rs b/rust/tests/test_graph_serde.rs
index a596544212ca..b02c12889794 100644
--- a/rust/tests/test_graph_serde.rs
+++ b/rust/tests/test_graph_serde.rs
@@ -20,7 +20,8 @@ fn test_load_graph() {
 
   let graph = Graph::try_from(
     &fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.json")).unwrap(),
-  ).unwrap();
+  )
+  .unwrap();
 
   assert_eq!(graph.nodes[3].op, "tvm_op");
   assert_eq!(
diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh
new file mode 100755
index 000000000000..8ef9a1a1556f
--- /dev/null
+++ b/tests/scripts/task_rust.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -e
+
+export LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH
+
+tvm_root="$(git rev-parse --show-toplevel)"
+export PYTHONPATH="$tvm_root/python":"$tvm_root/nnvm/python":"$tvm_root/topi/python"
+
+cd rust
+cargo fmt -- --check
+
+# run basic tests
+python3 tests/build_model.py
+cargo test --tests
+
+# run TVM module test
+cd tests/test_tvm_basic
+cargo run
+cd -
+
+# run NNVM graph test
+cd tests/test_nnvm
+cargo run
+cd -

From 6012ea989e73d6adef220ebcd19c4e382bde4d1d Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 12 Oct 2018 09:18:47 -0700
Subject: [PATCH 215/529] Update HalideIR (#1890)

---
 3rdparty/HalideIR | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index 2f3ecdfdedf3..e4a4c02764d3 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit 2f3ecdfdedf3efa7e45a3945dca63a25856c4674
+Subproject commit e4a4c02764d37c9c3db0d64c4996651a3ef9513c

From 5e3bdc07adfdf6c5cac2e8093778f09f1147f764 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 12 Oct 2018 09:19:41 -0700
Subject: [PATCH 216/529] [TOPI] Update pre-tuned parameters for TX2 and fp16
 on Mali (#1892)

---
 apps/benchmark/README.md                    |  7 +++++++
 apps/benchmark/arm_cpu_imagenet_bench.py    |  8 ++++----
 apps/benchmark/gpu_imagenet_bench.py        |  6 +++---
 apps/benchmark/mobile_gpu_imagenet_bench.py | 14 ++++++-------
 apps/benchmark/util.py                      | 22 +++++++++++----------
 python/tvm/autotvm/measure/executor.py      |  2 +-
 python/tvm/autotvm/tophub.py                |  4 ++--
 python/tvm/exec/autotvm_log_editor.py       |  4 ++--
 topi/python/topi/mali/conv2d.py             |  4 ++--
 tutorials/autotvm/tune_nnvm_arm.py          |  2 +-
 tutorials/autotvm/tune_nnvm_cuda.py         |  2 +-
 tutorials/autotvm/tune_nnvm_mobile_gpu.py   |  2 +-
 12 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index f713684524ef..9806ddc05bae 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -28,6 +28,10 @@ Build TVM with LLVM and CUDA enabled. [Help](https://docs.tvm.ai/install/from_so
 ```bash
 python3 gpu_imagenet_bench.py --model 1080ti
 python3 gpu_imagenet_bench.py --model titanx
+
+# For NVIDIA Jetson TX2, you can run the following command directly on the board,
+# or use cross compilation and RPC like what we do for ARM CPU.
+python3 gpu_imagenet_bench.py --model tx2
 ```
 
 ### ARM CPU & Mali GPU
@@ -87,13 +91,16 @@ python3 -m tvm.exec.rpc_tracker
   python3 arm_cpu_imagenet_bench.py --model pixel2 --rpc-key pixel2
   python3 arm_cpu_imagenet_bench.py --model p20pro --rpc-key p20pro
   python3 arm_cpu_imagenet_bench.py --model mate10pro --rpc-key mate10pro  
+  ```
 
+  ```bash
   # Mali GPU
   # NOTE: To make the test environment more stable, we close GUI and lock the frequency
   sudo /etc/init.d/lightdm stop
   sudo -i
   echo performance > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor
   python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
+  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399 --dtype float16
   ```
 
 ### AMD GPU
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 5b666bc9d2e0..f84d42bcab82 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -14,7 +14,7 @@
 from util import get_network, print_progress
 
 
-def evaluate_network(network, target, target_host, number):
+def evaluate_network(network, target, target_host, repeat):
     # connect to remote device
     tracker = tvm.rpc.connect_tracker(args.host, args.port)
     remote = tracker.request(args.rpc_key)
@@ -50,7 +50,7 @@ def evaluate_network(network, target, target_host, number):
 
     # evaluate
     print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
     prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
     print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
 
@@ -70,7 +70,7 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=3)
+    parser.add_argument("--repeat", type=int, default=10)
     args = parser.parse_args()
 
     dtype = 'float32'
@@ -87,5 +87,5 @@ def evaluate_network(network, target, target_host, number):
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
-        evaluate_network(network, target, target_host, args.number)
+        evaluate_network(network, target, target_host, args.repeat)
 
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index a0eb4a055103..80df08128995 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -22,10 +22,10 @@
                          'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
                         help='The name of neural network')
     parser.add_argument("--model", type=str,
-                        choices=['1080ti', 'titanx', 'gfx900'], default='1080ti',
+                        choices=['1080ti', 'titanx', 'tx2', 'gfx900'], default='1080ti',
                         help="The model of the test device. If your device is not listed in "
                              "the choices list, pick the most similar one as argument.")
-    parser.add_argument("--number", type=int, default=500)
+    parser.add_argument("--repeat", type=int, default=600)
     parser.add_argument("--target", type=str,
                         choices=['cuda', 'opencl', 'rocm', 'nvptx', 'metal'], default='cuda',
                         help="The tvm compilation target")
@@ -58,6 +58,6 @@
         module.set_input(**params)
 
         # evaluate
-        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
         prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
         print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
index a75620b3fe08..cd3d7eca9f3c 100644
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -13,13 +13,13 @@
 
 from util import get_network, print_progress
 
-def evaluate_network(network, target, target_host, number):
+def evaluate_network(network, target, target_host, dtype, repeat):
     # connect to remote device
     tracker = tvm.rpc.connect_tracker(args.host, args.port)
     remote = tracker.request(args.rpc_key)
 
     print_progress(network)
-    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+    net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype)
 
     print_progress("%-20s building..." % network)
     with nnvm.compiler.build_config(opt_level=3):
@@ -40,7 +40,6 @@ def evaluate_network(network, target, target_host, number):
     print_progress("%-20s uploading..." % network)
     ctx = remote.context(str(target), 0)
     remote.upload(tmp.relpath(filename))
-    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
 
     rlib = remote.load_module(filename)
     module = runtime.create(graph, rlib, ctx)
@@ -50,7 +49,7 @@ def evaluate_network(network, target, target_host, number):
 
     # evaluate
     print_progress("%-20s evaluating..." % network)
-    ftimer = module.module.time_evaluator("run", ctx, number=number, repeat=3)
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
     prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
     print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
 
@@ -69,11 +68,10 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=30)
+    parser.add_argument("--repeat", type=int, default=30)
+    parser.add_argument("--dtype", type=str, default='float32')
     args = parser.parse_args()
 
-    dtype = 'float32'
-
     if args.network is None:
         networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
     else:
@@ -87,4 +85,4 @@ def evaluate_network(network, target, target_host, number):
     print("--------------------------------------------------")
 
     for network in networks:
-        evaluate_network(network, target, target_host, args.number)
+        evaluate_network(network, target, target_host, args.dtype, args.repeat)
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
index 4825ac96571d..bdf47dd660f7 100644
--- a/apps/benchmark/util.py
+++ b/apps/benchmark/util.py
@@ -3,15 +3,17 @@
 import sys
 import nnvm
 
-def get_network(name, batch_size):
+def get_network(name, batch_size, dtype='float32'):
     """Get the symbol definition and random weight of a network
     
     Parameters
     ----------
     name: str
         The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
-    batch_size:
+    batch_size: int
         batch size
+    dtype: str
+        Data type
 
     Returns
     -------
@@ -28,24 +30,24 @@ def get_network(name, batch_size):
     output_shape = (batch_size, 1000)
 
     if name == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
     elif name == 'mobilenet_v2':
-        net, params = nnvm.testing.mobilenet_v2.get_workload(batch_size=batch_size)
+        net, params = nnvm.testing.mobilenet_v2.get_workload(batch_size=batch_size, dtype=dtype)
     elif name == 'inception_v3':
         input_shape = (1, 3, 299, 299)
-        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
     elif "resnet" in name:
         n_layer = int(name.split('-')[1])
-        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
     elif "vgg" in name:
         n_layer = int(name.split('-')[1])
-        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
     elif "densenet" in name:
         n_layer = int(name.split('-')[1])
-        net, params = nnvm.testing.densenet.get_workload(num_layers=n_layer, batch_size=batch_size)
+        net, params = nnvm.testing.densenet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
     elif "squeezenet" in name:
         version = name.split("_v")[1]
-        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version)
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version, dtype=dtype)
     elif name == 'custom':
         # an example for custom network
         from nnvm.testing import utils
@@ -53,7 +55,7 @@ def get_network(name, batch_size):
         net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
         net = nnvm.sym.flatten(net)
         net = nnvm.sym.dense(net, units=1000)
-        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224), dtype=dtype)
     elif name == 'mxnet':
         # an example for mxnet model
         from mxnet.gluon.model_zoo.vision import get_model
diff --git a/python/tvm/autotvm/measure/executor.py b/python/tvm/autotvm/measure/executor.py
index 17ea1d7fda9e..f3ba4236ce63 100644
--- a/python/tvm/autotvm/measure/executor.py
+++ b/python/tvm/autotvm/measure/executor.py
@@ -6,7 +6,7 @@ class Executor(object):
     Allows submit asynchronous jobs and returns the Future object.
     """
     # timeout for jobs that may hang
-    DEFAULT_TIMEOUT = 60
+    DEFAULT_TIMEOUT = 120
 
     def submit(self, func, *args, **kwargs):
         """
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 64295d158af5..41e2b4c4683c 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -22,10 +22,10 @@
 PACKAGE_VERSION = {
     'arm_cpu': "v0.03",
 
-    'cuda':    "v0.02",
+    'cuda':    "v0.03",
     'rocm':    "v0.01",
     'opencl':  "v0.01",
-    'mali':    "v0.02",
+    'mali':    "v0.03",
 
     'vta':     "v0.01",
 }
diff --git a/python/tvm/exec/autotvm_log_editor.py b/python/tvm/exec/autotvm_log_editor.py
index c524fb5dc785..458b6eff0e44 100644
--- a/python/tvm/exec/autotvm_log_editor.py
+++ b/python/tvm/exec/autotvm_log_editor.py
@@ -10,9 +10,9 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--act", type=str, choices=['pick-best'],
+    parser.add_argument("--act", type=str, choices=['pick-best'], required=True,
                         help="The action")
-    parser.add_argument("--i", type=str, help="The input file or directory")
+    parser.add_argument("--i", type=str, help="The input file or directory", required=True)
     parser.add_argument("--o", type=str, help="The output file")
 
     args = parser.parse_args()
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index d031acdd9a2b..121498f217c4 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -187,7 +187,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
             [-1 / 6.0, 1 / 6.0, -1 / 6.0],
             [1 / 24.0, 1 / 12.0, 1 / 6.0],
             [1 / 24.0, -1 / 12.0, 1 / 6.0],
-            [0, 0, 1]], dtype=np.float32)
+            [0, 0, 1]], out_dtype)
 
         B_data = np.array([
             [4, 0, 0, 0, 0, 0],
@@ -209,7 +209,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
             [1, 0, 0],
             [1.0/2, 1.0/2, 1.0/2],
             [1.0/2, -1.0/2, 1.0/2],
-            [0, 0, 1]], np.float32)
+            [0, 0, 1]], out_dtype)
 
         B_data = np.array([
             [1, 0, 0, 0],
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index 8f0d74180449..c21273ed25a3 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -334,7 +334,7 @@ def tune_and_evaluate(tuning_opt):
 
         # evaluate
         print("Evaluate inference time cost...")
-        ftimer = module.module.time_evaluator("run", ctx, number=8, repeat=3)
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))
diff --git a/tutorials/autotvm/tune_nnvm_cuda.py b/tutorials/autotvm/tune_nnvm_cuda.py
index 1900c87aa40d..6e0ace462d6f 100644
--- a/tutorials/autotvm/tune_nnvm_cuda.py
+++ b/tutorials/autotvm/tune_nnvm_cuda.py
@@ -236,7 +236,7 @@ def tune_and_evaluate(tuning_opt):
 
         # evaluate
         print("Evaluate inference time cost...")
-        ftimer = module.module.time_evaluator("run", ctx, number=400, repeat=3)
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))
diff --git a/tutorials/autotvm/tune_nnvm_mobile_gpu.py b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
index 27cdd2e632fc..4bd6a11ca2f1 100644
--- a/tutorials/autotvm/tune_nnvm_mobile_gpu.py
+++ b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
@@ -335,7 +335,7 @@ def tune_and_evaluate(tuning_opt):
 
         # evaluate
         print("Evaluate inference time cost...")
-        ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3)
+        ftimer = module.module.time_evaluator("run", ctx, number==1, repeat=30)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))

From 6185fa81a2164e510be51cd64d1d3310c7030a30 Mon Sep 17 00:00:00 2001
From: KeDengMS <kedeng@microsoft.com>
Date: Fri, 12 Oct 2018 14:20:37 -0700
Subject: [PATCH 217/529] Fixes for tensorize in Windows build to expose
 TensorIntrin::make and search clang.exe (#1896)

---
 include/tvm/tensor_intrin.h | 15 +++++++--------
 python/tvm/contrib/clang.py |  1 +
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index fbee4bccc0bf..6cffc931d42a 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -74,13 +74,13 @@ class TensorIntrinNode : public Node {
     v->Visit("reduce_update", &reduce_update);
   }
 
-  static TensorIntrin make(std::string name,
-                           Operation op,
-                           Array<Tensor> inputs,
-                           Array<Buffer> buffers,
-                           Stmt body,
-                           Stmt reduce_init,
-                           Stmt reduce_update);
+  TVM_DLL static TensorIntrin make(std::string name,
+                                   Operation op,
+                                   Array<Tensor> inputs,
+                                   Array<Buffer> buffers,
+                                   Stmt body,
+                                   Stmt reduce_init,
+                                   Stmt reduce_update);
 
   static constexpr const char* _type_key = "TensorIntrin";
   TVM_DECLARE_NODE_TYPE_INFO(TensorIntrinNode, Node);
@@ -90,7 +90,6 @@ inline const TensorIntrinNode* TensorIntrin::operator->() const {
   return static_cast<const TensorIntrinNode*>(node_.get());
 }
 
-
 // Internal node container of tensor intrinsic calling.
 class TensorIntrinCallNode;
 
diff --git a/python/tvm/contrib/clang.py b/python/tvm/contrib/clang.py
index 19508160d42d..3e8ad663c58f 100644
--- a/python/tvm/contrib/clang.py
+++ b/python/tvm/contrib/clang.py
@@ -31,6 +31,7 @@ def find_clang(required=True):
     if hasattr(codegen, "llvm_version_major"):
         cc_list += ["clang-%d.0" % codegen.llvm_version_major()]
     cc_list += ["clang"]
+    cc_list += ["clang.exe"]
     valid_list = [util.which(x) for x in cc_list]
     valid_list = [x for x in valid_list if x]
     if not valid_list and required:

From 0198db675224b924dbce43649dc6f368f5756363 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sat, 13 Oct 2018 09:18:52 -0700
Subject: [PATCH 218/529] [Relay] [Op] Zeros, Ones (#1885)

---
 docs/langref/relay_op.rst            |  8 ++--
 include/tvm/relay/attrs/transform.h  |  8 ++--
 python/tvm/relay/op/tensor.py        | 38 +++++++++++++++++
 src/relay/op/tensor/transform.cc     | 61 ++++++++++++++++++++++++++--
 tests/python/relay/test_op_level3.py | 10 +++++
 5 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index b8cd20709a17..47cab696a8e1 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -67,7 +67,9 @@ This level enables additional math and transform operators.
 .. autosummary::
    :nosignatures:
 
+   tvm.relay.zeros
    tvm.relay.zeros_like
+   tvm.relay.ones
    tvm.relay.ones_like
    tvm.relay.reshape
    tvm.relay.copy
@@ -155,10 +157,9 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.copy
 .. autofunction:: tvm.relay.transpose
 .. autofunction:: tvm.relay.take
-
-Level 3 Definitions
--------------------
+.. autofunction:: tvm.relay.zeros
 .. autofunction:: tvm.relay.zeros_like
+.. autofunction:: tvm.relay.ones
 .. autofunction:: tvm.relay.ones_like
 
 
@@ -177,6 +178,7 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.pow
 .. autofunction:: tvm.relay.where
 
+
 Level 5 Definitions
 -------------------
 .. autofunction:: tvm.relay.image.resize
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 080a375cf1e2..278826bc825c 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -68,19 +68,19 @@ struct TakeAttrs : public tvm::AttrsNode<TakeAttrs> {
   }
 };
 
-/*! \brief Attributes used in full operator */
-struct FullAttrs : public tvm::AttrsNode<FullAttrs> {
+/*! \brief Attributes that specify a tensor */
+struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> {
   Array<IndexExpr> shape;
   DataType dtype;
 
-  TVM_DECLARE_ATTRS(FullAttrs, "relay.attrs.FullAttrs") {
+  TVM_DECLARE_ATTRS(InitOpAttrs, "relay.attrs.InitOpAttrs") {
     TVM_ATTR_FIELD(shape)
       .describe("Target shape.");
     TVM_ATTR_FIELD(dtype)
       .describe("Target data type.")
       .set_default(Int(0));
   }
-};  // struct FullAttrs
+};  // struct InitOpAttrs
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 316514801fd6..3c432b58092d 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -484,6 +484,25 @@ def left_shift(lhs, rhs):
     return _make.left_shift(lhs, rhs)
 
 
+def zeros(shape, dtype):
+    """Fill array with zeros.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.zeros(shape, dtype)
+
+
 def zeros_like(data):
     """Returns an array of zeros, with same type and shape as the input.
 
@@ -500,6 +519,25 @@ def zeros_like(data):
     return _make.zeros_like(data)
 
 
+def ones(shape, dtype):
+    """Fill array with ones.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.ones(shape, dtype)
+
+
 def ones_like(data):
     """Returns an array of ones, with same type and shape as the input.
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 23f804013809..fb7b09fd3b46 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -404,14 +404,14 @@ Examples::
 .set_support_level(2)
 .add_type_rel("Take", TakeRel);
 
-TVM_REGISTER_NODE_TYPE(FullAttrs);
+TVM_REGISTER_NODE_TYPE(InitOpAttrs);
 
 bool FullRel(const Array<Type>& types,
              int num_inputs,
              const Attrs& attrs,
              const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 2);
-  const FullAttrs* param = attrs.as<FullAttrs>();
+  const InitOpAttrs* param = attrs.as<InitOpAttrs>();
   const auto* fill_value = types[0].as<TensorTypeNode>();
   if (fill_value == nullptr) {
     return false;
@@ -433,7 +433,7 @@ bool FullRel(const Array<Type>& types,
 Expr MakeFull(Expr fill_value,
               Array<IndexExpr> shape,
               DataType dtype) {
-  auto attrs = make_node<FullAttrs>();
+  auto attrs = make_node<InitOpAttrs>();
   attrs->shape = std::move(shape);
   attrs->dtype = std::move(dtype);
   static const Op& op = Op::Get("full");
@@ -454,6 +454,61 @@ RELAY_REGISTER_OP("full")
 .set_support_level(3)
 .add_type_rel("Full", FullRel);
 
+bool InitOpRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 1);
+  const InitOpAttrs* param = attrs.as<InitOpAttrs>();
+
+  reporter->Assign(types[0], TensorTypeNode::make(param->shape, param->dtype));
+  return true;
+}
+
+Expr MakeZeros(Array<IndexExpr> shape,
+               DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("zeros");
+  return CallNode::make(op, {}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.zeros")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeZeros, args, rv);
+  });
+
+RELAY_REGISTER_OP("zeros")
+.describe(R"code(Fill array with zeros.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(0)
+.set_support_level(3)
+.add_type_rel("InitOp", InitOpRel);
+
+Expr MakeOnes(Array<IndexExpr> shape,
+              DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("ones");
+  return CallNode::make(op, {}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.ones")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeOnes, args, rv);
+  });
+
+RELAY_REGISTER_OP("ones")
+.describe(R"code(Fill array with ones.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(0)
+.set_support_level(3)
+.add_type_rel("InitOp", InitOpRel);
+
 bool FullLikeRel(const Array<Type>& types,
                  int num_inputs,
                  const Attrs& attrs,
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index cc8973c38384..9515db87e64a 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -7,6 +7,15 @@
 from tvm.relay.ir_builder import IRBuilder, func_type
 from tvm.relay.env import Environment
 
+def test_zeros_ones():
+    for op in [relay.zeros, relay.ones]:
+        ib = relay.ir_builder.IRBuilder()
+        with ib.function() as func:
+            ib.ret(op((124, 50), "float64"))
+        ib.ret(func)
+        func = relay.ir_pass.infer_type(ib.env, func.to_func())
+        ftype = func.checked_type
+        assert ftype.ret_type == relay.TensorType((124, 50), "float64")
 
 def test_unary_identity():
     for op in [relay.zeros_like, relay.ones_like]:
@@ -162,6 +171,7 @@ def test_full_like():
 
 if __name__ == "__main__":
     test_single_op()
+    test_zeros_ones()
     test_unary_identity()
     test_clip_type()
     test_copy_infer_type()

From 84055a53cb2b43c9aeee2365fb9bf5a2c1e76a5c Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Sun, 14 Oct 2018 15:37:28 -0700
Subject: [PATCH 219/529] Add test to confirm that we forbid allocate statement
 referencing undefined variable (#1899)

---
 .../unittest/test_pass_split_host_device.py     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 tests/python/unittest/test_pass_split_host_device.py

diff --git a/tests/python/unittest/test_pass_split_host_device.py b/tests/python/unittest/test_pass_split_host_device.py
new file mode 100644
index 000000000000..24cc497944d7
--- /dev/null
+++ b/tests/python/unittest/test_pass_split_host_device.py
@@ -0,0 +1,17 @@
+from nose.tools import raises
+import tvm
+
+@raises(Exception)
+def test_loop_dependent_allocate():
+    N = tvm.var("N")
+    A = tvm.placeholder((2*N,), "float32", "A")
+    C = tvm.compute((N, ), lambda i: A[2*i] + A[i+1], name='C')
+    s = tvm.create_schedule(C.op)
+    AA = s.cache_read(A, "local", [C])
+    s[AA].compute_at(s[C], s[C].op.axis[0])
+    # this line should fail due to IRUseDefAnalysis sees an allocate statement
+    # referencing undefined variable
+    tvm.lower(s, [A,C])
+
+if __name__ == "__main__":
+    test_loop_dependent_allocate()

From 75621835a7ecbe529d871c6aa29e437ce2fa7bf9 Mon Sep 17 00:00:00 2001
From: Gaoxiong <40658249+gaoxiong1@users.noreply.github.com>
Date: Mon, 15 Oct 2018 07:56:00 +0800
Subject: [PATCH 220/529] support double buffer to use in ir builder DSL(#1897)
 (#1898)

---
 src/pass/storage_flatten.cc                   |  3 +-
 .../unittest/test_pass_storage_flatten.py     | 33 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index 993f6294e15b..8c2105829839 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -59,7 +59,8 @@ class StorageFlattener : public IRMutator {
     if (op->attr_key == attr::realize_scope) {
       storage_scope_[op->node.get()] = op->value.as<StringImm>()->value;
       return this->Mutate(op->body);
-    } else if (op->attr_key == attr::double_buffer_scope) {
+    } else if (op->attr_key == attr::double_buffer_scope &&
+               op->node.node_->derived_from<OperationNode>()) {
       Operation func(op->node.node_);
       Stmt body = Mutate(op->body);
       for (int i = 0; i < func->num_outputs(); ++i) {
diff --git a/tests/python/unittest/test_pass_storage_flatten.py b/tests/python/unittest/test_pass_storage_flatten.py
index 4e2feed23eff..655df1da4e15 100644
--- a/tests/python/unittest/test_pass_storage_flatten.py
+++ b/tests/python/unittest/test_pass_storage_flatten.py
@@ -51,8 +51,41 @@ def test_flatten_storage_align():
     stmt = tvm.ir_pass.Simplify(stmt)
     assert(stmt.body.extents[0].value == 17 * 8)
 
+def test_flatten_double_buffer():
+    dtype = 'int64'
+    n = 100
+    m = 4
+    tx = tvm.thread_axis("threadIdx.x")
+    ib = tvm.ir_builder.create()
+    A = ib.pointer("float32", name="A")
+    C = ib.pointer("float32", name="C")
+    ib.scope_attr(tx, "thread_extent", 1)
+    with ib.for_range(0, n) as i:
+        B = ib.allocate("float32", m, name="B", scope="shared")
+        with ib.new_scope():
+            ib.scope_attr(B.asnode(), "double_buffer_scope", 1)
+            with ib.for_range(0, m) as j:
+                B[j] = A[i * 4 + j]
+        with ib.for_range(0, m) as j:
+            C[j] = B[j] + 1
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.StorageFlatten(stmt, {}, 64)
+    stmt = tvm.ir_pass.InjectDoubleBuffer(stmt, 2)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert isinstance(stmt.body.body, tvm.stmt.Allocate)
+    assert stmt.body.body.extents[0].value == 2
+    f = tvm.ir_pass.MakeAPI(stmt, "db", [A.asnode(), C.asnode()], 2, True)
+    f = tvm.ir_pass.ThreadSync(f, "shared")
+    count = [0]
+    def count_sync(op):
+        if isinstance(op, tvm.expr.Call) and op.name == "tvm_storage_sync":
+            count[0] += 1
+    tvm.ir_pass.PostOrderVisit(f.body, count_sync)
+    assert count[0] == 4
 
 if __name__ == "__main__":
     test_flatten_storage_align()
     test_flatten2()
     test_flatten_prefetch()
+    test_flatten_double_buffer()

From df27969e47553a50bf15738093e9fce4b20cc101 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 14 Oct 2018 18:08:52 -0700
Subject: [PATCH 221/529] [RELAY][IR] Move type_annotation to Var, remove Param
 (#1900)

---
 include/tvm/relay/expr.h                      |  69 ++++----
 include/tvm/relay/expr_functor.h              |   4 -
 python/tvm/relay/__init__.py                  |   1 -
 python/tvm/relay/expr.py                      | 159 +++++++++++++-----
 python/tvm/relay/ir_builder.py                |  33 ++--
 src/relay/ir/debug_printer.cc                 |  18 +-
 src/relay/ir/expr.cc                          |  46 ++---
 src/relay/ir/expr_functor.cc                  |  35 ++--
 src/relay/pass/alpha_eq.cc                    |  40 ++---
 src/relay/pass/dead_code.cc                   |  26 ++-
 src/relay/pass/let_list.h                     |  53 +++---
 src/relay/pass/type_infer.cc                  |  31 ++--
 src/relay/pass/util.cc                        |   8 +-
 src/relay/pass/well_formed.cc                 |   4 +-
 tests/python/relay/test_ir_builder.py         |   1 -
 tests/python/relay/test_ir_debug_printer.py   |  15 +-
 tests/python/relay/test_ir_nodes.py           |  24 +--
 tests/python/relay/test_ir_well_formed.py     |  11 +-
 tests/python/relay/test_op_level1.py          |   9 +-
 tests/python/relay/test_op_level2.py          |  33 ++--
 tests/python/relay/test_op_level3.py          |  15 +-
 tests/python/relay/test_op_level4.py          |  14 +-
 tests/python/relay/test_op_level5.py          |   4 +-
 tests/python/relay/test_pass_alpha_equal.py   |  81 ++++-----
 .../relay/test_pass_dead_code_elimination.py  |  24 ++-
 tests/python/relay/test_pass_free_vars.py     |  13 +-
 26 files changed, 375 insertions(+), 396 deletions(-)

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index c6e5573d9413..5e50cfc05e67 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -118,17 +118,27 @@ class Var;
 /*! \brief Container for Var */
 class VarNode : public ExprNode {
  public:
-  /*! \brief The name of the variable, this only acts as a hint to the user,
-   * and is not used for equality.
+  /*!
+   * \brief The name of the variable,
+   *  this only acts as a hint to the user,
+   *  and is not used for equality.
    */
   std::string name_hint;
+  /*!
+   * \brief type annotaion of the variable.
+   * This field records user provided type annotation of the Var.
+   * This field is optional and can be None.
+   */
+  Type type_annotation;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("name_hint", &name_hint);
+    v->Visit("type_annotation", &type_annotation);
     v->Visit("_checked_type_", &checked_type_);
   }
 
-  TVM_DLL static Var make(std::string name_hint);
+  TVM_DLL static Var make(std::string name_hint,
+                          Type type_annotation);
 
   static constexpr const char* _type_key = "relay.Var";
   TVM_DECLARE_NODE_TYPE_INFO(VarNode, ExprNode);
@@ -162,32 +172,6 @@ class GlobalVarNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(GlobalVar, GlobalVarNode, Expr);
 
-/*!
- * \brief Function parameter declaration.
- */
-class Param;
-/*! \brief A parameter. */
-class ParamNode : public ExprNode {
- public:
-  /*! \brief The variable */
-  Var var;
-  /*! \brief The type of the parameter */
-  Type type;
-
-  void VisitAttrs(tvm::AttrVisitor* v) final {
-    v->Visit("var", &var);
-    v->Visit("type", &type);
-    v->Visit("span", &span);
-  }
-
-  TVM_DLL static Param make(Var var, Type type);
-
-  static constexpr const char* _type_key = "relay.Param";
-  TVM_DECLARE_NODE_TYPE_INFO(ParamNode, ExprNode);
-};
-
-RELAY_DEFINE_NODE_REF(Param, ParamNode, Expr);
-
 /*!
  * \brief Function (subgraph in computational graph)
  */
@@ -196,7 +180,7 @@ class Function;
 class FunctionNode : public ExprNode {
  public:
   /*! \brief Function parameters */
-  tvm::Array<Param> params;
+  tvm::Array<Var> params;
   /*! \brief User annotated return type of the function. */
   Type ret_type;
   /*!
@@ -224,10 +208,18 @@ class FunctionNode : public ExprNode {
     v->Visit("_checked_type_", &checked_type_);
   }
 
-  Type fn_type() const;
+  /*!
+   * \brief Return the derived function annotation of this expression.
+   *
+   * \return The function type annotation.
+   * \note The function type annotation can contain IncompleteType.
+   */
+  TVM_DLL FuncType func_type_annotation() const;
 
-  TVM_DLL static Function make(tvm::Array<Param> params, Type ret_type,
-                               Expr body, tvm::Array<TypeParam> ty_params);
+  TVM_DLL static Function make(tvm::Array<Var> params,
+                               Type ret_type,
+                               Expr body,
+                               tvm::Array<TypeParam> ty_params);
 
   static constexpr const char* _type_key = "relay.Function";
   TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode);
@@ -289,7 +281,7 @@ class CallNode : public ExprNode {
   TVM_DLL static Call make(Expr op,
                            Array<Expr> args,
                            Attrs attrs = Attrs(),
-                           Array<Type> ty_args = Array<Type>());
+                           Array<Type> type_args = Array<Type>());
 
   static constexpr const char* _type_key = "relay.Call";
   TVM_DECLARE_NODE_TYPE_INFO(CallNode, ExprNode);
@@ -318,19 +310,16 @@ class LetNode : public ExprNode {
   Expr value;
   /*! \brief The body of the let binding */
   Expr body;
-  /*! \brief Type annotation of value, this can be null */
-  Type value_type;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("var", &var);
     v->Visit("value", &value);
     v->Visit("body", &body);
-    v->Visit("value_type", &value_type);
     v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
   }
 
-  TVM_DLL static Let make(Var var, Expr value, Expr body, Type value_type);
+  TVM_DLL static Let make(Var var, Expr value, Expr body);
 
   static constexpr const char* _type_key = "relay.Let";
   TVM_DECLARE_NODE_TYPE_INFO(LetNode, ExprNode);
@@ -376,11 +365,11 @@ class IfNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(If, IfNode, Expr);
 
-/*! \brief Get a field out of a tuple. */
+/*! \brief Get index-th field out of a tuple. */
 class TupleGetItem;
 class TupleGetItemNode : public ExprNode {
  public:
-  /*! \brief The tuple */
+  /*! \brief The tuple Expression */
   Expr tuple;
   /*! \brief which value to get */
   int index;
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index be174d33b4c8..c10933590f99 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -80,7 +80,6 @@ class ExprFunctor<R(const Expr& n, Args...)> {
                        Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const GlobalVarNode* op,
                        Args... args) EXPR_FUNCTOR_DEFAULT;
-  virtual R VisitExpr_(const ParamNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const FunctionNode* op,
                        Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const CallNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
@@ -103,7 +102,6 @@ class ExprFunctor<R(const Expr& n, Args...)> {
     RELAY_EXPR_FUNCTOR_DISPATCH(TupleNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(VarNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(GlobalVarNode);
-    RELAY_EXPR_FUNCTOR_DISPATCH(ParamNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(FunctionNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(CallNode);
     RELAY_EXPR_FUNCTOR_DISPATCH(LetNode);
@@ -127,7 +125,6 @@ class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
   void VisitExpr_(const GlobalVarNode* op) override;
   void VisitExpr_(const ConstantNode* op) override;
   void VisitExpr_(const TupleNode* op) override;
-  void VisitExpr_(const ParamNode* op) override;
   void VisitExpr_(const FunctionNode* op) override;
   void VisitExpr_(const CallNode* op) override;
   void VisitExpr_(const LetNode* op) override;
@@ -151,7 +148,6 @@ class ExprMutator
   Expr VisitExpr_(const GlobalVarNode* op) override;
   Expr VisitExpr_(const OpNode* op) override;
   Expr VisitExpr_(const TupleNode* op) override;
-  Expr VisitExpr_(const ParamNode* op) override;
   Expr VisitExpr_(const FunctionNode* op) override;
   Expr VisitExpr_(const CallNode* call_node) override;
   Expr VisitExpr_(const LetNode* op) override;
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 18c02a416d6b..b1085be2e1e2 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -34,7 +34,6 @@
 Tuple = expr.Tuple
 Var = expr.Var
 GlobalVar = expr.GlobalVar
-Param = expr.Param
 Function = expr.Function
 Call = expr.Call
 Let = expr.Let
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 6ed8df0d736b..a71fd329ed5b 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -11,11 +11,11 @@ class Expr(NodeBase):
     """The base type for all Relay expressions."""
     @property
     def checked_type(self):
-        """Get the checked type of relay.
+        """Get the checked type of tvm.relay.Expr.
 
         Returns
         -------
-        checked_type : relay.Type
+        checked_type : tvm.relay.Type
             The checked type.
         """
         ret = self._checked_type_
@@ -25,70 +25,97 @@ def checked_type(self):
         return ret
 
     def __call__(self, *args):
-        converted_args = []
-        for arg in args:
-            if isinstance(arg, Param):
-                converted_args.append(arg.var)
-            else:
-                converted_args.append(arg)
-
         return Call(self, args, None, None)
 
 
 @register_relay_node
 class Constant(Expr):
-    """A constant tensor in Relay, see tvm/relay/type.h for more details.
-    """
+    """A constant expression in Relay.
 
+    Parameters
+    ----------
+    data : tvm.nd.NDArray
+        The data content of the constant expression.
+    """
     def __init__(self, data):
         self.__init_handle_by_constructor__(_make.Constant, data)
 
 
 @register_relay_node
 class Tuple(Expr):
-    """A hetereogenous sequence of values.
-       see tvm/relay/type.h for more details.
-    """
+    """Tuple expression that groups several fields together.
 
+    Parameters
+    ----------
+    fields : List[tvm.relay.Expr]
+        The fields in the tuple.
+    """
     def __init__(self, fields):
         self.__init_handle_by_constructor__(_make.Tuple, fields)
 
 
 @register_relay_node
 class Var(Expr):
-    """A local variable in Relay."""
+    """A local variable in Tvm.Relay.
 
-    def __init__(self, name_hint):
-        self.__init_handle_by_constructor__(_make.Var, name_hint)
+    Local variable can be used to declare input
+    arguments to a function, or intermediate variables.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+        This name only acts as a hint, and is not used
+        for equality.
+
+    type_annotation: tvm.relay.Type, optional
+        The type annotation on the variable.
+    """
+    def __init__(self, name_hint, type_annotation=None):
+        self.__init_handle_by_constructor__(
+            _make.Var, name_hint, type_annotation)
 
 
 @register_relay_node
 class GlobalVar(Expr):
-    """A global variable in Relay."""
+    """A global variable in Tvm.Relay.
 
+    GlobalVar is used to refer to the global functions
+    stored in the environment.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+    """
     def __init__(self, name_hint):
         self.__init_handle_by_constructor__(_make.GlobalVar, name_hint)
 
 
 @register_relay_node
-class Param(Expr):
-    """A function type in Relay, see tvm/relay/type.h for more details.
-    """
+class Function(Expr):
+    """A function declaration expression.
 
-    def __init__(self, var, ty):
-        self.__init_handle_by_constructor__(_make.Param, var, ty)
+    Parameters
+    ----------
+    params: List[tvm.relay.Var]
+        List of input parameters to the function.
 
+    ret_type: tvm.relay.Type
+        The return type annotation of the function.
 
-@register_relay_node
-class Function(Expr):
-    """A function in Relay, see tvm/relay/expr.h for more details."""
+    body: tvm.relay.Expr
+        The body of the function.
 
+    type_params: Optional[List[tvm.relay.TypeParam]]
+        The additional type parameters, this is only
+        used in advanced usecase of template functions.
+    """
     def __init__(self,
                  params,
                  ret_type,
                  body,
-                 type_params=None
-                ):
+                 type_params=None):
         if type_params is None:
             type_params = convert([])
 
@@ -98,39 +125,87 @@ def __init__(self,
 
 @register_relay_node
 class Call(Expr):
-    """A function call in Relay, see tvm/relay/expr.h for more details."""
+    """Function call node in Relay.
+
+    Call node corresponds the operator application node
+    in computational graph terminology.
+
+    Parameters
+    ----------
+    op: tvm.relay.Op or any tvm.relay.Expr with function type.
+        The operation to be called.
 
-    def __init__(self, op, args, attrs, ty_args=None):
-        if not ty_args:
-            ty_args = []
+    args: List[tvm.relay.Expr]
+        The arguments to the call.
 
+    attrs: Optional[tvm.Attrs]
+        Attributes to the call, can be None
+
+    type_args: Optional[List[tvm.relay.Type]]
+        The additional type arguments, this is only
+        used in advanced usecase of template functions.
+    """
+    def __init__(self, op, args, attrs=None, type_args=None):
+        if not type_args:
+            type_args = []
         self.__init_handle_by_constructor__(
-            _make.Call, op, args, attrs, ty_args)
+            _make.Call, op, args, attrs, type_args)
 
 
 @register_relay_node
 class Let(Expr):
-    """A variable bindings in Relay, see tvm/relay/expr.h for more details."""
+    """Let variable binding expression.
+
+    Parameters
+    ----------
+    var: tvm.relay.Var
+        The local variable to be bound.
+
+    value: tvm.relay.Expr
+        The value to be bound.
 
-    def __init__(self, var, value, body, value_type=None):
+    body: tvm.relay.Expr
+        The body of the let binding.
+    """
+    def __init__(self, var, value, body):
         self.__init_handle_by_constructor__(
-            _make.Let, var, value, body, value_type)
+            _make.Let, var, value, body)
 
 
 @register_relay_node
 class If(Expr):
-    """A conditional expression in Relay, see tvm/relay/expr.h for more details."""
+    """A conditional expression in Relay.
+
+    Parameters
+    ----------
+    cond: tvm.relay.Expr
+        The condition.
 
-    def __init__(self, cond, true_value, false_value):
+    true_branch: tvm.relay.Expr
+        The expression evaluated when condition is true.
+
+    false_branch: tvm.relay.Expr
+        The expression evaluated when condition is false.
+    """
+    def __init__(self, cond, true_branch, false_branch):
         self.__init_handle_by_constructor__(
-            _make.If, cond, true_value, false_value)
+            _make.If, cond, true_branch, false_branch)
+
 
 @register_relay_node
 class TupleGetItem(Expr):
-    """An expression that get field from tuple in Relay, see tvm/relay/expr.h for more details."""
+    """Get index-th item from a tuple.
+
+    Parameters
+    ----------
+    tuple_value: tvm.relay.Expr
+        The input tuple expression.
 
-    def __init__(self, tuple_, index):
+    index: int
+        The index.
+    """
+    def __init__(self, tuple_value, index):
         self.__init_handle_by_constructor__(
-            _make.TupleGetItem, tuple_, index)
+            _make.TupleGetItem, tuple_value, index)
 
 debug_print = _expr._debug_print
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
index accb782659df..a429aea7d5ea 100644
--- a/python/tvm/relay/ir_builder.py
+++ b/python/tvm/relay/ir_builder.py
@@ -7,7 +7,7 @@
 import numpy as np
 import tvm
 from .ty import Type, FuncType, TensorType
-from .expr import Expr, Constant, Let, Var, Param, Function, If
+from .expr import Expr, Constant, Let, Var, Function, If
 from .env import Environment
 
 
@@ -98,7 +98,7 @@ def __init__(self, params, ret_type, body, type_params):
         self.type_params = type_params
 
     def param_ids(self):
-        return [p.var for p in self.params]
+        return [p for p in self.params]
 
     def to_func(self):
         """Converts a PartialFunc into a :py:class:`~relay.Function`."""
@@ -113,9 +113,8 @@ def to_func(self):
 
 def _mk_let(bindings, ret_value):
     let_expr = ret_value
-    for var, (value, ty) in reversed(list(bindings.items())):
-        let_expr = Let(var, value, let_expr, ty)
-
+    for var, value in reversed(list(bindings.items())):
+        let_expr = Let(var, value, let_expr)
     return let_expr
 
 
@@ -168,15 +167,12 @@ def exit_scope(self):
 
     #pylint: disable=invalid-name
     def bind(self, name, value, ty):
-        lv = Var(name)
+        lv = Var(name, ty)
         self.scopes[-1][name] = lv
-        self.bindings[-1][lv] = (value, ty)
+        self.bindings[-1][lv] = value
         return lv
 
     def let(self, name, value, value_type=None):
-        if isinstance(value, Param):
-            value = value.var
-
         if not isinstance(value, Expr):
             value = convert(value)
 
@@ -185,23 +181,18 @@ def let(self, name, value, value_type=None):
     def _convert_params(self, raw_params):
         relay_params = []
         for raw_param in raw_params:
-            if isinstance(raw_param, Param):
-                var = raw_param.var
+            if isinstance(raw_param, Var):
                 param = raw_param
             elif isinstance(raw_param, tuple):
                 var, ty = raw_param
-                if isinstance(var, str):
-                    var = Var(var)
                 ty = _convert_type(ty)
-                param = Param(var, ty)
-            elif isinstance(param, str):
-                var = Var(raw_param)
-                ty = None
-                param = Param(var, ty)
+                param = Var(var, ty)
+            elif isinstance(raw_param, str):
+                param = Var(raw_param, None)
             else:
                 raise Exception("unknown parameter type")
 
-            self.scopes[-1][var.name_hint] = var
+            self.scopes[-1][param.name_hint] = param
             relay_params.append(param)
 
         return relay_params
@@ -265,7 +256,7 @@ def param(self, name, ty=None):
         else:
             ty = _convert_type(ty)
 
-        return Param(Var(name), ty)
+        return Var(name, ty)
 
     def global_var(self, name):
         # type: (str) -> GlobalVar
diff --git a/src/relay/ir/debug_printer.cc b/src/relay/ir/debug_printer.cc
index 90e82d3b2dd7..cb463ef6975a 100644
--- a/src/relay/ir/debug_printer.cc
+++ b/src/relay/ir/debug_printer.cc
@@ -96,7 +96,9 @@ class TypeDocifier : private TypeFunctor<Doc(const Type& n)> {
   }
 
   std::vector<Doc> DocifyTypeParam(const tvm::Array<TypeParam>& arr) {
-    return MapDocify<TypeParam>(arr, [=](const TypeParam& tp) { return Docify(tp); });
+    return MapDocify<TypeParam>(arr, [=](const TypeParam& tp) {
+        return Docify(tp);
+      });
   }
 
   std::vector<Doc> DocifyTypeConstraint(const tvm::Array<TypeConstraint>& arr) {
@@ -188,10 +190,11 @@ class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
     return vec;
   }
 
-  std::vector<Doc> DocifyParamArray(const tvm::Array<Param>& arr) {
+  std::vector<Doc> DocifyParamArray(const tvm::Array<Var>& arr) {
     std::vector<Doc> vec;
-    for (size_t i = 0; i < arr.size(); ++i) {
-      vec.push_back(Docify(arr[i]));
+    for (Var param : arr) {
+      vec.emplace_back(TypeAnnotation(DocOfStr(VarName(param)),
+                                      param->type_annotation));
     }
     return vec;
   }
@@ -212,10 +215,6 @@ class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
     return DocOfStr(g->name_hint);
   }
 
-  Doc VisitExpr_(const ParamNode* p) final {
-    return TypeAnnotation(Docify(p->var), p->type);
-  }
-
   Doc VisitExpr_(const FunctionNode* f) final {
     return Group(TypeAnnotation(Seq("(", DocifyParamArray(f->params), ")"), f->ret_type) + Sep() +
                  DocOfStr("=>") + Sep() +
@@ -227,7 +226,8 @@ class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
   }
 
   Doc VisitExpr_(const LetNode* l) final {
-    return Group(DocOfStr("let") + Sep() + TypeAnnotation(Docify(l->var), l->value_type) + Sep() +
+    return Group(DocOfStr("let") + Sep() +
+                 TypeAnnotation(Docify(l->var), l->var->type_annotation) + Sep() +
                  DocOfStr("=") + Sep() + Docify(l->value) + DocOfStr(";") + Endl() +
                  Docify(l->body));
   }
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 6b56cb4e844f..c248ad0de6f7 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -54,20 +54,26 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     p->stream << "Tuple(" << node->fields << ")";
   });
 
-Var VarNode::make(std::string name_hint) {
+Var VarNode::make(std::string name_hint, Type type_annotation) {
   NodePtr<VarNode> n = make_node<VarNode>();
   n->name_hint = std::move(name_hint);
+  n->type_annotation = std::move(type_annotation);
   return Var(n);
 }
 
 TVM_REGISTER_API("relay._make.Var")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = VarNode::make(args[0]);
+    *ret = VarNode::make(args[0], args[1]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<VarNode>([](const VarNode *node, tvm::IRPrinter *p) {
-    p->stream << "Var(" << node->name_hint << ")";
+    p->stream << "Var(" << node->name_hint;
+    if (node->type_annotation.defined()) {
+      p->stream << ", ty=";
+      p->print(node->type_annotation);
+    }
+    p->stream << ")";
   });
 
 GlobalVar GlobalVarNode::make(std::string name_hint) {
@@ -86,24 +92,10 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     p->stream << "GlobalVar(" << node->name_hint << ")";
   });
 
-Param ParamNode::make(Var var, Type type) {
-  NodePtr<ParamNode> n = make_node<ParamNode>();
-  n->var = std::move(var);
-  n->type = std::move(type);
-  return Param(n);
-}
-
-TVM_REGISTER_API("relay._make.Param")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-  *ret = ParamNode::make(args[0], args[1]);
-});
 
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<ParamNode>([](const ParamNode *node, tvm::IRPrinter *p) {
-    p->stream << "Param(" << node->var << ", " << node->type << ")";
-});
-
-Function FunctionNode::make(tvm::Array<Param> params, Type ret_type, Expr body,
+Function FunctionNode::make(tvm::Array<Var> params,
+                            Type ret_type,
+                            Expr body,
                             tvm::Array<TypeParam> type_params) {
   NodePtr<FunctionNode> n = make_node<FunctionNode>();
   n->params = std::move(params);
@@ -113,12 +105,11 @@ Function FunctionNode::make(tvm::Array<Param> params, Type ret_type, Expr body,
   return Function(n);
 }
 
-Type FunctionNode::fn_type() const {
+FuncType FunctionNode::func_type_annotation() const {
   Array<Type> param_types;
   for (auto param : this->params) {
-    param_types.push_back(param->type);
+    param_types.push_back(param->type_annotation);
   }
-
   return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
 }
 
@@ -155,24 +146,23 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     << node->attrs << ", " << node->type_args << ")";
 });
 
-Let LetNode::make(Var var, Expr value, Expr body, Type value_type) {
+Let LetNode::make(Var var, Expr value, Expr body) {
   NodePtr<LetNode> n = make_node<LetNode>();
   n->var = std::move(var);
   n->value = std::move(value);
   n->body = std::move(body);
-  n->value_type = std::move(value_type);
   return Let(n);
 }
 
 TVM_REGISTER_API("relay._make.Let")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-  *ret = LetNode::make(args[0], args[1], args[2], args[3]);
-});
+    *ret = LetNode::make(args[0], args[1], args[2]);
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<LetNode>([](const LetNode *node, tvm::IRPrinter *p) {
   p->stream << "LetNode(" << node->var << ", " << node->value
-    << ", " << node->body << ", " << node->value_type << ")";
+            << ", " << node->body << ")";
 });
 
 If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 792f99d699dd..c55e4d672b6c 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -24,6 +24,16 @@ Expr ExprMutator::Mutate(const Expr& expr) {
 }
 
 Expr ExprMutator::VisitExpr_(const VarNode* op) {
+  // NOTE: var will only be mutated once
+  // Thanks to the memo and reused during rewriting if necessary.
+  // It is safe to assume that the
+  if (op->type_annotation.defined()) {
+    auto type = this->VisitType(op->type_annotation);
+    if (!op->type_annotation.same_as(type)) {
+      return VarNode::make(op->name_hint, type);
+    }
+  }
+  // default case return self.
   return GetRef<Expr>(op);
 }
 
@@ -55,16 +65,6 @@ Expr ExprMutator::VisitExpr_(const TupleNode* op) {
   }
 }
 
-Expr ExprMutator::VisitExpr_(const ParamNode* op) {
-  Var var = Downcast<Var>(this->Mutate(op->var));
-  auto type = this->VisitType(op->type);
-  if (op->var.same_as(var) && op->type.same_as(type)) {
-    return GetRef<Expr>(op);
-  } else {
-    return ParamNode::make(var, type);
-  }
-}
-
 Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
   tvm::Array<TypeParam> ty_params;
   bool all_ty_params_changed = true;
@@ -75,10 +75,10 @@ Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
     all_ty_params_changed &= new_ty_param.same_as(ty_param);
   }
 
-  tvm::Array<Param> params;
+  tvm::Array<Var> params;
   bool all_params_changed = true;
   for (auto param : op->params) {
-    Param new_param = Downcast<Param>(this->Mutate(param));
+    Var new_param = Downcast<Var>(this->Mutate(param));
     params.push_back(new_param);
     all_params_changed &= param.same_as(new_param);
   }
@@ -123,17 +123,15 @@ Expr ExprMutator::VisitExpr_(const CallNode* call_node) {
 
 Expr ExprMutator::VisitExpr_(const LetNode* op) {
   Var var = Downcast<Var>(this->Mutate(op->var));
-  auto type = this->VisitType(op->value_type);
   auto value = this->Mutate(op->value);
   auto body = this->Mutate(op->body);
 
   if (var.same_as(op->var) &&
-      type.same_as(op->value_type) &&
       value.same_as(op->value) &&
       body.same_as(op->body)) {
     return GetRef<Expr>(op);
   } else {
-    return LetNode::make(var, value, body, type);
+    return LetNode::make(var, value, body);
   }
 }
 
@@ -162,6 +160,9 @@ Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) {
 Type ExprMutator::VisitType(const Type& t) { return t; }
 
 void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
+  if (op->type_annotation.defined()) {
+    this->VisitType(op->type_annotation);
+  }
 }
 
 void ExprVisitor::ExprVisitor::VisitExpr_(const GlobalVarNode* op) {
@@ -176,10 +177,6 @@ void ExprVisitor::ExprVisitor::VisitExpr_(const TupleNode* op) {
   }
 }
 
-void ExprVisitor::ExprVisitor::VisitExpr_(const ParamNode* op) {
-  this->VisitExpr(op->var);
-}
-
 void ExprVisitor::ExprVisitor::VisitExpr_(const FunctionNode* op) {
   for (auto param : op->params) {
     this->VisitExpr(param);
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 0ed0e3df3056..29d2f87cf04a 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -252,15 +252,6 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
     }
   }
 
-  void VisitExpr_(const ParamNode* p1, const Expr& e2) final {
-    if (const ParamNode* p2 = e2.as<ParamNode>()) {
-      eq_map.Set(p1->var, p2->var);
-      equal = equal && AlphaEqual(p1->type, p2->type);
-    } else {
-      equal = false;
-    }
-  }
-
   void VisitExpr_(const FunctionNode* func1, const Expr& e2) final {
     if (const FunctionNode* func2 = e2.as<FunctionNode>()) {
       if (func1->params.size() != func2->params.size()) {
@@ -273,9 +264,10 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
         return;
       }
 
-      for (size_t i = 0U; i < func1->params.size(); i++) {
-        this->VisitExpr(func1->params[i], func2->params[i]);
+      for (size_t i = 0; i < func1->params.size(); ++i) {
+        MergeVarDecl(func1->params[i], func2->params[i]);
       }
+      if (!equal) return;
 
       for (size_t i = 0U; i < func1->type_params.size(); i++) {
         equal = equal && AlphaEqual(func1->type_params[i], func2->type_params[i]);
@@ -332,19 +324,9 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
 
   void VisitExpr_(const LetNode* op, const Expr& e2) final {
     if (const LetNode* let = e2.as<LetNode>()) {
-      eq_map.Set(op->var, let->var);
+      MergeVarDecl(op->var, let->var);
       this->VisitExpr(op->value, let->value);
       this->VisitExpr(op->body, let->body);
-
-      // value_type should match as well (including nulls)
-      if (op->value_type.defined() != let->value_type.defined()) {
-        equal = false;
-        return;
-      }
-
-      if (op->value_type.defined()) {
-        equal = equal && AlphaEqual(op->value_type, let->value_type);
-      }
     } else {
       equal = false;
     }
@@ -388,6 +370,20 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
       equal = false;
     }
   }
+
+ private:
+  void MergeVarDecl(const Var& var1, const Var& var2) {
+    if (var1->type_annotation.defined() != var2->type_annotation.defined()) {
+      equal = false;
+      return;
+    }
+    if (var1->type_annotation.defined() &&
+        !AlphaEqual(var1->type_annotation, var2->type_annotation)) {
+      equal = false;
+      return;
+    }
+    eq_map.Set(var1, var2);
+  }
 };
 
 bool AlphaEqual(const Expr& e1, const Expr& e2) {
diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc
index 05036042a635..2e2eca1f2739 100644
--- a/src/relay/pass/dead_code.cc
+++ b/src/relay/pass/dead_code.cc
@@ -54,12 +54,7 @@ class CalcDep : private ExprMutator {
   }
 
  private:
-  struct Binder {
-    Type t;
-    Expr e;
-    Binder(const Type& t, const Expr& e) : t(t), e(e) { }
-  };
-  using VarMap = std::unordered_map<Var, Binder, NodeHash, NodeEqual>;
+  using VarMap = std::unordered_map<Var, Expr, NodeHash, NodeEqual>;
   VarMap var_map_;
 
   Expr VisitExpr_(const IfNode* i) final {
@@ -74,9 +69,7 @@ class CalcDep : private ExprMutator {
   }
 
   Expr VisitExpr_(const LetNode* l) final {
-    var_map_.insert(std::pair<Var, Binder>(l->var,
-                                           Binder(l->value_type,
-                                                  Eliminate(l->value))));
+    var_map_[l->var] = Eliminate(l->value);
     return VisitExpr(l->body);
   }
 
@@ -92,15 +85,16 @@ class CalcDep : private ExprMutator {
     explicit GenLet(const VarMap& var_map) : var_map_(var_map) { }
     friend CalcDep;
 
-    void VisitExpr_(const VarNode* vn) final {
-      Var v = GetRef<Var>(vn);
-      if (var_map_.count(v) != 0) {
-        auto val = var_map_.at(v);
-        var_map_.erase(v);
+    void VisitExpr_(const VarNode* vnode) final {
+      Var v = GetRef<Var>(vnode);
+      auto it = var_map_.find(v);
+      if (it != var_map_.end()) {
+        Expr expr = it->second;
+        var_map_.erase(it);
         // erase before visit to handle letrec
-        VisitExpr(val.e);
+        VisitExpr(expr);
         // visit before push back so the dependency of dependency is before the dependency
-        lets_.Push(v, val.t, val.e);
+        lets_.Push(v, expr);
       }
     }
   };
diff --git a/src/relay/pass/let_list.h b/src/relay/pass/let_list.h
index d13358fe0e30..43b8bb8bba1d 100644
--- a/src/relay/pass/let_list.h
+++ b/src/relay/pass/let_list.h
@@ -26,57 +26,46 @@ namespace relay {
  */
 class LetList {
  public:
-  /*! \brief insert a binding.
+  /*!
+   * \brief insert a binding.
    *
-   *  \param pv the var of the binding.
+   * \param pv the var of the binding.
    *
-   *  \param ty the type of the binding.
+   * \param expr the value of the binding.
    *
-   *  \param expr the value of the binding.
-   *
-   *  \return a Var that hold the inserted expr.
+   * \return a Var that hold the inserted expr.
    */
-  Var Push(const Var& pv, const Type& ty, const Expr& expr) {
-    std::tuple<Var, Type, Expr> tuple(pv, ty, expr);
-    lets_.push_back(tuple);
+  Var Push(Var pv, Expr expr) {
+    lets_.emplace_back(std::make_pair(pv, expr));
     return pv;
   }
 
-  /*! \brief insert a binding.
+  /*!
+   * \brief insert a binding.
    *
-   *  \param ty the type of the binding.
+   * \param ty the type of the binding.
    *
-   *  \param expr the value of the binding.
+   * \param expr the value of the binding.
    *
-   *  \return a Var that hold the inserted expr.
-   */
-  Var Push(const Type& ty, const Expr& expr) {
-    return Push(VarNode::make("x"), ty, expr);
-  }
-
-  /*! \brief insert a binding.
-   *
-   *  \param pv the var of the binding.
-   *
-   *  \param expr the value of the binding.
-   *
-   *  \return a Var that hold the inserted expr.
+   * \return a Var that hold the inserted expr.
    */
-  Var Push(const Var& pv, const Expr& expr) {
-    return Push(pv, IncompleteTypeNode::make(TypeParamNode::kType), expr);
+  Var Push(Type ty, Expr expr) {
+    return Push(VarNode::make("x", ty), expr);
   }
 
-  /*! \brief insert a binding.
+  /*!
+   * \brief insert a binding.
    *
    *  \param expr the value of the binding.
    *
    *  \return a Var that hold the inserted expr.
    */
-  Var Push(const Expr& expr) {
+  Var Push(Expr expr) {
     return Push(IncompleteTypeNode::make(TypeParamNode::kType), expr);
   }
 
-  /*! \brief wrap an expr around the LetList.
+  /*!
+   * \brief wrap an expr around the LetList.
    *
    *  \param body the Expression to be wrapped around.
    *
@@ -85,7 +74,7 @@ class LetList {
   Expr Get(const Expr& body) const {
     Expr ret = body;
     for (auto rit = lets_.rbegin(); rit != lets_.rend(); ++rit) {
-      ret = LetNode::make(std::get<0>(*rit), std::get<2>(*rit), ret, std::get<1>(*rit));
+      ret = LetNode::make(std::get<0>(*rit), std::get<1>(*rit), ret);
     }
     return ret;
   }
@@ -118,7 +107,7 @@ class LetList {
   }
 
  private:
-  std::vector<std::tuple<Var, Type, Expr> > lets_;
+  std::vector<std::pair<Var, Expr> > lets_;
 };
 
 }  // namespace relay
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 72bdaf69f061..1b30865eacb1 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -87,15 +87,11 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
   // Visitor logics
   Type VisitExpr_(const VarNode* op) final {
-    // The type of Var can already been lookedup in type_map_;
-    LOG(FATAL) << "Cannot find binding for var " << GetRef<Var>(op);
-    return Type();
-  }
-
-  Type VisitExpr_(const ParamNode* op) final {
-    // directly handled by Funtion
-    LOG(FATAL) << "not reached";
-    return Type();
+    if (op->type_annotation.defined()) {
+      return op->type_annotation;
+    } else {
+      return IncompleteTypeNode::make(TypeParamNode::kType);
+    }
   }
 
   Type VisitExpr_(const GlobalVarNode* op) final {
@@ -139,11 +135,11 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
   Type VisitExpr_(const LetNode* op) final {
     Type vtype = GetType(op->value);
-    if (op->value_type.defined()) {
-      vtype = Unify(vtype, op->value_type, op->span);
+    if (op->var->type_annotation.defined()) {
+      vtype = Unify(vtype, op->var->type_annotation, op->span);
     }
     CHECK(!type_map_.count(op->var));
-    // NOTE: no scoping is necessary becase var are unique in program
+    // NOTE: no scoping is necessary because var are unique in program
     type_map_[op->var] = vtype;
     return GetType(op->body);
   }
@@ -256,8 +252,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
   Type VisitExpr_(const FunctionNode* f) final {
     for (auto param : f->params) {
-      type_map_[param->var] = param->type;
-      type_map_[param] = param->type;
+      GetType(param);
     }
     Type rtype = GetType(f->body);
     // Run solver using the currently known information
@@ -265,8 +260,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     // Trying to resolve
     Array<Type> arg_types;
     for (size_t i = 0; i < f->params.size(); ++i) {
-      Param param = f->params[i];
-      Type atype = solver_.Resolve(param->type);
+      Type atype = solver_.Resolve(GetType(f->params[i]));
       CHECK(atype.as<IncompleteTypeNode>() == nullptr)
           << "Cannot resolve type of " << i
           << "-th parameter of function at" << f->span;
@@ -311,9 +305,6 @@ class TypeInferencer::Resolver : public ExprMutator {
     return AttachCheckedType(op);
   }
 
-  Expr VisitExpr_(const ParamNode* op) final {
-    return ExprMutator::VisitExpr_(op);
-  }
 
   Expr VisitExpr_(const FunctionNode* op) final {
     return AttachCheckedType(op);
@@ -380,7 +371,7 @@ Expr InferType(const Environment& env,
                const GlobalVar& var,
                const Function& func) {
   Function func_copy = Function(make_node<FunctionNode>(*func.operator->()));
-  func_copy->checked_type_ = func_copy->fn_type();
+  func_copy->checked_type_ = func_copy->func_type_annotation();
   env->functions.Set(var, func_copy);
   Expr func_ret = TypeInferencer(env).Infer(func_copy);
   auto map_node = env->functions.CopyOnWrite();
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index 5f87c3d4cb89..c845995b2003 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -50,14 +50,17 @@ class FreeVar : public ExprVisitor {
     if (bound_vars.count(var) == 0) {
       free_vars.insert(var);
     }
+    if (v->type_annotation.defined()) {
+      VisitType(v->type_annotation);
+    }
   }
 
   void VisitExpr_(const FunctionNode *f) final {
     for (const auto& tp : f->type_params) {
       bound_types.insert(tp);
     }
-    for (const auto& p : f->params) {
-      bound_vars.insert(p->var);
+    for (const auto& param : f->params) {
+      bound_vars.insert(param);
     }
     VisitExpr(f->body);
     VisitType(f->ret_type);
@@ -67,7 +70,6 @@ class FreeVar : public ExprVisitor {
     bound_vars.insert(l->var);
     VisitExpr(l->value);
     VisitExpr(l->body);
-    VisitType(l->value_type);
   }
 
  public:
diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc
index a9bce74926bf..e008a72e5d90 100644
--- a/src/relay/pass/well_formed.cc
+++ b/src/relay/pass/well_formed.cc
@@ -34,8 +34,8 @@ class WellFormedChecker : private ExprVisitor {
   }
 
   void VisitExpr_(const FunctionNode * f) final {
-    for (const Param & p : f->params) {
-      Check(p->var);
+    for (const Var & param : f->params) {
+      Check(param);
     }
     CheckWellFormed(f->body);
   }
diff --git a/tests/python/relay/test_ir_builder.py b/tests/python/relay/test_ir_builder.py
index c98f920ca491..165c66f17ac3 100644
--- a/tests/python/relay/test_ir_builder.py
+++ b/tests/python/relay/test_ir_builder.py
@@ -14,7 +14,6 @@ def test_let():
     assert var == prog.body
     assert isinstance(value, Constant)
     assert value.data.asnumpy() == np.array(1)
-    assert prog.value_type == None
 
 if __name__ == "__main__":
     test_let()
diff --git a/tests/python/relay/test_ir_debug_printer.py b/tests/python/relay/test_ir_debug_printer.py
index e5f9ad2e69cd..b8aa86a87638 100644
--- a/tests/python/relay/test_ir_debug_printer.py
+++ b/tests/python/relay/test_ir_debug_printer.py
@@ -49,18 +49,11 @@ def test_global_var():
     show(gv)
 
 
-def test_param():
-    lv = relay.Var('x')
-    ty = None
-    param = relay.Param(lv, ty)
-    show(lv)
-
-
 def test_function():
     param_names = ['a', 'b', 'c', 'd']
-    params = tvm.convert([relay.Param(relay.Var(n), None) for n in param_names])
+    params = tvm.convert([relay.Var(n) for n in param_names])
     ret_type = None
-    body = params[0].var
+    body = params[0]
     type_params = tvm.convert([])
     fn = relay.Function(params, ret_type, body, type_params)
     show(fn)
@@ -76,11 +69,11 @@ def test_call():
 
 
 def test_let():
-    lv = relay.Var('x')
     ty = relay.ty.TensorType((10, 20), 'float32')
+    lv = relay.Var('x', ty)
     arr = tvm.nd.array(10)
     value = relay.Constant(arr)
-    let = relay.Let(lv, value, lv, ty)
+    let = relay.Let(lv, value, lv)
     show(let)
 
 
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index 79883ed225e0..e571f2a9c99a 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -99,10 +99,16 @@ def test_tuple():
 def test_local_var():
     name_hint = 's'
     lv = relay.Var(name_hint)
-    lv.name_hint == name_hint
+    assert lv.name_hint == name_hint
+    assert lv.type_annotation is None
     # assert lv.span == None todo(@jroesch): what do we do about spans
     str(lv)
 
+    t1 = relay.ty.TensorType((), "float")
+    lv = relay.Var(name_hint, t1)
+    assert lv.name_hint == name_hint
+    assert lv.type_annotation == t1
+
 
 def test_global_var():
     name_hint = 'g'
@@ -112,19 +118,9 @@ def test_global_var():
     str(gv)
 
 
-def test_param():
-    lv = relay.Var('x')
-    ty = None
-    param = relay.Param(lv, ty)
-    assert param.var == lv
-    assert param.type == ty
-    assert param.span == None
-    str(param)
-
-
 def test_function():
     param_names = ['a', 'b', 'c', 'd']
-    params = tvm.convert([relay.Param(relay.Var(n), None) for n in param_names])
+    params = tvm.convert([relay.Var(n) for n in param_names])
     ret_type = None
     body = None
     type_params = tvm.convert([])
@@ -154,10 +150,9 @@ def test_let():
     value = relay.Constant(arr)
     # I would prefer that the order of arguments
     # matches syntax let x: t = v in b
-    let = relay.Let(lv, value, lv, ty)
+    let = relay.Let(lv, value, lv)
     assert let.var == lv
     assert let.value == value
-    assert let.value_type == ty
     assert let.body == lv
     assert let.span == None
     str(let)
@@ -194,7 +189,6 @@ def test_tuple_get_item():
     test_tuple()
     test_local_var()
     test_global_var()
-    test_param()
     test_function()
     test_call()
     test_let()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
index c6cb99662bb5..d555c2beb627 100644
--- a/tests/python/relay/test_ir_well_formed.py
+++ b/tests/python/relay/test_ir_well_formed.py
@@ -7,23 +7,22 @@ def test_well_formed():
     assert well_formed(x)
     v = relay.Constant(tvm.nd.array(10))
     ty = None
-    let = relay.Let(x, v, x, ty)
+    let = relay.Let(x, v, x)
     assert well_formed(let)
-    assert not well_formed(relay.Let(x, v, let, ty))
-    f = relay.Function([relay.Param(x, ty)], ty, x)
+    assert not well_formed(relay.Let(x, v, let))
+    f = relay.Function([x], ty, x)
     assert well_formed(f)
     # this test should pass in case of weak uniqueness (only test for shadowing)
     # but we want all binder to be distinct from each other.
     assert not well_formed(relay.Let(relay.Var("y"), f,
-                                     relay.Let(relay.Var("z"), f, v, ty), ty))
+                                     relay.Let(relay.Var("z"), f, v)))
 
 
 def test_tuple():
     x = relay.Var('x')
     assert well_formed(x)
     v = relay.Constant(tvm.nd.array(10))
-    ty = None
-    let = relay.Let(x, v, x, ty)
+    let = relay.Let(x, v, x)
     assert well_formed(let)
     assert well_formed(relay.Tuple([v, v]))
     assert not well_formed(relay.Tuple([let, let]))
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index a90f6eb55ae1..05c02ab5d197 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -27,6 +27,8 @@ def check_single_op(opfunc):
                    tvm.relay.sigmoid, tvm.relay.tanh]:
         check_single_op(opfunc)
 
+
+
 def test_expand_dims_infer_type():
     ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
@@ -75,12 +77,13 @@ def test_unary_op():
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         with ib.function(x) as func:
-            ib.ret(op(x.var))
+            ib.ret(op(x))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((10, 4), "int32")
 
+
 def test_binary_op():
     def check_binary_op(opfunc):
         """
@@ -94,7 +97,7 @@ def check_binary_op(opfunc):
         x = b.param('x', tensor_type(5, 5, 5))
         y = b.param('y', tensor_type(5, 5, 5))
         with b.function(x, y) as func:
-            b.ret(opfunc(x.var, y.var))
+            b.ret(opfunc(x, y))
         b.ret(func)
         prog, env = b.get()
         ttype = tensor_type(5, 5, 5)
@@ -118,7 +121,7 @@ def check_binary_broadcast_op(opfunc):
         x = b.param('x', tensor_type(10, 4))
         y = b.param('y', tensor_type(5, 10, 1))
         with b.function(x, y) as func:
-            b.ret(opfunc(x.var, y.var))
+            b.ret(opfunc(x, y))
         b.ret(func)
         prog, env = b.get()
 
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index f67faea19be1..d0d02aece06d 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -11,7 +11,7 @@ def test_conv2d_infer_type():
     w = ib.param("w", relay.ty.IncompleteType())
 
     with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x.var, w.var,
+        ib.ret(relay.nn.conv2d(x, w,
                                kernel_size=(3, 3),
                                padding=(1, 1),
                                channels=2))
@@ -29,7 +29,7 @@ def test_conv2d_infer_type():
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
     w = ib.param("w", relay.ty.TensorType((2, 10, 3, 3), "int8"))
     with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x.var, w.var, out_dtype="int32"))
+        ib.ret(relay.nn.conv2d(x, w, out_dtype="int32"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -42,7 +42,7 @@ def test_conv2d_infer_type():
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
     w = ib.param("w", relay.ty.IncompleteType())
     with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x.var, w.var,
+        ib.ret(relay.nn.conv2d(x, w,
                                kernel_size=(3, 3),
                                padding=(1, 1),
                                channels=16,
@@ -65,7 +65,7 @@ def test_conv2d_transpose_infer_type():
     w = ib.param("w", relay.ty.IncompleteType())
 
     with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d_transpose(x.var, w.var,
+        ib.ret(relay.nn.conv2d_transpose(x, w,
                                          kernel_size=(3, 3),
                                          padding=(1, 1),
                                          channels=15))
@@ -83,7 +83,7 @@ def test_conv2d_transpose_infer_type():
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
     w = ib.param("w", relay.ty.TensorType((12, 11, 5, 5), "float32"))
     with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d_transpose(x.var, w.var,
+        ib.ret(relay.nn.conv2d_transpose(x, w,
                                          output_padding=(1, 1),
                                          channels=11,
                                          data_layout="NHWC"))
@@ -98,7 +98,7 @@ def test_upsampling_infer_type():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
     with ib.function(x) as func:
-        ib.ret(relay.nn.upsampling(x.var, scale=2, layout="NCHW", method="BILINEAR"))
+        ib.ret(relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -108,7 +108,7 @@ def test_upsampling_infer_type():
     n, c = tvm.var("n"), tvm.var("c")
     x = ib.param("x", relay.ty.TensorType((n, c, 100, 200), "float32"))
     with ib.function(x) as func:
-        ib.ret(relay.nn.upsampling(x.var, scale=2, layout="NCHW", method="BILINEAR"))
+        ib.ret(relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -119,7 +119,7 @@ def _test_pool2d_infer_type(opfunc):
     n, c, h, w = tvm.var("n"), 10, 224, 224
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
     with ib.function(x) as func:
-        ib.ret(opfunc(x.var, pool_size=(1, 1)))
+        ib.ret(opfunc(x, pool_size=(1, 1)))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -132,7 +132,7 @@ def _test_pool2d_infer_type(opfunc):
     n, c, h, w = tvm.var("n"), 10, 224, 224
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
     with ib.function(x) as func:
-        ib.ret(opfunc(x.var, pool_size=(ph, pw), strides=(sh, sw)))
+        ib.ret(opfunc(x, pool_size=(ph, pw), strides=(sh, sw)))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -144,7 +144,7 @@ def _test_global_pool2d_infer_type(opfunc):
     n, c, h, w = tvm.var("n"), tvm.var("c"), 224, 224
     x = ib.param("x", relay.ty.TensorType((n, h, w, c), "float32"))
     with ib.function(x) as func:
-        ib.ret(opfunc(x.var, layout="NHWC"))
+        ib.ret(opfunc(x, layout="NHWC"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -154,7 +154,7 @@ def _test_global_pool2d_infer_type(opfunc):
     n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
     with ib.function(x) as func:
-        ib.ret(opfunc(x.var))
+        ib.ret(opfunc(x))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -172,7 +172,7 @@ def test_flatten_infer_type():
     x = ib.param("x", relay.ty.TensorType((d1, d2, d3, d4), "float32"))
 
     with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x.var))
+        ib.ret(relay.nn.batch_flatten(x))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -181,7 +181,7 @@ def test_flatten_infer_type():
     ib = relay.ir_builder.IRBuilder()
     x = ib.param("x", relay.ty.TensorType((3, 2, 4, 3), "float32"))
     with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x.var))
+        ib.ret(relay.nn.batch_flatten(x))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -190,7 +190,7 @@ def test_flatten_infer_type():
     ib = relay.ir_builder.IRBuilder()
     x = ib.param("x", relay.ty.TensorType((d1, 2, d3, 3), "float32"))
     with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x.var))
+        ib.ret(relay.nn.batch_flatten(x))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -202,7 +202,7 @@ def test_pad_infer_type():
     n, c, h, w = 1, 2, 3, 4
     t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
     with ib.function(t) as func:
-        ib.ret(relay.nn.pad(t.var, ((1, 1), (2, 2), (3, 3), (4, 4))))
+        ib.ret(relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -213,7 +213,7 @@ def test_pad_infer_type():
     n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
     t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
     with ib.function(t) as func:
-        ib.ret(relay.nn.pad(t.var, ((1, 1), (2, 2), (3, 3), (4, 4))))
+        ib.ret(relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -227,4 +227,3 @@ def test_pad_infer_type():
     test_flatten_infer_type()
     test_pad_infer_type()
     test_conv2d_transpose_infer_type()
-
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 9515db87e64a..7d949b21026b 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -17,12 +17,13 @@ def test_zeros_ones():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((124, 50), "float64")
 
+
 def test_unary_identity():
     for op in [relay.zeros_like, relay.ones_like]:
         ib = relay.ir_builder.IRBuilder()
         x = ib.param("x", relay.TensorType((8, 9, 4), "int32"))
         with ib.function(x) as func:
-            ib.ret(op(x.var))
+            ib.ret(op(x))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -33,7 +34,7 @@ def test_clip_type():
     ib = relay.ir_builder.IRBuilder()
     a = ib.param("a", relay.TensorType((10, 4), "float32"))
     with ib.function(a) as func:
-        ib.ret(relay.clip(a.var, 1., 4.))
+        ib.ret(relay.clip(a, 1., 4.))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -106,7 +107,7 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
         x = ib.param("x", relay.ty.TensorType(dshape, "float32"))
         indices = ib.param("indices", relay.ty.TensorType(indices_shape, "int32"))
         with ib.function(x, indices) as func:
-            ib.ret(relay.take(x.var, indices.var, axis=axis))
+            ib.ret(relay.take(x, indices, axis=axis))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -127,7 +128,7 @@ def test_full():
     ib = relay.ir_builder.IRBuilder()
     x = ib.param("x", relay.TensorType((), "int8"))
     with ib.function(x) as func:
-        ib.ret(relay.full(x.var, ()))
+        ib.ret(relay.full(x, ()))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -137,7 +138,7 @@ def test_full():
     ib = relay.ir_builder.IRBuilder()
     x = ib.param("x", relay.TensorType((), "float32"))
     with ib.function(x) as func:
-        ib.ret(relay.full(x.var, (1, 2), "int8"))
+        ib.ret(relay.full(x, (1, 2), "int8"))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -150,7 +151,7 @@ def test_full_like():
     base = ib.param("base", relay.TensorType((1, 2, 3), "float32"))
     fill = ib.param("fill", relay.TensorType((), "float32"))
     with ib.function(base, fill) as func:
-        ib.ret(relay.full_like(base.var, fill.var))
+        ib.ret(relay.full_like(base, fill))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -162,7 +163,7 @@ def test_full_like():
     base = ib.param("base", relay.TensorType((n, c, h, w), "float32"))
     fill = ib.param("fill", relay.TensorType((), "float32"))
     with ib.function(base, fill) as func:
-        ib.ret(relay.full_like(base.var, fill.var))
+        ib.ret(relay.full_like(base, fill))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 807d3a3a964e..995e15fb9760 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -24,7 +24,7 @@ def test_cmp_type():
         x = ib.param("x", relay.TensorType((10, 4), "float32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
         with ib.function(x, y) as func:
-            ib.ret(op(x.var, y.var))
+            ib.ret(op(x, y))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -39,7 +39,7 @@ def test_binary_broadcast():
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
         with ib.function(x, y) as func:
-            ib.ret(op(x.var, y.var))
+            ib.ret(op(x, y))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -58,7 +58,7 @@ def check_binary_op(opfunc):
         x = b.param('x', tensor_type(5, 5, 5))
         y = b.param('y', tensor_type(5, 5, 5))
         with b.function(x, y) as func:
-            b.ret(opfunc(x.var, y.var))
+            b.ret(opfunc(x, y))
         b.ret(func)
         prog, env = b.get()
         ttype = tensor_type(5, 5, 5)
@@ -81,7 +81,7 @@ def check_binary_broadcast_op(opfunc):
         x = b.param('x', tensor_type(10, 4))
         y = b.param('y', tensor_type(5, 10, 1))
         with b.function(x, y) as func:
-            b.ret(opfunc(x.var, y.var))
+            b.ret(opfunc(x, y))
         b.ret(func)
         prog, env = b.get()
 
@@ -103,7 +103,7 @@ def test_cmp_type():
         x = ib.param("x", relay.TensorType((10, 4), "float32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
         with ib.function(x, y) as func:
-            ib.ret(op(x.var, y.var))
+            ib.ret(op(x, y))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -118,7 +118,7 @@ def test_binary_broadcast():
         x = ib.param("x", relay.TensorType((10, 4), "int32"))
         y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
         with ib.function(x, y) as func:
-            ib.ret(op(x.var, y.var))
+            ib.ret(op(x, y))
         ib.ret(func)
         func = relay.ir_pass.infer_type(ib.env, func.to_func())
         ftype = func.checked_type
@@ -131,7 +131,7 @@ def test_where():
     x = ib.param("x", relay.TensorType((3, 4), "float32"))
     y = ib.param("y", relay.TensorType((3, 4), "float32"))
     with ib.function(cond, x, y) as func:
-        ib.ret(relay.where(cond.var, x.var, y.var))
+        ib.ret(relay.where(cond, x, y))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 62da592e8249..8d871e9ef4f5 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -10,7 +10,7 @@ def test_resize_infer_type():
     th, tw = tvm.var("th"), tvm.var("tw")
 
     with ib.function(x) as func:
-        ib.ret(relay.image.resize(x.var, (th, tw)))
+        ib.ret(relay.image.resize(x, (th, tw)))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
@@ -19,7 +19,7 @@ def test_resize_infer_type():
     ib = relay.ir_builder.IRBuilder()
     x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
     with ib.function(x) as func:
-        ib.ret(relay.image.resize(x.var, (100, 200), "NCHW", "BILINEAR", False))
+        ib.ret(relay.image.resize(x, (100, 200), "NCHW", "BILINEAR", False))
     ib.ret(func)
     func = relay.ir_pass.infer_type(ib.env, func.to_func())
     ftype = func.checked_type
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index dd722399dac4..04ef3cf3da8f 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -1,4 +1,5 @@
 import tvm
+import numpy as np
 from tvm import relay
 from tvm.relay.ir_pass import alpha_equal
 from tvm.relay.ir_builder import convert
@@ -179,9 +180,9 @@ def test_var_alpha_equal():
     assert not alpha_equal(v1, v2)
 
     # let node allows for setting the eq_map
-    l1 = relay.Let(v1, convert(1), v1, None)
-    l2 = relay.Let(v2, convert(1), v2, None)
-    l3 = relay.Let(v1, convert(1), v2, None)
+    l1 = relay.Let(v1, convert(1), v1)
+    l2 = relay.Let(v2, convert(1), v2)
+    l3 = relay.Let(v1, convert(1), v2)
 
     assert alpha_equal(l1, l2)
     assert not alpha_equal(l1, l3)
@@ -209,10 +210,10 @@ def test_tuple_alpha_equal():
     assert alpha_equal(tup, same)
 
     # use the eq_map
-    let_tup = relay.Let(v1, tup, v1, None)
+    let_tup = relay.Let(v1, tup, v1)
     let_mapped = relay.Let(v2, relay.Tuple([v2, convert(2), convert(3),
                                             relay.Tuple([convert(4)])]),
-                           v2, None)
+                           v2)
     assert alpha_equal(let_tup, let_mapped)
 
     more_fields = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)]), v2])
@@ -242,61 +243,44 @@ def test_tuple_get_item_alpha_equal():
     assert alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1))
 
 
-def test_param_alpha_equal():
-    # only checks equality of the types
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-
-    p1 = relay.Param(v1, relay.TensorType((1, 2, 3), "float32"))
-    p2 = relay.Param(v2, relay.TensorType((1, 2, 3), "float32"))
-    assert alpha_equal(p1, p2)
-
-    p3 = relay.Param(v1, relay.TensorType((4, 5, 6), "int8"))
-    assert not alpha_equal(p1, p3)
-
-    p4 = relay.Param(v1, relay.TupleType([relay.TensorType((1, 2, 3),
-                                                           "float32")]))
-    assert not alpha_equal(p1, p4)
-
-
 def test_function_alpha_equal():
-    v1 = relay.Var("v1")
-    v2 = relay.Var("v2")
-    v3 = relay.Var("v3")
-    v4 = relay.Var("v4")
-
     tt1 = relay.TensorType((1, 2, 3), "float32")
     tt2 = relay.TensorType((4, 5, 6), "int8")
     tt3 = relay.TupleType([tt1, tt2])
 
+    v1 = relay.Var("v1", tt1)
+    v2 = relay.Var("v2", tt2)
+    v3 = relay.Var("v3", tt3)
+    v4 = relay.Var("v4", tt2)
+    vret = relay.Constant(tvm.nd.array(np.ones(1)))
+
     tp1 = relay.TypeParam("tp1", relay.Kind.Type)
     tp2 = relay.TypeParam("tp2", relay.Kind.Type)
     tp3 = relay.TypeParam("tp3", relay.Kind.Shape)
     tp4 = relay.TypeParam("tp4", relay.Kind.Shape)
 
-    basic_args = [relay.Param(v3, tt1), relay.Param(v4, tt2)]
+    basic_args = [relay.Var("v3", tt1), relay.Var("v4", tt2)]
     basic_tps = [tp1, tp2]
 
-    func = relay.Function([relay.Param(v1, tt1), relay.Param(v2, tt2)],
-                          tt2, v2, basic_tps)
-    mapped = relay.Function(basic_args, tt2, v4, basic_tps)
+    func = relay.Function([v1, v2],
+                          tt2, v1, basic_tps)
+    mapped = relay.Function(basic_args, tt2, basic_args[0], basic_tps)
     assert alpha_equal(func, mapped)
 
-    fewer_params = relay.Function([relay.Param(v4, tt2)], tt2, v4, basic_tps)
+    fewer_params = relay.Function([relay.Var("v4", tt2)], tt2, v4, basic_tps)
     assert not alpha_equal(func, fewer_params)
 
-    more_params = relay.Function([relay.Param(v3, tt1), relay.Param(v4, tt2),
-                                  relay.Param(v2, tt2)], tt2, v4, basic_tps)
+    more_params = relay.Function([relay.Var("v3", tt1),
+                                  relay.Var("v4", tt2),
+                                  relay.Var("v2", tt2)], tt2, v4, basic_tps)
     assert not alpha_equal(func, more_params)
 
-    params_unordered = relay.Function([relay.Param(v3, tt2),
-                                       relay.Param(v4, tt1)],
-                                      tt1, v3, basic_tps)
+    params_unordered = relay.Function([v2, v1],
+                                      tt2, v1, basic_tps)
     assert not alpha_equal(func, params_unordered)
 
-    params_mismatch = relay.Function([relay.Param(v3, tt3),
-                                      relay.Param(v4, tt2)],
-                                     tt2, v4, basic_tps)
+    params_mismatch = relay.Function([v1, v3],
+                                     tt2, v1, basic_tps)
     assert not alpha_equal(func, params_mismatch)
 
     # also would not typecheck
@@ -376,7 +360,10 @@ def test_call_alpha_equal():
 
 
 def test_let_alpha_equal():
+    tt1 = relay.TensorType((), "float32")
+    tt2 = relay.TensorType((), "int8")
     v1 = relay.Var("v1")
+    v1_wtype = relay.Var("v1", tt1)
     v2 = relay.Var("v2")
     v3 = relay.Var("v3")
 
@@ -394,14 +381,13 @@ def test_let_alpha_equal():
     assert not alpha_equal(let, different_body)
 
     # specified types must match
-    tt1 = relay.TensorType((), "float32")
-    tt2 = relay.TensorType((), "int8")
-    let_with_type = relay.Let(v1, convert(2), v1, tt1)
-    same_type = relay.Let(v1, convert(2), v1, tt1)
+
+    let_with_type = relay.Let(v1_wtype, convert(2), v1_wtype)
+    same_type = relay.Let(v1_wtype, convert(2), v1_wtype)
     assert alpha_equal(let_with_type, same_type)
     assert not alpha_equal(let, let_with_type)
-
-    different_type = relay.Let(v1, convert(2), v1, tt2)
+    v2 = relay.Var("v1", tt2)
+    different_type = relay.Let(v2, convert(2), v2)
     assert not alpha_equal(let_with_type, different_type)
 
 
@@ -437,16 +423,13 @@ def test_op_alpha_equal():
     test_tensor_type_alpha_equal()
     test_incomplete_type_alpha_equal()
     test_constant_alpha_equal()
-    test_type_param_alpha_equal()
     test_func_type_alpha_equal()
     test_tuple_type_alpha_equal()
     test_type_relation_alpha_equal()
     test_constant_alpha_equal()
-    test_var_alpha_equal()
     test_global_var_alpha_equal()
     test_tuple_alpha_equal()
     test_tuple_get_item_alpha_equal()
-    test_param_alpha_equal()
     test_function_alpha_equal()
     test_call_alpha_equal()
     test_let_alpha_equal()
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index ce9bda3d254f..121cea0081bd 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -28,17 +28,17 @@ def __init__(self):
 
 
 def test_let():
-    orig = relay.Let(e.x, e.y, e.z, e.tt)
+    orig = relay.Let(e.x, e.y, e.z)
     assert alpha_equal(dead_code_elimination(orig), e.z)
 
 
 def test_used_let():
-    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c, e.tt), e.tt)
-    assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.d, e.c, e.tt))
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c))
+    assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.d, e.c))
 
 
 def test_chain_unused_let():
-    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e, e.tt), e.tt)
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e))
     assert alpha_equal(dead_code_elimination(orig), e.e)
 
 
@@ -56,19 +56,17 @@ def test_recursion():
        f(2, 10000);
     """
     f = relay.Var("f")
-    n = relay.Var("n")
-    np = relay.Param(n, e.int32)
-    data = relay.Var("data")
-    datap = relay.Param(data, e.float32)
+    n = relay.Var("n", e.int32)
+    data = relay.Var("data", e.float32)
     funcbody = relay.If(equal(n, convert(0)), data, f(subtract(n, convert(1.0)), log(data)))
-    value = relay.Function([np, datap], e.float32, funcbody, [])
-    orig = relay.Let(f, funcbody, f(convert(2.0), convert(10000.0)), e.float32)
+    value = relay.Function([n, data], e.float32, funcbody, [])
+    orig = relay.Let(f, funcbody, f(convert(2.0), convert(10000.0)))
     assert alpha_equal(dead_code_elimination(orig), orig)
-    assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three, e.float32)), e.three)
+    assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three)), e.three)
 
 
 def test_op_let():
-    assert alpha_equal(dead_code_elimination(add(relay.Let(e.a, e.one, e.three, e.float32), e.two)), add(e.three, e.two))
+    assert alpha_equal(dead_code_elimination(add(relay.Let(e.a, e.one, e.three), e.two)), add(e.three, e.two))
 
 
 def test_if():
@@ -80,7 +78,7 @@ def test_tuple_get_item():
     t = relay.Var('t')
     g = relay.TupleGetItem(t, 0)
     assert alpha_equal(dead_code_elimination(g), g)
-    assert alpha_equal(dead_code_elimination(relay.TupleGetItem(relay.Let(e.a, e.one, t, e.float32), 0)), g)
+    assert alpha_equal(dead_code_elimination(relay.TupleGetItem(relay.Let(e.a, e.one, t), 0)), g)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_pass_free_vars.py b/tests/python/relay/test_pass_free_vars.py
index 989c9f8d25db..a4c745de10e0 100644
--- a/tests/python/relay/test_pass_free_vars.py
+++ b/tests/python/relay/test_pass_free_vars.py
@@ -3,16 +3,17 @@
 from tvm.relay.ir_pass import free_vars, free_type_vars
 
 def test_free_vars():
-    x = relay.Var("x")
+    ty = relay.TensorType([], "int32")
+    x = relay.Var("x", ty)
     fvx = free_vars(x)
     assert len(fvx) == 1
     assert fvx[0] == x
     v = relay.Constant(tvm.nd.array(10))
-    ty = relay.TensorType([], "int32")
-    let = relay.Let(x, v, x, ty)
+
+    let = relay.Let(x, v, x)
     fvx = free_vars(let)
     assert len(free_vars(let)) == 0
-    f = relay.Function([relay.Param(x, ty)], ty, x)
+    f = relay.Function([x], ty, x)
     assert len(free_vars(f)) == 0
 
 
@@ -29,9 +30,9 @@ def test_tuple():
 def test_free_type_vars():
     tp = relay.TypeParam("")
     ty = relay.TupleType([tp, relay.TensorType([], "int32")])
-    x = relay.Var("x")
+    x = relay.Var("x", ty)
     y = relay.Var("y")
-    let = relay.Let(x, y, x, ty)
+    let = relay.Let(x, y, x)
     fvl = free_vars(let)
     assert len(fvl) == 1
     assert fvl[0] == y

From f5b0b1671ab0f394b632f6c0cc287be2bd895c66 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Mon, 15 Oct 2018 06:43:52 +0530
Subject: [PATCH 222/529] [RUNTIME][DEBUG]Support remote debugging (#1866)

---
 python/tvm/contrib/debugger/debug_runtime.py  | 11 +++++---
 .../graph/debug/graph_runtime_debug.cc        | 13 +++++++++
 .../unittest/test_runtime_graph_debug.py      | 27 +++++++++++++++++++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index 986a7b167626..25d17d528bf2 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -5,8 +5,9 @@
 import shutil
 from datetime import datetime
 from tvm._ffi.base import string_types
-from tvm.contrib import graph_runtime
 from tvm._ffi.function import get_global_func
+from tvm.contrib import graph_runtime
+from tvm.rpc import base as rpc_base
 from . import debug_result
 
 _DUMP_ROOT_PREFIX = "tvmdbg_"
@@ -49,8 +50,12 @@ def create(graph_json_str, libmod, ctx, dump_root=None):
 
     ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
     if num_rpc_ctx == len(ctx):
-        raise NotSupportedError("Remote graph debugging is not supported.")
-
+        libmod = rpc_base._ModuleHandle(libmod)
+        try:
+            fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_debug.remote_create")
+        except ValueError:
+            raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \
+                             "config.cmake and rebuild TVM to enable debug mode")
     func_obj = fcreate(graph_json_str, libmod, *device_type_id)
     return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root)
 
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index 7faee4420f47..452a48408ccf 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -146,5 +146,18 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create")
         << args.num_args;
     *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args));
   });
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.remote_create")
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                  "graph_runtime.remote_create is "
+                                  "at least 4, but it has "
+                               << args.num_args;
+    void* mhandle = args[1];
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeDebugCreate(
+        args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
+  });
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
index ab6b729974df..b9d8b689cb9e 100644
--- a/tests/python/unittest/test_runtime_graph_debug.py
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -2,6 +2,8 @@
 import tvm
 import numpy as np
 import json
+from tvm import rpc
+from tvm.contrib import util
 from tvm.contrib.debugger import debug_runtime as graph_runtime
 
 def test_graph_simple():
@@ -70,7 +72,32 @@ def check_verify():
         #verify dump root delete after cleanup
         assert(not os.path.exists(directory))
 
+    def check_remote():
+        if not tvm.module.enabled("llvm"):
+            print("Skip because llvm is not enabled")
+            return
+        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
+        server = rpc.Server("localhost")
+        remote = rpc.connect(server.host, server.port)
+        temp = util.tempdir()
+        ctx = remote.cpu(0)
+        path_dso = temp.relpath("dev_lib.so")
+        mlib.export_library(path_dso)
+        remote.upload(path_dso)
+        mlib = remote.load_module("dev_lib.so")
+        try:
+            mod = graph_runtime.create(graph, mlib, remote.cpu(0))
+        except ValueError:
+            print("Skip because debug graph_runtime not enabled")
+            return
+        a = np.random.uniform(size=(n,)).astype(A.dtype)
+        mod.run(x=tvm.nd.array(a, ctx))
+        out = tvm.nd.empty((n,), ctx=ctx)
+        out = mod.get_output(0, out)
+        np.testing.assert_equal(out.asnumpy(), a + 1)
+
     check_verify()
+    check_remote()
 
 if __name__ == "__main__":
     test_graph_simple()

From c473e152fe8d3f69950488767dcfa99c7fc3b9f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Mon, 15 Oct 2018 09:46:21 -0700
Subject: [PATCH 223/529] [Relay] [Op] Squeeze (#1858)

---
 include/tvm/relay/attrs/transform.h  | 14 ++++++
 python/tvm/relay/op/transform.py     | 25 +++++++++-
 src/relay/op/tensor/transform.cc     | 72 +++++++++++++++++++++++++++-
 tests/python/relay/test_op_level3.py | 41 ++++++++++++++++
 4 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 278826bc825c..d304a59567ea 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -82,6 +82,20 @@ struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> {
   }
 };  // struct InitOpAttrs
 
+/*! \brief Attributes used in squeeze operators */
+struct SqueezeAttrs : public tvm::AttrsNode<SqueezeAttrs> {
+  Array<IndexExpr> axes;
+
+  TVM_DECLARE_ATTRS(SqueezeAttrs, "relay.attrs.SqueezeAttrs") {
+    TVM_ATTR_FIELD(axes)
+        .describe("The axes to squeeze in the input tensor."
+                  "If `axes = []`, all axis of dimension 1 get squeezed;"
+                  "Else, the dimension in axes get squeezed."
+                  "It is an error if an axes does not has dimension 1.")
+        .set_default(Array<IndexExpr>({}));
+  }
+};  // struct SqueezeAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 75fbba8461e3..c2036f509133 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -42,12 +42,35 @@ def transpose(data, axes=None):
     Returns
     -------
     result : relay.Expr
-        The reshaped result.
+        The transposed result.
     """
     axes = axes or []
     return _make.transpose(data, list(axes))
 
 
+def squeeze(data, axes=None):
+    """Squeeze axes in the array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axes : None or List[int]
+        Axes to remove.
+        If axes = [] or = None, remove all axis of dimensions 1.
+        Otherwise, remove all axis in axes.
+        If any axis in axes has dimension that does not equal 1, it is an error.
+
+    Returns
+    -------
+    result : relay.Expr
+        The squeezed result.
+    """
+    axes = axes or []
+    return _make.squeeze(data, list(axes))
+
+
 def reshape(data, newshape):
     """Reshapes the input array.
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index fb7b09fd3b46..956883476d09 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -80,8 +80,6 @@ RELAY_REGISTER_OP("expand_dims")
 .set_support_level(1)
 .add_type_rel("ExpandDims", ExpandDimsRel);
 
-/* relay.concatenate */
-
 TVM_REGISTER_NODE_TYPE(ConcatenateAttrs);
 
 bool ConcatenateRel(const Array<Type>& types,
@@ -633,5 +631,75 @@ Examples::
 .set_support_level(4)
 .add_type_rel("Where", WhereRel);
 
+Expr MakeSqueeze(Expr data,
+                 Array<IndexExpr> axes) {
+  auto attrs = make_node<SqueezeAttrs>();
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("squeeze");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.squeeze")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeSqueeze, args, rv);
+  });
+
+bool SqueezeRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* param = attrs.as<SqueezeAttrs>();
+  CHECK(param != nullptr);
+  std::vector<IndexExpr> result_shape;
+  // if axes is empty, squeeze all axes of dimension 1
+  if (param->axes.size() == 0) {
+    for (const auto& e : data->shape) {
+      const int64_t* axis_ptr = as_const_int(e);
+      CHECK(axis_ptr != nullptr) << "the axes attribute must be concrete";
+      if (*axis_ptr != 1) {
+        result_shape.push_back(e);
+      }
+    }
+  } else {
+    // pair up original shape with a boolean which control whether it will be in the final shape.
+    std::vector<std::pair<IndexExpr, bool> > original_shape;
+    for (const auto& e : data->shape) {
+      original_shape.push_back(std::pair<IndexExpr, bool>(e, true));
+    }
+    for (const auto& e : param->axes) {
+      const int64_t* axis_ptr = as_const_int(e);
+      CHECK(axis_ptr != nullptr);
+      original_shape.at(*axis_ptr).second = false;
+    }
+    for (const auto p : original_shape) {
+      if (p.second) {
+        result_shape.push_back(p.first);
+      } else {
+        const int64_t* axis_ptr = as_const_int(p.first);
+        CHECK(axis_ptr != nullptr) << "cannot get concrete shape of input tensor";
+        CHECK_EQ(*axis_ptr, 1) << "cannot squeeze axis with dimension not equal to 1";
+      }
+    }
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(result_shape, data->dtype));
+  return true;
+}
+
+RELAY_REGISTER_OP("squeeze")
+.describe(R"code(Squeeze the input tensor at the dimensions given by axes
+
+- **data**: The input data to the operator.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Squeeze", SqueezeRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 7d949b21026b..13ab483f936c 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -6,6 +6,7 @@
 from tvm.relay.ir_pass import infer_type
 from tvm.relay.ir_builder import IRBuilder, func_type
 from tvm.relay.env import Environment
+from nose.tools import raises
 
 def test_zeros_ones():
     for op in [relay.zeros, relay.ones]:
@@ -67,6 +68,44 @@ def test_transpose_infer_type():
         (t, n, 100), "float32")
 
 
+def test_squeeze_default_axes_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = 1, 4, 1
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.squeeze(x))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (4,), "float32")
+
+
+def test_squeeze_axes_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = 1, 4, 1
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.squeeze(x, axes=(2,)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, 4), "float32")
+
+
+@raises(tvm._ffi.base.TVMError)
+def test_squeeze_bad_axes_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = 1, 4, 1
+    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.squeeze(x, axes=(1,)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+
+
 def test_reshape_infer_type():
     ib = relay.ir_builder.IRBuilder()
     n, t, d1, d2 = tvm.var("n"), tvm.var("t"), 100, 20
@@ -181,3 +220,5 @@ def test_full_like():
     test_take_infer_type()
     test_full()
     test_full_like()
+    test_squeeze_axes_infer_type()
+    test_squeeze_default_axes_infer_type()

From 5d778467fdf41f1cb1ab715309617d6619ee8a96 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Mon, 15 Oct 2018 12:16:51 -0700
Subject: [PATCH 224/529] [Relay][OP] MultiboxPrior (#1882)

* Relay MultiboxPrior Operator

* Fix lint

* Fix build

* Add test for default args
---
 include/tvm/relay/attrs/vision.h       | 28 ++++++++++
 python/tvm/relay/op/vision/__init__.py |  2 +
 python/tvm/relay/op/vision/multibox.py | 38 ++++++++++++++
 src/relay/op/vision/multibox_op.cc     | 72 ++++++++++++++++++++++++++
 tests/python/relay/test_op_level4.py   | 32 ++++++++++++
 5 files changed, 172 insertions(+)
 create mode 100644 python/tvm/relay/op/vision/multibox.py
 create mode 100644 src/relay/op/vision/multibox_op.cc

diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
index a2f7360f1f71..60ee4cb88e43 100644
--- a/include/tvm/relay/attrs/vision.h
+++ b/include/tvm/relay/attrs/vision.h
@@ -12,6 +12,34 @@
 namespace tvm {
 namespace relay {
 
+/*! \brief Attributes used in multibox_prior operators */
+struct MultiBoxPriorAttrs : public tvm::AttrsNode<MultiBoxPriorAttrs> {
+  Array<IndexExpr> sizes;
+  Array<IndexExpr> ratios;
+  Array<IndexExpr> steps;
+  Array<IndexExpr> offsets;
+  bool clip;
+
+  TVM_DECLARE_ATTRS(MultiBoxPriorAttrs, "relay.attrs.MultiBoxPriorAttrs") {
+    TVM_ATTR_FIELD(sizes)
+      .set_default(Array<IndexExpr>({static_cast<float>(1.0)}))
+      .describe("List of sizes of generated MultiBoxPriores.");
+    TVM_ATTR_FIELD(ratios)
+      .set_default(Array<IndexExpr>({static_cast<float>(1.0)}))
+      .describe("List of aspect ratios of generated MultiBoxPriores.");
+    TVM_ATTR_FIELD(steps)
+      .set_default(Array<IndexExpr>({static_cast<float>(-1.0),
+                                     static_cast<float>(-1.0)}))
+      .describe("Priorbox step across y and x, -1 for auto calculation.");
+    TVM_ATTR_FIELD(offsets)
+      .set_default(Array<IndexExpr>({static_cast<float>(0.5),
+                                     static_cast<float>(0.5)}))
+      .describe("Priorbox center offsets, y and x respectively.");
+    TVM_ATTR_FIELD(clip).set_default(false)
+      .describe("Whether to clip out-of-boundary boxes.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_VISION_H_
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
index 3569093b95e6..b3010d2d5310 100644
--- a/python/tvm/relay/op/vision/__init__.py
+++ b/python/tvm/relay/op/vision/__init__.py
@@ -1,3 +1,5 @@
 # pylint: disable=wildcard-import
 """Vision network related operators."""
 from __future__ import absolute_import as _abs
+
+from .multibox import *
diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py
new file mode 100644
index 000000000000..9b7483eec5ab
--- /dev/null
+++ b/python/tvm/relay/op/vision/multibox.py
@@ -0,0 +1,38 @@
+"""Multibox operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def multibox_prior(data,
+                   sizes=(1.0,),
+                   ratios=(1.0,),
+                   steps=(-1.0, -1.0),
+                   offsets=(0.5, 0.5),
+                   clip=False):
+    """Generate prior(anchor) boxes from data, sizes and ratios.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data tensor.
+
+    sizes : tuple of float, optional
+        Tuple of sizes for anchor boxes.
+
+    ratios : tuple of float, optional
+        Tuple of ratios for anchor boxes.
+
+    steps : Tuple of float, optional
+        Priorbox step across y and x, -1 for auto calculation.
+
+    offsets : tuple of int, optional
+        Priorbox center offsets, y and x respectively.
+
+    clip : boolean, optional
+        Whether to clip out-of-boundary boxes.
+
+    Returns
+    -------
+    out : relay.Expr
+        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
+    """
+    return _make.multibox_prior(data, sizes, ratios, steps, offsets, clip)
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
new file mode 100644
index 000000000000..63e75c0bb213
--- /dev/null
+++ b/src/relay/op/vision/multibox_op.cc
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file multibox_op.cc
+ * \brief Multibox related operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/vision.h>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(MultiBoxPriorAttrs);
+
+bool MultiboxPriorRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const MultiBoxPriorAttrs* param = attrs.as<MultiBoxPriorAttrs>();
+  const auto& dshape = data->shape;
+  CHECK_EQ(dshape.size(), 4) << "Input data should be 4D: "
+      "[batch, channel, height, width]";
+  IndexExpr in_height = dshape[2];
+  IndexExpr in_width = dshape[3];
+  int num_sizes = static_cast<int>(param->sizes.size());
+  int num_ratios = static_cast<int>(param->ratios.size());
+
+  // since input sizes are same in each batch, we could share MultiBoxPrior
+  std::vector<IndexExpr> oshape(
+    {1, in_height * in_width * (num_sizes + num_ratios - 1), 4});
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeMultiBoxPrior(Expr data,
+                       Array<IndexExpr> sizes,
+                       Array<IndexExpr> ratios,
+                       Array<IndexExpr> steps,
+                       Array<IndexExpr> offsets,
+                       bool clip) {
+  auto attrs = make_node<MultiBoxPriorAttrs>();
+  attrs->sizes = std::move(sizes);
+  attrs->ratios = std::move(ratios);
+  attrs->steps = std::move(steps);
+  attrs->offsets = std::move(offsets);
+  attrs->clip = clip;
+  static const Op& op = Op::Get("vision.multibox_prior");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.vision._make.multibox_prior")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 6>(MakeMultiBoxPrior, args, rv);
+});
+
+
+RELAY_REGISTER_OP("vision.multibox_prior")
+.describe(R"doc("Generate prior(anchor) boxes from data, sizes and ratios."
+)doc" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(4)
+.add_type_rel("MultiBoxPrior", MultiboxPriorRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 995e15fb9760..9a7b2a10b5b7 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -124,6 +124,37 @@ def test_binary_broadcast():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
+def test_multibox_prior():
+    sizes = (0.3, 1.5, 0.7)
+    ratios = (1.3, 2.4)
+    steps = (2.0, 1.5)
+    offsets = (0.2, 0.3)
+    clip = True
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 3, 56, 56
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    with ib.function(x) as func:
+        ib.ret(relay.vision.multibox_prior(x.var, sizes, ratios,
+                                           steps, offsets, clip))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, h * w * (len(sizes) + len(ratios) - 1), 4), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 24, 32, 32
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    with ib.function(x) as func:
+        ib.ret(relay.vision.multibox_prior(x.var))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, h * w, 4), "float32")
 
 def test_where():
     ib = relay.ir_builder.IRBuilder()
@@ -144,3 +175,4 @@ def test_where():
     test_binary_op()
     test_binary_broadcast_op()
     test_where()
+    test_multibox_prior()

From 8edf497d7e806b8abe0e75633d22e313ed32b1e5 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 15 Oct 2018 15:36:43 -0700
Subject: [PATCH 225/529] [LANG][ATTRS] Enable deep equality comparison and
 hash of Attrs (#1903)

---
 include/tvm/attrs.h                           | 156 +++++++++++++++++-
 src/api/api_pass.cc                           |   9 +
 src/lang/attr_functor.h                       |  76 +++++++++
 src/lang/attrs.cc                             | 155 +++++++++++++++++
 tests/python/relay/test_op_level4.py          |  32 ----
 tests/python/relay/test_op_level5.py          |  36 ++++
 .../unittest/test_pass_attrs_hash_equal.py    |  33 ++++
 7 files changed, 462 insertions(+), 35 deletions(-)
 create mode 100644 src/lang/attr_functor.h
 create mode 100644 tests/python/unittest/test_pass_attrs_hash_equal.py

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 095e05b3f95f..fbb067dd6dcf 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -27,8 +27,10 @@
 #ifndef TVM_ATTRS_H_
 #define TVM_ATTRS_H_
 
+#include <dmlc/common.h>
 #include <unordered_map>
 #include <vector>
+#include <functional>
 #include <type_traits>
 #include <string>
 #include "ir.h"
@@ -129,8 +131,8 @@ class BaseAttrsNode : public Node {
    */
   inline void PrintDocString(std::ostream &os) const;  // NOLINT(*)
   /*!
-   * \brief Get the field information about the
-   * \note This function throws when the required a field is not present.
+   * \brief Get the field information
+   * \return The fields in the Attrs.
    */
   TVM_DLL virtual Array<AttrFieldInfo> ListFieldInfo() const = 0;
   /*!
@@ -138,9 +140,20 @@ class BaseAttrsNode : public Node {
    * \param kwargs The key value pairs for initialization.
    *        [key0, value0, key1, value1, ..., key_n, value_n]
    * \param allow_unknown Whether allow additional unknown fields.
-   * \note This function throws when the required a field is not present.
+   * \note This function throws when the required field is not present.
    */
   TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0;
+  /*!
+   * \brief Whether this attribute's content equals to another node.
+   * \param other The pointer to another node.
+   * \return The comparison result.
+   */
+  TVM_DLL virtual bool ContentEqual(const Node* other) const = 0;
+  /*!
+   * \brief Content aware hash.
+   * \return the hash result.
+   */
+  TVM_DLL virtual size_t ContentHash() const = 0;
 
   static constexpr const char* _type_key = "Attrs";
   TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node);
@@ -188,11 +201,93 @@ class DictAttrsNode : public BaseAttrsNode {
   void VisitAttrs(AttrVisitor* v) final;
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
   Array<AttrFieldInfo> ListFieldInfo() const final;
+  bool ContentEqual(const Node* other) const final;
+  size_t ContentHash() const final;
   // type info
   static constexpr const char* _type_key = "DictAttrs";
   TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
 };
 
+/*!
+ * \brief Content-aware Equality comparator for attrs.
+ *
+ * This comparator will recursively deep compare the following Attributes.
+ *
+ * - IntImm, UIntImm, FloatImm, StringImm
+ * - Any subclass of BaseAttrsNode
+ * - Array of Attributes.
+ * - Map from string to Attributes.
+ */
+class AttrsEqual {
+ public:
+  bool operator()(const double& lhs, const double& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int64_t& lhs, const int64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const uint64_t& lhs, const uint64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int& lhs, const int& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const bool& lhs, const bool& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const std::string& lhs, const std::string& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const Type& lhs, const Type& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const NodeRef& lhs, const NodeRef& rhs) const {
+    return AttrsEqual::Equal(lhs, rhs);
+  }
+
+  // comparator of NodeRef types.
+  static TVM_DLL bool Equal(const NodeRef& lhs, const NodeRef& rhs);
+};
+
+/*!
+ * \brief Content-aware hash function.
+ *
+ * This hash functor will recursively hash the content of the Attributes.
+ * It is guaranteed that if AttrsEqual(a, b) == true, then AttrsHash(a) == AttrsHash(b);
+ */
+class AttrsHash {
+ public:
+  size_t operator()(const double& value) const {
+    return std::hash<double>()(value);
+  }
+  size_t operator()(const int64_t& value) const {
+    return std::hash<int64_t>()(value);
+  }
+  size_t operator()(const uint64_t& value) const {
+    return std::hash<uint64_t>()(value);
+  }
+  size_t operator()(const int& value) const {
+    return std::hash<int>()(value);
+  }
+  size_t operator()(const bool& value) const {
+    return std::hash<bool>()(value);
+  }
+  size_t operator()(const std::string& value) const {
+    return std::hash<std::string>()(value);
+  }
+  size_t operator()(const Type& value) const {
+    return std::hash<int>()(
+        static_cast<int>(value.code()) |
+        (static_cast<int>(value.bits()) << 8) |
+        (static_cast<int>(value.lanes()) << 16));
+  }
+  size_t operator()(const NodeRef& value) const {
+    return AttrsHash::Hash(value);
+  }
+  // hash function of the attribute and attribute fields.
+  static TVM_DLL size_t Hash(const NodeRef& lhs);
+};
+
 // Namespace containing detail implementations
 namespace detail {
 using runtime::TVMArgValue;
@@ -234,6 +329,44 @@ class AttrNormalVisitor {
   AttrVisitor* visitor_;
 };
 
+// Wrapper for normal visitor.
+class AttrsEqualVisitor {
+ public:
+  bool result_{true};
+  // constructor
+  AttrsEqualVisitor(const Node* lhs, const Node* rhs)
+      : lhs_(lhs), rhs_(rhs) {
+  }
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* lhs_value) {
+    if (!result_) return AttrNopEntry();
+    const T* rhs_value =
+        reinterpret_cast<const T*>(
+            reinterpret_cast<const char*>(rhs_) +
+            (reinterpret_cast<const char*>(lhs_value) -
+             reinterpret_cast<const char*>(lhs_)));
+    if (!AttrsEqual()(*lhs_value, *rhs_value)) {
+      result_ = false;
+    }
+    return AttrNopEntry();
+  }
+
+ private:
+  const Node* lhs_;
+  const Node* rhs_;
+};
+
+class AttrsHashVisitor {
+ public:
+  size_t result_{0};
+
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* value) {
+    result_ = dmlc::HashCombine(result_, AttrsHash()(*value));
+    return AttrNopEntry();
+  }
+};
+
 // helper entry that does initialization, set default.
 template<typename T>
 struct AttrInitEntry {
@@ -596,6 +729,23 @@ class AttrsNode : public BaseAttrsNode {
     return visitor.fields_;
   }
 
+  bool ContentEqual(const Node* other) const final {
+    DerivedType* pself = self();
+    if (pself == other) return true;
+    if (other == nullptr) return false;
+    if (pself->type_index() != other->type_index()) return false;
+    detail::AttrsEqualVisitor visitor(pself, other);
+    self()->__VisitAttrs__(visitor);
+    return visitor.result_;
+  }
+
+  size_t ContentHash() const final {
+    detail::AttrsHashVisitor visitor;
+    visitor.result_ = std::hash<std::string>()(this->type_key());
+    self()->__VisitAttrs__(visitor);
+    return visitor.result_;
+  }
+
  private:
   DerivedType* self() const {
     return const_cast<DerivedType*>(
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index a0048a2ed771..66e4529acaf1 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -5,6 +5,7 @@
  */
 #include <tvm/expr.h>
 #include <tvm/ir.h>
+#include <tvm/attrs.h>
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
@@ -65,6 +66,14 @@ TVM_REGISTER_API("ir_pass.Equal")
     }
   });
 
+
+TVM_REGISTER_API("ir_pass.AttrsEqual")
+.set_body_typed<bool(const NodeRef&, const NodeRef&)>(AttrsEqual::Equal);
+
+TVM_REGISTER_API("ir_pass.AttrsHash")
+.set_body_typed<int64_t(const NodeRef&)>(AttrsHash::Hash);
+
+
 TVM_REGISTER_API("ir_pass.ExprUseVar")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = ExprUseVar(args[0].operator Expr(), args[1].operator Var());
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
new file mode 100644
index 000000000000..0cb748bbd496
--- /dev/null
+++ b/src/lang/attr_functor.h
@@ -0,0 +1,76 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file attr_functor.h
+ * \brief A way to define arbitrary function signature
+ *        with dispatch on common attributes.
+ *
+ * Common attributes include:
+ *  - int, float, str constants
+ *  - array of attributes
+ *  - map of attributes
+ */
+#ifndef TVM_LANG_ATTR_FUNCTOR_H_
+#define TVM_LANG_ATTR_FUNCTOR_H_
+
+namespace tvm {
+
+template <typename FType>
+class AttrFunctor;
+
+#define ATTR_FUNCTOR_DISPATCH(OP)                                       \
+  vtable.template set_dispatch<OP>(                                     \
+      [](const NodeRef& n, TSelf* self, Args... args) {                 \
+        return self->Visit_(static_cast<const OP*>(n.node_.get()),      \
+                            std::forward<Args>(args)...);               \
+      });                                                               \
+
+// A functor for common attribute information.
+template <typename R, typename... Args>
+class AttrFunctor<R(const NodeRef& n, Args...)> {
+ private:
+  using TSelf = AttrFunctor<R(const NodeRef& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R Visit(const NodeRef& n, Args... args) {
+    static FType vtable = InitVTable();
+    if (vtable.can_dispatch(n)) {
+      return vtable(n, this, std::forward<Args>(args)...);
+    } else {
+      return VisitDefault_(n, std::forward<Args>(args)...);
+    }
+  }
+  virtual R Visit_(const ArrayNode* op, Args... args) = 0;
+  virtual R Visit_(const StrMapNode* op, Args... args) = 0;
+  virtual R Visit_(const ir::IntImm* op, Args... args) = 0;
+  virtual R Visit_(const ir::UIntImm* op, Args... args) = 0;
+  virtual R Visit_(const ir::FloatImm* op, Args... args) = 0;
+  virtual R Visit_(const ir::StringImm* op, Args... args) = 0;
+  virtual R VisitDefault_(const NodeRef& n, Args... args) = 0;
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    using namespace ir;
+    FType vtable;
+    // Set dispatch
+    ATTR_FUNCTOR_DISPATCH(StrMapNode);
+    ATTR_FUNCTOR_DISPATCH(ArrayNode);
+    ATTR_FUNCTOR_DISPATCH(IntImm);
+    ATTR_FUNCTOR_DISPATCH(UIntImm);
+    ATTR_FUNCTOR_DISPATCH(FloatImm);
+    ATTR_FUNCTOR_DISPATCH(StringImm);
+    return vtable;
+  }
+};
+
+}  // namespace tvm
+#endif  // TVM_LANG_ATTR_FUNCTOR_H_
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 12ebbff4be74..091ecd2700d8 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -3,6 +3,7 @@
  * \file attrs.cc
  */
 #include <tvm/attrs.h>
+#include "attr_functor.h"
 
 namespace tvm {
 
@@ -44,4 +45,158 @@ TVM_REGISTER_NODE_TYPE(DictAttrsNode);
 
 TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
 
+
+using namespace ir;
+
+class AttrsEqualChecker :
+      public AttrFunctor<bool(const NodeRef&, const NodeRef&)> {
+ public:
+  bool Check(const NodeRef& lhs, const NodeRef& rhs) {
+    if (!equal_) return false;
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    if (!this->Visit(lhs, rhs)) {
+      equal_ = false;
+    }
+    return equal_;
+  }
+
+  bool VisitDefault_(const NodeRef& lhs, const NodeRef& other) final {
+    if (lhs->derived_from<BaseAttrsNode>()) {
+      return static_cast<const BaseAttrsNode*>(lhs.get())->ContentEqual(other.get());
+    }
+    return lhs.same_as(other);
+  }
+
+  bool Visit_(const IntImm* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<IntImm>()) {
+      return lhs->value == rhs->value;
+    }
+    return false;
+  }
+
+  bool Visit_(const UIntImm* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<UIntImm>()) {
+      return lhs->value == rhs->value;
+    }
+    return false;
+  }
+
+  bool Visit_(const FloatImm* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<FloatImm>()) {
+      return lhs->value == rhs->value;
+    }
+    return false;
+  }
+
+  bool Visit_(const StringImm* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<StringImm>()) {
+      return lhs->value == rhs->value;
+    }
+    return false;
+  }
+
+  bool Visit_(const ArrayNode* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<ArrayNode>()) {
+      if (rhs->data.size() != lhs->data.size()) return false;
+      for (size_t  i = 0; i < lhs->data.size(); ++i) {
+        if (!Check(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false;
+      }
+    }
+    return true;
+  }
+
+  bool Visit_(const StrMapNode* lhs, const NodeRef& other) final {
+    if (const auto* rhs = other.as<StrMapNode>()) {
+      if (rhs->data.size() != lhs->data.size()) return false;
+      for (const auto& kv : lhs->data) {
+        auto it = rhs->data.find(kv.first);
+        if (it == rhs->data.end()) return false;
+        if (!Check(NodeRef(kv.second), NodeRef(it->second))) return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  bool equal_{true};
+};
+
+class AttrContentHasher :
+      public AttrFunctor<void(const NodeRef&)> {
+ public:
+  size_t result_{0};
+
+  void VisitDefault_(const NodeRef& value) final {
+    if (value->derived_from<BaseAttrsNode>()) {
+      Update(static_cast<const BaseAttrsNode*>(value.get())->ContentHash());
+    } else {
+      Update(NodeHash()(value));
+    }
+  }
+
+  void Visit_(const IntImm* op) final {
+    Update(std::hash<int64_t>()(op->value));
+  }
+
+  void Visit_(const UIntImm* op) final {
+    Update(std::hash<uint64_t>()(op->value));
+  }
+
+  void Visit_(const FloatImm* op) final {
+    Update(std::hash<double>()(op->value));
+  }
+
+  void Visit_(const StringImm* op) final {
+    Update(std::hash<std::string>()(op->value));
+  }
+
+  void Visit_(const ArrayNode* op) final {
+    Update(op->data.size());
+    for (size_t  i = 0; i < op->data.size(); ++i) {
+      this->Visit(NodeRef(op->data[i]));
+    }
+  }
+
+  void Visit_(const StrMapNode* lhs) final {
+    using Entry = std::pair<std::string, NodePtr<Node> >;
+    std::vector<Entry> data(lhs->data.begin(), lhs->data.end());
+    std::sort(data.begin(), data.end(), [](const Entry& a, const Entry& b) {
+        return a.first < b.first;
+      });
+    for (const Entry& kv : data) {
+      Update(std::hash<std::string>()(kv.first));
+      this->Visit(NodeRef(kv.second));
+    }
+  }
+
+  void Update(size_t value) {
+    result_ = dmlc::HashCombine(result_, value);
+  }
+};
+
+bool AttrsEqual::Equal(const NodeRef& lhs, const NodeRef& rhs) {
+  if (lhs.same_as(rhs)) return true;
+  AttrsEqualChecker checker;
+  return checker.Check(lhs, rhs);
+}
+
+size_t AttrsHash::Hash(const NodeRef& node) {
+  if (!node.defined()) return 0;
+  AttrContentHasher hasher;
+  hasher.Visit(node);
+  return hasher.result_;
+}
+
+size_t DictAttrsNode::ContentHash() const {
+  return AttrsHash()(this->dict);
+}
+
+bool DictAttrsNode::ContentEqual(const Node* other) const {
+  if (this == other) return true;
+  if (other == nullptr) return false;
+  if (this->type_index() != other->type_index()) return false;
+  return AttrsEqual()(this->dict, static_cast<const DictAttrsNode*>(other)->dict);
+}
+
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 9a7b2a10b5b7..0fe5a115030a 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -124,38 +124,6 @@ def test_binary_broadcast():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
-def test_multibox_prior():
-    sizes = (0.3, 1.5, 0.7)
-    ratios = (1.3, 2.4)
-    steps = (2.0, 1.5)
-    offsets = (0.2, 0.3)
-    clip = True
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c, h, w = tvm.var("n"), 3, 56, 56
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
-    with ib.function(x) as func:
-        ib.ret(relay.vision.multibox_prior(x.var, sizes, ratios,
-                                           steps, offsets, clip))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (1, h * w * (len(sizes) + len(ratios) - 1), 4), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c, h, w = tvm.var("n"), 24, 32, 32
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
-    with ib.function(x) as func:
-        ib.ret(relay.vision.multibox_prior(x.var))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (1, h * w, 4), "float32")
-
 def test_where():
     ib = relay.ir_builder.IRBuilder()
     cond = ib.param("cond", relay.TensorType((3, 4), "float32"))
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 8d871e9ef4f5..e04bd9bab91a 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -25,5 +25,41 @@ def test_resize_infer_type():
     ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType((n, c, 100, 200), "int8")
 
+
+
+def test_multibox_prior():
+    sizes = (0.3, 1.5, 0.7)
+    ratios = (1.3, 2.4)
+    steps = (2.0, 1.5)
+    offsets = (0.2, 0.3)
+    clip = True
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 3, 56, 56
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    with ib.function(x) as func:
+        ib.ret(relay.vision.multibox_prior(x, sizes, ratios,
+                                           steps, offsets, clip))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, h * w * (len(sizes) + len(ratios) - 1), 4), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c, h, w = tvm.var("n"), 24, 32, 32
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    with ib.function(x) as func:
+        ib.ret(relay.vision.multibox_prior(x))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType(
+        (1, h * w, 4), "float32")
+
+
 if __name__ == "__main__":
     test_resize_infer_type()
+    test_multibox_prior()
diff --git a/tests/python/unittest/test_pass_attrs_hash_equal.py b/tests/python/unittest/test_pass_attrs_hash_equal.py
new file mode 100644
index 000000000000..23f0e6374064
--- /dev/null
+++ b/tests/python/unittest/test_pass_attrs_hash_equal.py
@@ -0,0 +1,33 @@
+import tvm
+
+def test_attrs_equal():
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    y = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    z = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4,1))
+    assert tvm.ir_pass.AttrsEqual(x, y)
+    assert not tvm.ir_pass.AttrsEqual(x, z)
+
+    dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert not tvm.ir_pass.AttrsEqual(dattr, x)
+    dattr2 = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert tvm.ir_pass.AttrsEqual(dattr, dattr2)
+
+    assert tvm.ir_pass.AttrsEqual({"x": x}, {"x": y})
+    # array related checks
+    assert tvm.ir_pass.AttrsEqual({"x": [x, x]}, {"x": [y, x]})
+    assert not tvm.ir_pass.AttrsEqual({"x": [x, 1]}, {"x": [y, 2]})
+
+
+def test_attrs_hash():
+    fhash = tvm.ir_pass.AttrsHash
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    y = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    assert fhash({"x": x}) == fhash({"x": y})
+    assert fhash({"x": x}) != fhash({"x": [y, 1]})
+    assert fhash({"x": [x, 1]}) == fhash({"x": [y, 1]})
+    assert fhash({"x": [x, 2]}) == fhash({"x": [y, 2]})
+
+
+if __name__ == "__main__":
+    test_attrs_equal()
+    test_attrs_hash()

From 5a7a3e342680f695a07cdfcaef8afccdd687553a Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Mon, 15 Oct 2018 16:17:46 -0700
Subject: [PATCH 226/529] [Relay] remove redundant test cases in
 test_op_level4.py (#1905)

---
 tests/python/relay/test_op_level4.py | 35 ++--------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 0fe5a115030a..11c0be67ca73 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -13,37 +13,6 @@ def assert_has_type(expr, typ, env=Environment({})):
         raise RuntimeError("Type mismatch %s vs %s" % (
             checked_type, typ))
 
-def test_cmp_type():
-    for op in (relay.greater,
-               relay.greater_equal,
-               relay.less,
-               relay.less_equal,
-               relay.equal,
-               relay.not_equal):
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((10, 4), "float32"))
-        y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
-        with ib.function(x, y) as func:
-            ib.ret(op(x, y))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
-
-
-def test_binary_broadcast():
-    for op in [relay.right_shift,
-               relay.left_shift,
-               relay.maximum]:
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((10, 4), "int32"))
-        y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
-        with ib.function(x, y) as func:
-            ib.ret(op(x, y))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
 def test_binary_op():
     def check_binary_op(opfunc):
@@ -138,9 +107,9 @@ def test_where():
 
 
 if __name__ == "__main__":
-    test_cmp_type()
-    test_binary_broadcast()
     test_binary_op()
     test_binary_broadcast_op()
+    test_cmp_type()
+    test_binary_broadcast()
     test_where()
     test_multibox_prior()

From 33f68d4ac75c5e98c7fa777e41d604b554ee8173 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 15 Oct 2018 17:10:20 -0700
Subject: [PATCH 227/529] [Relay][Op] Dropout and batch_norm (#1870)

---
 docs/langref/relay_op.rst            |   2 +
 include/tvm/relay/attrs/nn.h         |  35 ++++++
 python/tvm/relay/ir_builder.py       |  28 +++++
 python/tvm/relay/op/nn/nn.py         | 102 ++++++++++++++++
 src/relay/op/nn/nn.cc                | 172 +++++++++++++++++++++++++++
 tests/python/relay/test_op_level1.py |  89 ++++++++++++++
 6 files changed, 428 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 47cab696a8e1..0b937f6636bf 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -38,6 +38,8 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.tanh
    tvm.relay.sigmoid
    tvm.relay.nn.relu
+   tvm.relay.nn.dropout
+   tvm.relay.nn.batch_norm
 
 
 **Level 2: Convolutions**
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index de0da7477a35..0be85d3d1bb9 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -237,6 +237,41 @@ struct PadAttrs : public tvm::AttrsNode<PadAttrs> {
   }
 };
 
+/*! \brief Attributes used in dropout operator */
+struct DropoutAttrs : public tvm::AttrsNode<DropoutAttrs> {
+  double rate;
+  TVM_DECLARE_ATTRS(DropoutAttrs, "relay.attrs.DropoutAttrs") {
+    TVM_ATTR_FIELD(rate)
+      .describe("Fraction of the input that gets dropped out during training time")
+      .set_default(0.5);
+  }
+};  // struct DropoutAttrs
+
+/*! \brief Attributes used in batch_norm operator */
+struct BatchNormAttrs : public tvm::AttrsNode<BatchNormAttrs> {
+  int axis;
+  double epsilon;
+  bool center;
+  bool scale;
+
+  TVM_DECLARE_ATTRS(BatchNormAttrs, "relay.attrs.BatchNormAttrs") {
+    TVM_ATTR_FIELD(axis)
+      .describe("Specify which shape axis denotes the channel.")
+      .set_default(1);
+    TVM_ATTR_FIELD(epsilon)
+      .describe("Small float added to variance to avoid dividing by zero")
+      .set_default(1e-5);
+    TVM_ATTR_FIELD(center)
+      .describe("If True, add offset of beta to normalized tensor. If False, beta is ignored")
+      .set_default(true);
+    TVM_ATTR_FIELD(scale)
+      .describe("If True, multiply by gamma. If False, gamma is not used. "
+                "When the next layer is piecewise linear (also, e.g., nn.relu), "
+                "this can be disabled since the scaling will be done by the next layer.")
+      .set_default(true);
+  }
+};  // struct BatchNormAttrs
+
 /*! \brief Attributes for LRN operator */
 struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
   IndexExpr size;
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
index a429aea7d5ea..42a29b29b7d7 100644
--- a/python/tvm/relay/ir_builder.py
+++ b/python/tvm/relay/ir_builder.py
@@ -11,6 +11,32 @@
 from .env import Environment
 
 
+class TupleWrapper(tvm._ffi.node.NodeGeneric):
+    """TupleWrapper.
+
+    This class is a Python wrapper for a Relay tuple of known size.
+    It allows for accessing the fields of the Relay tuple as though
+    it were a Python tuple.
+    """
+
+    def __init__(self, tuple_value, size):
+        self.tuple_value = tuple_value
+        self.size = size
+
+
+    def asnode(self):
+        """Returns the underlying Relay tuple if this wrapper is passed
+        as an argument to an FFI function."""
+
+        return self.tuple_value
+
+    def __getitem__(self, key):
+        return self.tuple_value.fields[key]
+
+    def __len__(self):
+        return len(self.tuple_value.fields)
+
+
 def _convert_to_value(arg, ctxt=tvm.cpu(0)):
     # type: (Any, tvm.Context) -> tvm.nd.NDArray
     """Convert Python values into the appropriate types
@@ -61,6 +87,8 @@ def convert(arg):
         return relay.Tuple([convert(el) for el in arg])
     elif isinstance(arg, PartialFunc):
         return arg.to_func()
+    elif isinstance(arg, tvm._ffi.node.NodeGeneric):
+        return arg.asnode()
     else:
         value = _convert_to_value(arg)
         return Constant(value)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index e95e3e9b715d..313c26da0234 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -1,5 +1,6 @@
 """Neural network operations."""
 from __future__ import absolute_import as _abs
+from tvm.relay.ir_builder import TupleWrapper
 from . import _make
 
 
@@ -484,6 +485,7 @@ def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
 
     .. math::
         (data / (bias + (alpha * sum_data ^2 /size))^beta)
+
     Parameters
     ----------
     data : relay.Expr
@@ -535,3 +537,103 @@ def l2_normalize(data, eps, axis=None):
         The computed result.
     """
     return _make.l2_normalize(data, eps, axis)
+
+def dropout(data, rate=0.5):
+    """Applies the dropout operation to the input array.
+
+    During training, each element of the input is set to zero with
+    probability ``p``. The whole array is rescaled by ``1/(1-p)``
+    to keep the expected sum of the input unchanged.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    rate : float, optional (default=0.5)
+        The probability for an element to be reset to 0.
+
+    Returns
+    -------
+    result : relay.Tuple([relay.Expr, relay.Expr])
+        The first member of the tuple is the result of dropping elements from ``data``
+        and rescaling. The second member is a "mask" tensor, which is of the same
+        shape and data type as ``data`` and, for each element in ``data``, is 1.0
+        if the element was not dropped and 0.0 if it was.
+    """
+    result = _make.dropout(data, rate)
+    return TupleWrapper(result, 2)
+
+def batch_norm(data, gamma, beta, moving_mean, moving_var,
+               axis=1, epsilon=1e-5, center=True, scale=True):
+    r"""
+    Batch normalization layer (Ioffe and Szegedy, 2014).
+    Normalizes the input at each batch, i.e. applies a transformation
+    that maintains the mean activation close to 0 and the activation
+    standard deviation close to 1.
+
+    .. math::
+
+        data\_mean[i] = mean(data[:,i,:,...]) \\
+        data\_var[i] = var(data[:,i,:,...])
+
+    Then compute the normalized output, which has the same shape as input, as following:
+
+    .. math::
+
+        out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}}
+            * gamma[i] + beta[i]
+
+    Both *mean* and *var* returns a scalar by treating the input as a vector.
+
+    Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+    have shape *(k,)*.
+
+    Besides the inputs and the outputs, this operator accepts two auxiliary
+    states, ``moving_mean`` and ``moving_var``, which are *k*-length
+    vectors. They are global statistics for the whole dataset, which are updated by::
+
+    moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+    moving_var = moving_var * momentum + data_var * (1 - momentum)
+
+    The parameter ``axis`` specifies which axis of the input shape denotes
+    the 'channel' (separately normalized groups).  The default is 1.
+    Specifying -1 sets the channel axis to be the last item in the input shape.
+
+    .. note::
+
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        Input to which batch_norm will be applied.
+    gamma : relay.Expr
+        The gamma scale factor.
+    beta : relay.Expr
+        The beta offset factor.
+    moving_mean : relay.Expr
+        Running mean of input,
+    moving_var : relay.Expr
+        Running variance of input.
+    axis : int, optional, default=1
+        Specify along which shape axis the channel is specified.
+    epsilon : double, optional, default=1e-5
+        Small float added to variance to avoid diving by zero.
+    center : boolean, optional, default=True
+        If True, add offset of beta to normalized tensor, If False,
+        beta is ignored.
+    scale : boolean, optional, default=True
+        If true, multiply by gamma. If False, gamma is not used.
+        When the next layer is piecewise linear (also e.g. nn.relu),
+        this can be disabled since the scalingwill be done by the next layer.
+
+    Returns
+    -------
+    result : relay.Tuple([relay.Expr, relay.Expr, relay.Expr])
+        Tuple of normed data (same shape as input), new running mean (k-length vector),
+        and new running variance (k-length vector)
+    """
+    result = _make.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                              axis, epsilon, center, scale)
+    return TupleWrapper(result, 3)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index f2439b9fb7ca..23dfe90eebf0 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -217,5 +217,177 @@ Normalizes along dimension axis using an L2 norm
 .set_support_level(2)
 .add_type_rel("Identity", IdentityRel);
 
+// Dropout
+TVM_REGISTER_NODE_TYPE(DropoutAttrs);
+
+bool DropoutRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  // dropout returns the original tensor with dropout applied
+  // and a mask tensor (1.0 where element not dropped, 0.0 where dropped)
+  auto ret_type = TensorTypeNode::make(data->shape, data->dtype);
+  reporter->Assign(types[1], TupleTypeNode::make(Array<Type>({ret_type, ret_type})));
+  return true;
+}
+
+Expr MakeDropout(Expr data, double rate) {
+  auto attrs = make_node<DropoutAttrs>();
+  attrs->rate = rate;
+  static const Op& op = Op::Get("nn.dropout");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.dropout")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeDropout, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.dropout")
+.describe(R"code(Applies the dropout operation to the input array.
+
+During training, each element of the input is set to zero with probability ``p``.
+The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input unchanged.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "Input to which dropout will be applied.")
+.set_support_level(1)
+.add_type_rel("Dropout", DropoutRel);
+
+// batch_norm
+TVM_REGISTER_NODE_TYPE(BatchNormAttrs);
+
+bool CheckVectorLength(int64_t dim, const DataType& dtype, Type vector, const char* name) {
+  const auto* candidate = vector.as<TensorTypeNode>();
+  CHECK(candidate != nullptr)
+    << name << " should be a vector but is not a tensor type,";
+  CHECK_EQ(dtype, candidate->dtype)
+    << name << " should be of the same data type as the original but it is not.";
+  CHECK_EQ(candidate->shape.size(), 1)
+    << name << " should be a vector but has a shape of "
+    << candidate->shape.size() << " dimensions instead of 1.";
+
+  const int64_t* length = as_const_int(candidate->shape[0]);
+  if (length == nullptr) return false;
+  CHECK(*length == dim)
+    << name << " should be as long as the channel but has length "
+    << *length << " instead of " << dim << ".";
+  return true;
+}
+
+bool BatchNormRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 6);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  if (data->shape.size() == 0) return false;
+
+  const BatchNormAttrs* param = attrs.as<BatchNormAttrs>();
+
+  // axis of -1 means use the last dimension
+  CHECK(param->axis >= -1 && param->axis < (int)data->shape.size());
+  int axis = (param->axis != -1) ? param->axis : data->shape.size() - 1;
+
+  auto dim = as_const_int(data->shape[axis]);
+  if (dim == nullptr) return false;
+
+  // if we are using beta and gamma, they need to be of shape (dim,)
+  if (param->scale && !CheckVectorLength(*dim, data->dtype, types[1], "The gamma scale factor")) {
+    return false;
+  }
+
+  if (param->center && !CheckVectorLength(*dim, data->dtype, types[2], "The beta offset factor")) {
+    return false;
+  }
+
+  // the two running averages must also be vectors of length dim
+  if (!CheckVectorLength(*dim, data->dtype, types[3], "The moving mean")) {
+    return false;
+  }
+  if (!CheckVectorLength(*dim, data->dtype, types[4], "The moving variance")) {
+    return false;
+  }
+
+  // output is a tuple of the normed data (same shape as input), new running mean,
+  // and new running average (the latter two are both vectors of length dim)
+  std::vector<Type> fields;
+  auto vec_ty = TensorTypeNode::make(Array<IndexExpr>({data->shape[axis]}),
+                                     data->dtype);
+  fields.push_back(TensorTypeNode::make(data->shape, data->dtype));
+  fields.push_back(vec_ty);
+  fields.push_back(vec_ty);
+  reporter->Assign(types[5], TupleTypeNode::make(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeBatchNorm(Expr data, Expr gamma, Expr beta, Expr moving_mean, Expr moving_var,
+                   int axis, double epsilon, bool center, bool scale) {
+  auto attrs = make_node<BatchNormAttrs>();
+  attrs->axis = axis;
+  attrs->epsilon = epsilon;
+  attrs->center = center;
+  attrs->scale = scale;
+  static const Op& op = Op::Get("nn.batch_norm");
+  return CallNode::make(op, {data, gamma, beta, moving_mean, moving_var}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.batch_norm")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 9>(MakeBatchNorm, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.batch_norm")
+.describe(R"code(Batch normalization layer (Ioffe and Szegedy, 2014).
+Normalizes the input at each batch, i.e. applies a transformation
+that maintains the mean activation close to 0 and the activation
+standard deviation close to 1.
+
+.. math::
+
+  data\_mean[i] = mean(data[:,i,:,...]) \\
+  data\_var[i] = var(data[:,i,:,...])
+
+Then compute the normalized output, which has the same shape as input, as following:
+
+.. math::
+
+  out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}} \
+* gamma[i] + beta[i]
+
+Both *mean* and *var* returns a scalar by treating the input as a vector.
+
+Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` have shape *(k,)*.
+
+Besides the inputs and the outputs, this operator accepts two auxiliary
+states, ``moving_mean`` and ``moving_var``, which are *k*-length
+vectors. They are global statistics for the whole dataset, which are updated
+by::
+
+  moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+  moving_var = moving_var * momentum + data_var * (1 - momentum)
+
+The parameter ``axis`` specifies which axis of the input shape denotes
+the 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel
+axis to be the last item in the input shape.
+
+.. note::
+    This operator can be optimized away for inference.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(5)
+.add_argument("data", "Tensor", "Input to which batch_norm will be applied.")
+.add_argument("gamma", "Tensor", "The gamma scale factor.")
+.add_argument("beta", "Tensor", "The beta offset factor.")
+.add_argument("moving_mean", "Tensor", "Running mean of input.")
+.add_argument("moving_var", "Tensor", "Running variance of input.")
+.set_support_level(1)
+.add_type_rel("BatchNorm", BatchNormRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 05c02ab5d197..914eafeb57a9 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -196,6 +196,93 @@ def test_l2_normalize():
     ftype = func.checked_type
     assert ftype.ret_type == relay.ty.TensorType((n, c , h, w), "float32")
 
+def test_dropout():
+    ib = relay.ir_builder.IRBuilder()
+    input_ty = relay.ty.TensorType((3, 4, 5), "int8")
+    x = ib.param("x", input_ty)
+    with ib.function(x) as func:
+        ib.ret(relay.nn.dropout(x))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TupleType([input_ty, input_ty])
+
+    ib = relay.ir_builder.IRBuilder()
+    n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
+    input_ty = relay.ty.TensorType((n, t, d), "float32")
+    x = ib.param("x", input_ty)
+    with ib.function(x) as func:
+        ib.ret(relay.nn.dropout(x, rate=0.75))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TupleType([input_ty, input_ty])
+
+
+def test_batch_norm():
+    # beta and gamma ignored
+    ib = relay.ir_builder.IRBuilder()
+    data = ib.param("data", relay.ty.TensorType((3, 2, 1), "float32"))
+    gamma = ib.param("gamma", relay.ty.TensorType((5,), "int8"))
+    beta = ib.param("beta", relay.ty.TensorType((12, 16), "int64"))
+    moving_mean = ib.param("moving_mean", relay.ty.TensorType((2,), "float32"))
+    moving_var = ib.param("moving_var", relay.ty.TensorType((2,), "float32"))
+    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
+        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                                   center=False, scale=False))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
+        relay.ty.TensorType((3, 2, 1), "float32"),
+        relay.ty.TensorType((2,), "float32"),
+        relay.ty.TensorType((2,), "float32")
+    ]))
+
+    # with beta and gamma, different axis
+    ib = relay.ir_builder.IRBuilder()
+    data = ib.param("data", relay.ty.TensorType((3, 2, 1), "float32"))
+    gamma = ib.param("gamma", relay.ty.TensorType((3,), "float32"))
+    beta = ib.param("beta", relay.ty.TensorType((3,), "float32"))
+    moving_mean = ib.param("moving_mean", relay.ty.TensorType((3,), "float32"))
+    moving_var = ib.param("moving_var", relay.ty.TensorType((3,), "float32"))
+    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
+        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                                   axis=0, center=False, scale=False))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
+        relay.ty.TensorType((3, 2, 1), "float32"),
+        relay.ty.TensorType((3,), "float32"),
+        relay.ty.TensorType((3,), "float32")
+    ]))
+
+    # axis=-1
+    ib = relay.ir_builder.IRBuilder()
+    data = ib.param("data", relay.ty.TensorType((1, 2, 3), "float32"))
+    gamma = ib.param("gamma", relay.ty.TensorType((3,), "float32"))
+    beta = ib.param("beta", relay.ty.TensorType((3,), "float32"))
+    moving_mean = ib.param("moving_mean", relay.ty.TensorType((3,), "float32"))
+    moving_var = ib.param("moving_var", relay.ty.TensorType((3,), "float32"))
+    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
+        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                                   axis=-1, center=False, scale=False))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
+        relay.ty.TensorType((1, 2, 3), "float32"),
+        relay.ty.TensorType((3,), "float32"),
+        relay.ty.TensorType((3,), "float32")
+    ]))
+
+
 if __name__ == "__main__":
     test_unary_op()
     test_single_op()
@@ -207,3 +294,5 @@ def test_l2_normalize():
     test_binary_broadcast_op()
     test_lrn()
     test_l2_normalize()
+    test_dropout()
+    test_batch_norm()

From 06e5fce07dfa891d40142dfc8c6e8060511d542e Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Tue, 16 Oct 2018 19:48:19 +0300
Subject: [PATCH 228/529] [TVM] Eagerer const folding for logic ops (#1907)

---
 src/lang/ir_operator.cc                     | 22 +++++++-----
 tests/python/unittest/test_lang_operator.py | 38 +++++++++++++++++++++
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index 30742764351d..275752644be9 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -310,20 +310,26 @@ Expr operator!=(Expr a, Expr b) {
 
 Expr operator&&(Expr a, Expr b) {
   using ir::UIntImm;
-  const UIntImm* pa = a.as<UIntImm>();
-  const UIntImm* pb = b.as<UIntImm>();
-  if (pa && pb) {
-    return UIntImm::make(UInt(1), pa->value && pb->value);
+  if (a.type().is_bool() && b.type().is_bool()) {
+    const UIntImm* pa = a.as<UIntImm>();
+    const UIntImm* pb = b.as<UIntImm>();
+    if (pa && pa->value) return b;
+    if (pa && !pa->value) return a;
+    if (pb && pb->value) return a;
+    if (pb && !pb->value) return b;
   }
   return ir::And::make(a, b);
 }
 
 Expr operator||(Expr a, Expr b) {
   using ir::UIntImm;
-  const UIntImm* pa = a.as<UIntImm>();
-  const UIntImm* pb = b.as<UIntImm>();
-  if (pa && pb) {
-    return UIntImm::make(UInt(1), pa->value || pb->value);
+  if (a.type().is_bool() && b.type().is_bool()) {
+    const UIntImm* pa = a.as<UIntImm>();
+    const UIntImm* pb = b.as<UIntImm>();
+    if (pa && pa->value) return a;
+    if (pa && !pa->value) return b;
+    if (pb && pb->value) return b;
+    if (pb && !pb->value) return a;
   }
   return ir::Or::make(a, b);
 }
diff --git a/tests/python/unittest/test_lang_operator.py b/tests/python/unittest/test_lang_operator.py
index 9c701ed2abe3..af7d9fd5544a 100644
--- a/tests/python/unittest/test_lang_operator.py
+++ b/tests/python/unittest/test_lang_operator.py
@@ -30,6 +30,44 @@ def test_const_fold2():
     assert (1 * x).same_as(x)
     assert isinstance((1 / x), tvm.expr.Div)
 
+def test_const_fold3():
+    def check_throws(f):
+        try:
+            f()
+        except tvm.TVMError:
+            pass
+        else:
+            raise AssertionError("Should have raised an exception but didn't.")
+
+    # Test that using ints with logic operations is forbidden
+    x = tvm.var("x")
+    for val in [0, 1]:
+        for func in [tvm.all, tvm.any]:
+            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
+            check_throws(lambda: func(x, tvm.const(val, 'uint1')))
+
+    # Test const folding when both arguments are const
+    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b), (tvm.any, lambda a, b: a or b)]:
+        for v1 in [0, 1]:
+            for v2 in [0, 1]:
+                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
+                                         tvm.const(py_func(v1, v2), 'uint1'))
+
+    x = tvm.var("x", 'uint1')
+    true = tvm.const(1, 'uint1')
+    false = tvm.const(0, 'uint1')
+
+    assert tvm.all(x, true).same_as(x)
+    assert tvm.all(true, x).same_as(x)
+    assert tvm.any(x, false).same_as(x)
+    assert tvm.any(false, x).same_as(x)
+
+    assert tvm.all(x, false).same_as(false)
+    assert tvm.all(false, x).same_as(false)
+    assert tvm.any(x, true).same_as(true)
+    assert tvm.any(true, x).same_as(true)
+
 if __name__ == "__main__":
     test_const_fold()
     test_const_fold2()
+    test_const_fold3()

From 7b13c1fcfb19f967b0f5a8e6f5a3d5de07d3c421 Mon Sep 17 00:00:00 2001
From: Tatsuya Nishiyama <nishiyama.tatsuya0@gmail.com>
Date: Wed, 17 Oct 2018 05:36:31 +0900
Subject: [PATCH 229/529] Add dtype option to verify_mxnet_frontend_impl
 (#1908)

---
 nnvm/tests/python/frontend/mxnet/test_forward.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py
index 6c086cb367e8..dcab7d8fcde7 100644
--- a/nnvm/tests/python/frontend/mxnet/test_forward.py
+++ b/nnvm/tests/python/frontend/mxnet/test_forward.py
@@ -14,7 +14,7 @@
 
 
 def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), out_shape=(1, 1000),
-                               gluon_impl=False, name=None):
+                               gluon_impl=False, name=None, dtype='float32'):
     """Use name different from test to avoid let nose pick it up"""
     if gluon_impl:
         def get_gluon_output(name, x):
@@ -57,7 +57,6 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
         return out.asnumpy()
 
     # random input
-    dtype = 'float32'
     x = np.random.uniform(size=data_shape)
     if gluon_impl:
         gluon_out, gluon_sym = get_gluon_output(name, x)

From a2238d15a3f9c2d968ac850879ee3435f83068bd Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Wed, 17 Oct 2018 06:56:09 +0530
Subject: [PATCH 230/529] [FRONTEND][DARKNET] YOLO V3 model support (#1734)

---
 nnvm/python/nnvm/frontend/darknet.py          |  59 +++++-
 nnvm/python/nnvm/testing/__init__.py          |   2 +-
 nnvm/python/nnvm/testing/darknet.py           |   5 +-
 .../{yolo2_detection.py => yolo_detection.py} | 173 ++++++++++--------
 .../python/frontend/darknet/test_forward.py   | 110 ++++++++---
 tutorials/nnvm/from_darknet.py                |  66 ++++---
 6 files changed, 291 insertions(+), 124 deletions(-)
 rename nnvm/python/nnvm/testing/{yolo2_detection.py => yolo_detection.py} (54%)

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 7fb3e34750c8..bf3a16cdb23e 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -440,11 +440,13 @@ def __init__(self, net, dtype='float32'):
         self._state_ctr['cell_state'] = 0
         self._state_ctr['gru'] = 0
 
-    def _read_memory_buffer(self, shape, data):
+    def _read_memory_buffer(self, shape, data, dtype=None):
+        if dtype is None:
+            dtype = self.dtype
         length = 1
         for x in shape:
             length *= x
-        data_np = np.zeros(length, dtype=self.dtype)
+        data_np = np.zeros(length, dtype=dtype)
         for i in range(length):
             data_np[i] = data[i]
         return data_np.reshape(shape)
@@ -493,6 +495,31 @@ def _get_connected_weights(self, layer, opname):
             k = self._get_tvm_params_name(opname[0], 'bias')
             self._tvmparams[k] = tvm.nd.array(biases)
 
+    def _get_region_weights(self, layer, opname):
+        """Parse the biases for region layer."""
+        biases = self._read_memory_buffer((layer.n*2, ), layer.biases)
+        attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w,
+                               layer.classes, layer.coords, layer.background],
+                              dtype=np.int32)
+        k = self._get_tvm_params_name(opname, 'bias')
+        self._tvmparams[k] = tvm.nd.array(biases)
+        k = self._get_tvm_params_name(opname, 'attr')
+        self._tvmparams[k] = tvm.nd.array(attributes)
+
+    def _get_yolo_weights(self, layer, opname):
+        """Parse the biases and mask for yolo layer."""
+        biases = self._read_memory_buffer((layer.total*2, ), layer.biases)
+        mask = self._read_memory_buffer((layer.n, ), layer.mask, dtype='int32')
+        attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w,
+                               layer.classes, layer.total],
+                              dtype=np.int32)
+        k = self._get_tvm_params_name(opname, 'bias')
+        self._tvmparams[k] = tvm.nd.array(biases)
+        k = self._get_tvm_params_name(opname, 'mask')
+        self._tvmparams[k] = tvm.nd.array(mask)
+        k = self._get_tvm_params_name(opname, 'attr')
+        self._tvmparams[k] = tvm.nd.array(attributes)
+
     def _get_batchnorm_weights(self, layer, opname, size):
         """Parse the weights for batchnorm, which includes, scales, moving mean
         and moving variances."""
@@ -621,6 +648,11 @@ def _get_darknet_params(self, layer, opname):
         elif LAYERTYPE.CONNECTED == layer.type:
             self._get_connected_weights(layer, opname)
 
+        elif LAYERTYPE.REGION == layer.type:
+            self._get_region_weights(layer, opname)
+
+        elif LAYERTYPE.YOLO == layer.type:
+            self._get_yolo_weights(layer, opname)
     def _preproc_layer(self, layer, layer_num):
         """To preprocess each darknet layer, some layer doesnt need processing."""
         if layer_num == 0:
@@ -850,6 +882,27 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
 
         return processed, sym
 
+    def _make_outlist(self, sym, op_name, layer, layer_num):
+        if layer.type == LAYERTYPE.REGION:
+            k = self._get_tvm_params_name(op_name, 'attr')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'bias')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            if layer_num != self.net.n-1:
+                self._outs.insert(0, sym)
+
+        elif layer.type == LAYERTYPE.YOLO:
+            k = self._get_tvm_params_name(op_name, 'attr')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'bias')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'mask')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            if layer_num != self.net.n-1:
+                self._outs.insert(0, sym)
+
+        return
+
     def from_darknet(self):
         """To convert the darknet symbol to nnvm symbols."""
         for i in range(self.net.n):
@@ -867,6 +920,8 @@ def from_darknet(self):
             layer_name, sym = _darknet_convert_symbol(op_name, _as_list(sym), attr)
             self._get_darknet_params(self.net.layers[i], layer_name)
             self._sym_array[i] = sym
+            self._make_outlist(sym, layer_name, layer, i)
+
         self._outs = _as_list(sym) + self._outs
         if isinstance(self._outs, list):
             sym = _sym.Group(self._outs)
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index 3bf03a1e0039..44b8529821d0 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -13,5 +13,5 @@
 from . import inception_v3
 from . import dcgan
 from . import dqn
-from . import yolo2_detection
+from . import yolo_detection
 from . import check_computation
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index 9a346e01b50b..d4d33a6949f3 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -55,10 +55,10 @@ def _letterbox_image(img, w_in, h_in):
     imc, imh, imw = img.shape
     if (w_in / imw) < (h_in / imh):
         new_w = w_in
-        new_h = imh * w_in / imw
+        new_h = imh * w_in // imw
     else:
         new_h = h_in
-        new_w = imw * h_in/imh
+        new_w = imw * h_in // imh
     resized = _resize_image(img, new_w, new_h)
     boxed = np.full((imc, h_in, w_in), 0.5, dtype=float)
     _, resizedh, resizedw = resized.shape
@@ -511,6 +511,7 @@ class ACTIVATION(object):
 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
 layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+layer make_upsample_layer(int batch, int w, int h, int c, int stride);
 void free_network(network *net);
 """
                    )
diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py
similarity index 54%
rename from nnvm/python/nnvm/testing/yolo2_detection.py
rename to nnvm/python/nnvm/testing/yolo_detection.py
index 0b229149b8ea..86f19297cabf 100644
--- a/nnvm/python/nnvm/testing/yolo2_detection.py
+++ b/nnvm/python/nnvm/testing/yolo_detection.py
@@ -9,27 +9,22 @@
 from __future__ import division
 import math
 from collections import namedtuple
+from functools import cmp_to_key
 import numpy as np
 
-def _entry_index(batch, w, h, outputs, classes, coords, location, entry):
-    n = int(location/(w*h))
-    loc = location%(w*h)
-    return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc
-
 Box = namedtuple('Box', ['x', 'y', 'w', 'h'])
-def _get_region_box(x, biases, n, index, i, j, w, h, stride):
-    b = Box(0, 0, 0, 0)
-    b = b._replace(x=(i + x[index + 0*stride]) / w)
-    b = b._replace(y=(j + x[index + 1*stride]) / h)
-    b = b._replace(w=np.exp(x[index + 2*stride]) * biases[2*n] / w)
-    b = b._replace(h=np.exp(x[index + 3*stride]) * biases[2*n+1] / h)
-    return b
-
-def _correct_region_boxes(boxes, n, w, h, netw, neth, relative):
-    new_w, new_h = (netw, (h*netw)/w) if (netw/w < neth/h) else ((w*neth/h), neth)
-    for i in range(n):
-        b = boxes[i]
-        b = boxes[i]
+
+def nms_comparator(a, b):
+    if 'sort_class' in b and b['sort_class'] >= 0:
+        diff = a['prob'][b['sort_class']] - b['prob'][b['sort_class']]
+    else:
+        diff = a['objectness'] - b['objectness']
+    return diff
+
+def _correct_boxes(dets, w, h, netw, neth, relative):
+    new_w, new_h = (netw, (h*netw)//w) if (netw/w < neth/h) else ((w*neth//h), neth)
+    for det in dets:
+        b = det['bbox']
         b = b._replace(x=(b.x - (netw - new_w)/2/netw) / (new_w/netw))
         b = b._replace(y=(b.y - (neth - new_h)/2/neth) / (new_h/neth))
         b = b._replace(w=b.w * netw/new_w)
@@ -39,7 +34,8 @@ def _correct_region_boxes(boxes, n, w, h, netw, neth, relative):
             b = b._replace(w=b.w * w)
             b = b._replace(y=b.y * h)
             b = b._replace(h=b.h * h)
-        boxes[i] = b
+        det['bbox'] = b
+    return dets
 
 def _overlap(x1, w1, x2, w2):
     l1 = x1 - w1/2
@@ -65,72 +61,103 @@ def _box_union(a, b):
 def _box_iou(a, b):
     return _box_intersection(a, b)/_box_union(a, b)
 
-def get_region_boxes(layer_in, imw, imh, netw, neth, thresh, probs,
-                     boxes, relative, tvm_out):
-    "To get the boxes for the image based on the prediction"
-    lw = layer_in.w
-    lh = layer_in.h
-    probs = [[0 for i in range(layer_in.classes + 1)] for y in range(lw*lh*layer_in.n)]
-    boxes = [Box(0, 0, 0, 0) for i in range(lw*lh*layer_in.n)]
-    for i in range(lw*lh):
-        row = int(i / lw)
-        col = int(i % lw)
-        for n in range(layer_in.n):
-            index = n*lw*lh + i
-            obj_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                     layer_in.coords, n*lw*lh + i, layer_in.coords)
-            box_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                     layer_in.coords, n*lw*lh + i, 0)
-            mask_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                      layer_in.coords, n*lw*lh + i, 4)
-            scale = 1 if layer_in.background  else tvm_out[obj_index]
-            boxes[index] = _get_region_box(tvm_out, layer_in.biases, n, box_index, col,
-                                           row, lw, lh, lw*lh)
-            if not layer_in.softmax_tree:
-                max_element = 0
-                for j in range(layer_in.classes):
-                    class_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                               layer_in.coords, n*lw*lh + i, layer_in.coords+1+j)
-                    prob = scale*tvm_out[class_index]
-                    probs[index][j] = prob if prob > thresh else 0
-                    max_element = max(max_element, prob)
-                probs[index][layer_in.classes] = max_element
-
-    _correct_region_boxes(boxes, lw*lh*layer_in.n, imw, imh, netw, neth, relative)
-    return boxes, probs
-
-
-def do_nms_sort(boxes, probs, total, classes, thresh):
-    "Does the sorting based on the threshold values"
-    SortableBbox = namedtuple('SortableBbox', ['index_var', 'class_var', 'probs'])
+def _get_box(data, biases, n, location, lw, lh, w, h):
+    bx = (location[2] + data[location[0]][0][location[1]][location[2]]) / lw
+    by = (location[1] + data[location[0]][1][location[1]][location[2]]) / lh
+    bw = np.exp(data[location[0]][2][location[1]][location[2]]) * biases[2*n] / w
+    bh = np.exp(data[location[0]][3][location[1]][location[2]]) * biases[2*n+1] / h
+    return Box(bx, by, bw, bh)
 
-    s = [SortableBbox(0, 0, []) for i in range(total)]
-    for i in range(total):
-        s[i] = s[i]._replace(index_var=i)
-        s[i] = s[i]._replace(class_var=0)
-        s[i] = s[i]._replace(probs=probs)
+def _get_yolo_detections(l, im_shape, net_shape, thresh, relative, dets):
+    data = l['output']
+    active_data_loc = np.asarray(np.where(data[:, 4, :, :] > thresh))
+    before_correct_dets = []
+    for i in range(active_data_loc.shape[1]):
+        location = [active_data_loc[0][i], active_data_loc[1][i], active_data_loc[2][i]]
+        box_b = _get_box(data, l['biases'], np.asarray(l['mask'])[location[0]], location,
+                         data.shape[2], data.shape[3], net_shape[0], net_shape[1])
+        objectness = data[location[0]][4][location[1]][location[2]]
+        classes = l['classes']
+        prob = objectness*data[location[0], 5:5 + 1 + classes, location[1], location[2]]
+        prob[prob < thresh] = 0
+        detection = {}
+        detection['bbox'] = box_b
+        detection['classes'] = classes
+        detection['prob'] = prob
+        detection['objectness'] = objectness
+        before_correct_dets.append(detection)
+    dets.extend(_correct_boxes(before_correct_dets, im_shape[0], im_shape[1],
+                               net_shape[0], net_shape[1], relative))
+    return
 
+def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets):
+    data = l['output']
+    before_correct_dets = []
+    for row in range(data.shape[2]):
+        for col in range(data.shape[3]):
+            for n in range(data.shape[0]):
+                prob = [0]*l['classes']
+                scale = data[n, l['coords'], row, col] if not l['background'] else 1
+                location = [n, row, col]
+                box_b = _get_box(data, l['biases'], n, location,
+                                 data.shape[2], data.shape[3], data.shape[2], data.shape[3])
+                objectness = scale if scale > thresh else 0
+                if objectness:
+                    prob = scale * data[n, l['coords']+1: l['coords']+1+l['classes'],
+                                        row, col]
+                    prob[prob < thresh] = 0
+                detection = {}
+                detection['bbox'] = box_b
+                detection['prob'] = prob
+                detection['objectness'] = objectness
+                before_correct_dets.append(detection)
+    _correct_boxes(before_correct_dets, im_shape[0], im_shape[1],
+                   net_shape[0], net_shape[1], relative)
+    dets.extend(before_correct_dets)
+    return
+
+def fill_network_boxes(net_shape, im_shape,
+                       thresh, relative, tvm_out):
+    dets = []
+    for layer in tvm_out:
+        if layer['type'] == 'Yolo':
+            _get_yolo_detections(layer, im_shape, net_shape, thresh, relative, dets)
+        elif layer['type'] == 'Region':
+            _get_region_detections(layer, im_shape, net_shape, thresh, relative, dets)
+    return dets
+
+def do_nms_sort(dets, classes, thresh):
+    "Does the sorting based on the threshold values"
+    k = len(dets)-1
+    cnt = 0
+    while cnt < k:
+        if dets[cnt]['objectness'] == 0:
+            dets[k], dets[cnt] = dets[cnt], dets[k]
+            k = k - 1
+        else:
+            cnt = cnt + 1
+    total = k+1
     for k in range(classes):
         for i in range(total):
-            s[i] = s[i]._replace(class_var=k)
-        s = sorted(s, key=lambda x: x.probs[x.index_var][x.class_var], reverse=True)
+            dets[i]['sort_class'] = k
+        dets[0:total] = sorted(dets[0:total],
+                               key=cmp_to_key(nms_comparator), reverse=True)
         for i in range(total):
-            if probs[s[i].index_var][k] == 0:
+            if dets[i]['prob'][k] == 0:
                 continue
-            a = boxes[s[i].index_var]
+            a = dets[i]['bbox']
             for j in range(i+1, total):
-                b = boxes[s[j].index_var]
+                b = dets[j]['bbox']
                 if _box_iou(a, b) > thresh:
-                    probs[s[j].index_var][k] = 0
-    return boxes, probs
+                    dets[j]['prob'][k] = 0
 
-def draw_detections(im, num, thresh, boxes, probs, names, classes):
+def draw_detections(im, dets, thresh, names, classes):
     "Draw the markings around the detected region"
-    for i in range(num):
+    for det in dets:
         labelstr = []
         category = -1
         for j in range(classes):
-            if probs[i][j] > thresh:
+            if det['prob'][j] > thresh:
                 if category == -1:
                     category = j
                 labelstr.append(names[j])
@@ -142,7 +169,7 @@ def draw_detections(im, num, thresh, boxes, probs, names, classes):
             green = _get_color(1, offset, classes)
             blue = _get_color(0, offset, classes)
             rgb = [red, green, blue]
-            b = boxes[i]
+            b = det['bbox']
             left = int((b.x-b.w/2.)*imw)
             right = int((b.x+b.w/2.)*imw)
             top = int((b.y-b.h/2.)*imh)
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index 3d7d06b48483..b1d5e735611a 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -13,6 +13,7 @@
 import tvm
 from tvm.contrib import graph_runtime
 from nnvm import frontend
+from nnvm.testing.darknet import LAYERTYPE
 from nnvm.testing.darknet import __darknetffi__
 import nnvm.compiler
 if sys.version_info >= (3,):
@@ -50,14 +51,24 @@ def _download(url, path, overwrite=False, sizecompare=False):
 _download(DARKNETLIB_URL, DARKNET_LIB)
 LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
 
-def _get_tvm_output(net, data):
+def _read_memory_buffer(shape, data, dtype='float32'):
+    length = 1
+    for x in shape:
+        length *= x
+    data_np = np.zeros(length, dtype=dtype)
+    for i in range(length):
+        data_np[i] = data[i]
+    return data_np.reshape(shape)
+
+def _get_tvm_output(net, data, build_dtype='float32'):
     '''Compute TVM output'''
     dtype = 'float32'
     sym, params = frontend.darknet.from_darknet(net, dtype)
 
     target = 'llvm'
     shape_dict = {'data': data.shape}
-    graph, library, params = nnvm.compiler.build(sym, target, shape_dict, dtype, params=params)
+    graph, library, params = nnvm.compiler.build(sym, target, shape_dict,
+                                                 build_dtype, params=params)
     # Execute on TVM
     ctx = tvm.cpu(0)
     m = graph_runtime.create(graph, library, ctx)
@@ -66,14 +77,50 @@ def _get_tvm_output(net, data):
     m.set_input(**params)
     m.run()
     # get outputs
-    out_shape = (net.outputs,)
-    tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+    tvm_out = []
+    for i in range(m.get_num_outputs()):
+        tvm_out.append(m.get_output(i).asnumpy())
     return tvm_out
 
-def test_forward(net):
+def test_forward(net, build_dtype='float32'):
     '''Test network with given input image on both darknet and tvm'''
     def get_darknet_output(net, img):
-        return LIB.network_predict_image(net, img)
+        LIB.network_predict_image(net, img)
+        out = []
+        for i in range(net.n):
+            layer = net.layers[i]
+            if layer.type == LAYERTYPE.REGION:
+                attributes = np.array([layer.n, layer.out_c, layer.out_h,
+                                       layer.out_w, layer.classes,
+                                       layer.coords, layer.background],
+                                      dtype=np.int32)
+                out.insert(0, attributes)
+                out.insert(0, _read_memory_buffer((layer.n*2, ), layer.biases))
+                layer_outshape = (layer.batch, layer.out_c,
+                                  layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(layer_outshape, layer.output))
+            elif layer.type == LAYERTYPE.YOLO:
+                attributes = np.array([layer.n, layer.out_c, layer.out_h,
+                                       layer.out_w, layer.classes,
+                                       layer.total],
+                                      dtype=np.int32)
+                out.insert(0, attributes)
+                out.insert(0, _read_memory_buffer((layer.total*2, ), layer.biases))
+                out.insert(0, _read_memory_buffer((layer.n, ), layer.mask, dtype='int32'))
+                layer_ou tshape = (layer.batch, layer.out_c,
+                                  layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(layer_outshape, layer.output))
+            elif i == net.n-1:
+                if layer.type == LAYERTYPE.CONNECTED:
+                    darknet_outshape = (layer.batch, layer.out_c)
+                elif layer.type in [LAYERTYPE.SOFTMAX]:
+                    darknet_outshape = (layer.batch, layer.outputs)
+                else:
+                    darknet_outshape = (layer.batch, layer.out_c,
+                                        layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(darknet_outshape, layer.output))
+        return out
+
     dtype = 'float32'
 
     test_image = 'dog.jpg'
@@ -81,11 +128,7 @@ def get_darknet_output(net, img):
     _download(img_url, test_image)
     img = LIB.letterbox_image(LIB.load_image_color(test_image.encode('utf-8'), 0, 0), net.w, net.h)
     darknet_output = get_darknet_output(net, img)
-    darknet_out = np.zeros(net.outputs, dtype='float32')
-    for i in range(net.outputs):
-        darknet_out[i] = darknet_output[i]
     batch_size = 1
-
     data = np.empty([batch_size, img.c, img.h, img.w], dtype)
     i = 0
     for c in range(img.c):
@@ -94,8 +137,9 @@ def get_darknet_output(net, img):
                 data[0][c][h][k] = img.data[i]
                 i = i + 1
 
-    tvm_out = _get_tvm_output(net, data)
-    np.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-3, atol=1e-3)
+    tvm_out = _get_tvm_output(net, data, build_dtype)
+    for tvm_outs, darknet_out in zip(tvm_out, darknet_output):
+        np.testing.assert_allclose(darknet_out, tvm_outs, rtol=1e-3, atol=1e-3)
 
 def test_rnn_forward(net):
     '''Test network with given input data on both darknet and tvm'''
@@ -106,11 +150,14 @@ def get_darknet_network_predict(net, data):
     np_arr = np.zeros([1, net.inputs], dtype='float32')
     np_arr[0, 84] = 1
     cffi_arr = ffi.cast('float*', np_arr.ctypes.data)
-    tvm_out = _get_tvm_output(net, np_arr)
+    tvm_out = _get_tvm_output(net, np_arr)[0]
     darknet_output = get_darknet_network_predict(net, cffi_arr)
     darknet_out = np.zeros(net.outputs, dtype='float32')
     for i in range(net.outputs):
         darknet_out[i] = darknet_output[i]
+    last_layer = net.layers[net.n-1]
+    darknet_outshape = (last_layer.batch, last_layer.outputs)
+    darknet_out = darknet_out.reshape(darknet_outshape)
     np.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4)
 
 def test_forward_extraction():
@@ -152,8 +199,8 @@ def test_forward_resnet50():
     test_forward(net)
     LIB.free_network(net)
 
-def test_forward_yolo():
-    '''test yolo model'''
+def test_forward_yolov2():
+    '''test yolov2 model'''
     model_name = 'yolov2'
     cfg_name = model_name + '.cfg'
     weights_name = model_name + '.weights'
@@ -162,7 +209,22 @@ def test_forward_yolo():
     _download(cfg_url, cfg_name)
     _download(weights_url, weights_name)
     net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0)
-    test_forward(net)
+    build_dtype = {}
+    test_forward(net, build_dtype)
+    LIB.free_network(net)
+
+def test_forward_yolov3():
+    '''test yolov3 model'''
+    model_name = 'yolov3'
+    cfg_name = model_name + '.cfg'
+    weights_name = model_name + '.weights'
+    cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true'
+    weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true'
+    _download(cfg_url, cfg_name)
+    _download(weights_url, weights_name)
+    net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0)
+    build_dtype = {}
+    test_forward(net, build_dtype)
     LIB.free_network(net)
 
 def test_forward_convolutional():
@@ -271,20 +333,21 @@ def test_forward_region():
     net.layers[1] = layer_2
     net.w = net.h = 224
     LIB.resize_network(net, 224, 224)
-    test_forward(net)
+    build_dtype = {}
+    test_forward(net, build_dtype)
     LIB.free_network(net)
 
 def test_forward_yolo_op():
     '''test yolo layer'''
     net = LIB.make_network(2)
     layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
-    a = []
-    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 0, a, 2)
+    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 9, __darknetffi__.NULL, 2)
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.w = net.h = 224
     LIB.resize_network(net, 224, 224)
-    test_forward(net)
+    build_dtype = {}
+    test_forward(net, build_dtype)
     LIB.free_network(net)
 
 def test_forward_upsample():
@@ -313,7 +376,7 @@ def test_forward_softmax():
     '''test softmax layer'''
     net = LIB.make_network(1)
     layer_1 = LIB.make_softmax_layer(1, 75, 1)
-    layer_1.temperature=1
+    layer_1.temperature = 1
     net.layers[0] = layer_1
     net.w = net.h = 5
     LIB.resize_network(net, net.w, net.h)
@@ -324,7 +387,7 @@ def test_forward_softmax_temperature():
     '''test softmax layer'''
     net = LIB.make_network(1)
     layer_1 = LIB.make_softmax_layer(1, 75, 1)
-    layer_1.temperature=0.8
+    layer_1.temperature = 0.8
     net.layers[0] = layer_1
     net.w = net.h = 5
     LIB.resize_network(net, net.w, net.h)
@@ -441,7 +504,8 @@ def test_forward_activation_logistic():
     test_forward_resnet50()
     test_forward_alexnet()
     test_forward_extraction()
-    test_forward_yolo()
+    test_forward_yolov2()
+    test_forward_yolov3()
     test_forward_convolutional()
     test_forward_maxpooling()
     test_forward_avgpooling()
diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index 87ab60fc2850..f0eec98c00ea 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -1,11 +1,11 @@
 """
-Compile YOLO-V2 in DarkNet Models
+Compile YOLO-V2 and YOLO-V3 in DarkNet Models
 =================================
 **Author**: `Siju Samuel <https://siju-samuel.github.io/>`_
 
 This article is an introductory tutorial to deploy darknet models with NNVM.
 All the required models and libraries will be downloaded from the internet by the script.
-This script runs the YOLO-V2 Model with the bounding boxes
+This script runs the YOLO-V2 and YOLO-V3 Model with the bounding boxes
 Darknet parsing have dependancy with CFFI and CV2 library
 Please install CFFI and CV2 before executing this script
 
@@ -17,6 +17,7 @@
 
 import nnvm
 import nnvm.frontend.darknet
+import nnvm.testing.yolo_detection
 import nnvm.testing.darknet
 import matplotlib.pyplot as plt
 import numpy as np
@@ -28,7 +29,7 @@
 from nnvm.testing.darknet import __darknetffi__
 
 # Model name
-MODEL_NAME = 'yolo'
+MODEL_NAME = 'yolov3'
 
 ######################################################################
 # Download required files
@@ -75,9 +76,11 @@
 data = np.empty([batch_size, net.c, net.h, net.w], dtype)
 shape = {'data': data.shape}
 print("Compiling the model...")
+dtype_dict = {}
 with nnvm.compiler.build_config(opt_level=2):
-    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype, params)
+    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype_dict, params)
 
+[neth, netw] = shape['data'][2:] # Current image shape is 608x608
 ######################################################################
 # Load a test image
 # --------------------------------------------------------------------
@@ -87,8 +90,7 @@
           test_image + '?raw=true'
 download(img_url, test_image)
 
-data = nnvm.testing.darknet.load_image(test_image, net.w, net.h)
-
+data = nnvm.testing.darknet.load_image(test_image, netw, neth)
 ######################################################################
 # Execute on TVM Runtime
 # ----------------------
@@ -105,24 +107,44 @@
 
 m.run()
 # get outputs
-out_shape = (net.outputs,)
-tvm_out = m.get_output(0).asnumpy().flatten()
+tvm_out = []
+if MODEL_NAME == 'yolov2':
+    layer_out = {}
+    layer_out['type'] = 'Region'
+    # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, background)
+    layer_attr = m.get_output(2).asnumpy()
+    layer_out['biases'] = m.get_output(1).asnumpy()
+    out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
+                 layer_attr[2], layer_attr[3])
+    layer_out['output'] = m.get_output(0).asnumpy().reshape(out_shape)
+    layer_out['classes'] = layer_attr[4]
+    layer_out['coords'] = layer_attr[5]
+    layer_out['background'] = layer_attr[6]
+    tvm_out.append(layer_out)
+
+elif MODEL_NAME == 'yolov3':
+    for i in range(3):
+        layer_out = {}
+        layer_out['type'] = 'Yolo'
+        # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
+        layer_attr = m.get_output(i*4+3).asnumpy()
+        layer_out['biases'] = m.get_output(i*4+2).asnumpy()
+        layer_out['mask'] = m.get_output(i*4+1).asnumpy()
+        out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
+                     layer_attr[2], layer_attr[3])
+        layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
+        layer_out['classes'] = layer_attr[4]
+        tvm_out.append(layer_out)
 
 # do the detection and bring up the bounding boxes
-thresh = 0.24
-hier_thresh = 0.5
+thresh = 0.5
+nms_thresh = 0.45
 img = nnvm.testing.darknet.load_image_color(test_image)
 _, im_h, im_w = img.shape
-probs = []
-boxes = []
-region_layer = net.layers[net.n - 1]
-boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(
-    region_layer, im_w, im_h, net.w, net.h,
-    thresh, probs, boxes, 1, tvm_out)
-
-boxes, probs = nnvm.testing.yolo2_detection.do_nms_sort(
-    boxes, probs,
-    region_layer.w*region_layer.h*region_layer.n, region_layer.classes, 0.3)
+dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh,
+                                                      1, tvm_out)
+last_layer = net.layers[net.n - 1]
+nnvm.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh)
 
 coco_name = 'coco.names'
 coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true'
@@ -136,8 +158,6 @@
 
 names = [x.strip() for x in content]
 
-nnvm.testing.yolo2_detection.draw_detections(
-    img, region_layer.w*region_layer.h*region_layer.n,
-    thresh, boxes, probs, names, region_layer.classes)
+nnvm.testing.yolo_detection.draw_detections(img, dets, thresh, names, last_layer.classes)
 plt.imshow(img.transpose(1, 2, 0))
 plt.show()

From d10db1ffcd24b62bddf64aafdcbeb74f8337943f Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 17 Oct 2018 06:57:41 +0530
Subject: [PATCH 231/529] [RELAY]Ops Dense, leaky_relu (#1828)

---
 docs/langref/relay_op.rst            |  4 ++
 include/tvm/relay/attrs/nn.h         | 25 +++++++
 python/tvm/relay/op/nn/nn.py         | 52 +++++++++++++++
 src/relay/op/nn/nn.cc                | 98 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level2.py | 42 ++++++++++++
 tests/python/relay/test_op_level3.py | 12 ++++
 6 files changed, 233 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 0b937f6636bf..d40346a9e836 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -51,6 +51,7 @@ This level enables typical convnet models.
 
    tvm.relay.nn.conv2d
    tvm.relay.nn.conv2d_transpose
+   tvm.relay.nn.dense
    tvm.relay.nn.max_pool2d
    tvm.relay.nn.avg_pool2d
    tvm.relay.nn.global_max_pool2d
@@ -70,6 +71,7 @@ This level enables additional math and transform operators.
    :nosignatures:
 
    tvm.relay.zeros
+   tvm.relay.nn.leaky_relu
    tvm.relay.zeros_like
    tvm.relay.ones
    tvm.relay.ones_like
@@ -137,6 +139,7 @@ Level 2 Definitions
 -------------------
 .. autofunction:: tvm.relay.nn.conv2d
 .. autofunction:: tvm.relay.nn.conv2d_transpose
+.. autofunction:: tvm.relay.nn.dense
 .. autofunction:: tvm.relay.nn.max_pool2d
 .. autofunction:: tvm.relay.nn.avg_pool2d
 .. autofunction:: tvm.relay.nn.global_max_pool2d
@@ -149,6 +152,7 @@ Level 2 Definitions
 
 Level 3 Definitions
 -------------------
+.. autofunction:: tvm.relay.nn.leaky_relu
 .. autofunction:: tvm.relay.floor
 .. autofunction:: tvm.relay.ceil
 .. autofunction:: tvm.relay.trunc
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 0be85d3d1bb9..c7b8695d1da5 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -202,6 +202,18 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
   }
 };
 
+
+/*! \brief Attributes for dense operator */
+struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
+  IndexExpr units;
+
+  TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") {
+    TVM_ATTR_FIELD(units)
+        .describe("Number of hidden units of the dense transformation.");
+  }
+};
+
+
 /*! \brief Attributes for upsampling operator */
 struct UpSamplingAttrs : public tvm::AttrsNode<UpSamplingAttrs> {
   int scale;
@@ -237,6 +249,18 @@ struct PadAttrs : public tvm::AttrsNode<PadAttrs> {
   }
 };
 
+
+/*! \brief Attributes for leaky relu operator */
+struct LeakyReluAttrs : public tvm::AttrsNode<LeakyReluAttrs> {
+  double alpha;
+
+  TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.LeakyReluAttrs") {
+    TVM_ATTR_FIELD(alpha).set_lower_bound(0.0).set_default(0.25)
+        .describe("Slope coefficient for the negative half axis.");
+  }
+};
+
+
 /*! \brief Attributes used in dropout operator */
 struct DropoutAttrs : public tvm::AttrsNode<DropoutAttrs> {
   double rate;
@@ -272,6 +296,7 @@ struct BatchNormAttrs : public tvm::AttrsNode<BatchNormAttrs> {
   }
 };  // struct BatchNormAttrs
 
+
 /*! \brief Attributes for LRN operator */
 struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
   IndexExpr size;
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 313c26da0234..51acd4bc38b6 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -430,6 +430,34 @@ def batch_flatten(data):
     """
     return _make.batch_flatten(data)
 
+
+def dense(data, weight, units=None):
+    """Dense operator.
+    Applies a linear transformation
+
+    .. math::
+
+    `Y = X * W`
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    weight : relay.Expr
+        The weight expressions.
+
+    units : int, optional
+        Number of hidden units of the dense transformation.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.dense(data, weight, units)
+
+
 def relu(data):
     """Rectified linear unit.
 
@@ -449,6 +477,30 @@ def relu(data):
     return _make.relu(data)
 
 
+def leaky_relu(data, alpha):
+    """This operator takes data as input and does Leaky version
+    of a Rectified Linear Unit.
+
+    .. math::
+
+        `y = x > 0 ? x : alpha * x`
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    alpha : float
+        Slope coefficient for the negative half axis.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.leaky_relu(data, alpha)
+
+
 def pad(data,
         pad_width,
         pad_value=0.0):
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 23dfe90eebf0..dc5ce2e567d0 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -15,6 +15,104 @@
 namespace tvm {
 namespace relay {
 
+TVM_REGISTER_NODE_TYPE(DenseAttrs);
+
+
+bool DenseRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const DenseAttrs* param = attrs.as<DenseAttrs>();
+  CHECK(param != nullptr);
+
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+
+  Array<tvm::Expr> oshape = data->shape;
+  if (param->units.defined()) {
+    Array<tvm::Expr> dshape = data->shape;
+
+    // validate the weight shape is proper if defined
+    // Assign weight type
+    Array<IndexExpr> wshape({dshape[dshape.size() - 1], param->units});
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+    oshape.Set((oshape.size() - 1), param->units);
+  } else {
+    if (weight == nullptr) return false;
+    Array<tvm::Expr> wshape = weight->shape;
+    oshape.Set((oshape.size() - 1), wshape[wshape.size() - 1]);
+  }
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create dense operator used by frontend FFI.
+Expr MakeDense(Expr data,
+               Expr weight,
+               IndexExpr units) {
+  auto attrs = make_node<DenseAttrs>();
+  attrs->units = units;
+  static const Op& op = Op::Get("nn.dense");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.dense")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeDense, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.dense")
+.describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
+
+- **data**: `(x1, x2, ..., xn, input_dim)`
+- **weight**: `(units, input_dim)`
+- **out**: `(x1, x2, ..., xn, units)`.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "nD Tensor", "Input data.")
+.add_argument("weight", "2D Tensor", "Weight matrix.")
+.set_support_level(2)
+.add_type_rel("Dense", DenseRel);
+
+
+// Positional relay function to create leaky relu operator used by frontend FFI.
+Expr MakeLeakyRelu(Expr data,
+                   double alpha) {
+  auto attrs = make_node<LeakyReluAttrs>();
+  attrs->alpha = alpha;
+  static const Op& op = Op::Get("nn.leaky_relu");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.leaky_relu")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeLeakyRelu, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.describe(R"code(Leaky version of a Rectified Linear Unit.
+
+`y = x > 0 ? x : alpha * x`
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "Input data.")
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel);
+
+
 TVM_REGISTER_API("relay.op.nn._make.softmax")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   auto make_func = [](Expr data, int axis) {
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index d0d02aece06d..4f37d4893b66 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -219,6 +219,47 @@ def test_pad_infer_type():
     ftype = func.checked_type
     assert ftype.ret_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
 
+def test_dense_infer_type():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    w = ib.param("w", relay.ty.TensorType((w, 2), "float32"))
+
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.dense(x, w, units=2))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, h, 2), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    wh, ww = tvm.var("wh"), tvm.var("ww")
+    w = ib.param("w", relay.ty.TensorType((wh, ww), "float32"))
+
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.dense(x, w))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, h, ww), "float32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+    w = ib.param("w", relay.ty.IncompleteType())
+
+    with ib.function(x, w) as func:
+        ib.ret(relay.nn.dense(x, w, units=2))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c, h, 2), "float32")
+
 
 if __name__ == "__main__":
     test_conv2d_infer_type()
@@ -227,3 +268,4 @@ def test_pad_infer_type():
     test_flatten_infer_type()
     test_pad_infer_type()
     test_conv2d_transpose_infer_type()
+    test_dense_infer_type()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 13ab483f936c..0605ac02339b 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -208,6 +208,17 @@ def test_full_like():
     ftype = func.checked_type
     assert ftype.ret_type == relay.TensorType((n, c, h, w), "float32")
 
+def test_infer_type_leaky_relu():
+   ib = relay.ir_builder.IRBuilder()
+   n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+   x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+
+   with ib.function(x) as func:
+       ib.ret(relay.nn.leaky_relu(x, alpha=0.1))
+   ib.ret(func)
+   func = relay.ir_pass.infer_type(ib.env, func.to_func())
+   ftype = func.checked_type
+   assert ftype.ret_type == relay.ty.TensorType((n, c, h, w), "float32")
 
 if __name__ == "__main__":
     test_single_op()
@@ -220,5 +231,6 @@ def test_full_like():
     test_take_infer_type()
     test_full()
     test_full_like()
+    test_infer_type_leaky_relu()
     test_squeeze_axes_infer_type()
     test_squeeze_default_axes_infer_type()

From 8aa1287c494a9a41633ef1d0e8cecfb59923f858 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Tue, 16 Oct 2018 18:48:28 -0700
Subject: [PATCH 232/529] Add javadoc build into Jenkins workflow (#1909)

---
 Makefile                          | 4 ++++
 docs/api_links.rst                | 3 ++-
 tests/scripts/task_python_docs.sh | 4 ++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6a9e3063de39..50048165bb8d 100644
--- a/Makefile
+++ b/Makefile
@@ -70,6 +70,10 @@ lint: cpplint pylint jnilint
 doc:
 	doxygen docs/Doxyfile
 
+javadoc:
+	# build artifact is in jvm/core/target/site/apidocs
+	cd jvm && mvn javadoc:javadoc
+
 # Cython build
 cython:
 	cd python; python setup.py build_ext --inplace
diff --git a/docs/api_links.rst b/docs/api_links.rst
index 909cfe367f29..d9b2406206b3 100644
--- a/docs/api_links.rst
+++ b/docs/api_links.rst
@@ -1,7 +1,8 @@
-Links to C++ and JS API References
+Links to API References
 ==================================
 
 This page contains links to API references that are build with different doc build system.
 
 * `C++ doyxgen API <doxygen/index.html>`_
 * `Javascript jsdoc API <jsdoc/index.html>`_
+* `Java Javadoc API <javadoc/index.html>`_
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 72c9fbf6c0dd..55ec0dddd45f 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -12,6 +12,10 @@ make doc
 jsdoc web/tvm_runtime.js web/README.md || exit -1
 mv out docs/_build/html/jsdoc || exit -1
 
+# Java doc
+make javadoc || exit -1
+mv jvm/core/target/site/apidocs docs/_build/html/javadoc || exit -1
+
 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
 
 cd docs

From 5a93701f7e57c82fab1bbe05c040551a5cff3e94 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Wed, 17 Oct 2018 08:49:41 -0700
Subject: [PATCH 233/529] [DOC]Clear javadoc directory everytime (#1917)

---
 tests/scripts/task_python_docs.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 55ec0dddd45f..2dfa68415f98 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 mkdir -p docs/_build/html
 rm -rf docs/_build/html/jsdoc
+rm -rf docs/_build/html/javadoc
 
 # remove stale tutorials and always build from scratch.
 rm -rf docs/tutorials

From f7269b35e375388b15c7a78b024118bd14cc1d88 Mon Sep 17 00:00:00 2001
From: Takeo Imai <takeo.bono@gmail.com>
Date: Thu, 18 Oct 2018 07:30:33 +0900
Subject: [PATCH 234/529] fix linting command instruction (#1919)

---
 docs/contribute/pull_request.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
index c83edc6cf7d1..039ef65c7b13 100644
--- a/docs/contribute/pull_request.rst
+++ b/docs/contribute/pull_request.rst
@@ -11,7 +11,9 @@ This is a quick guide to submit a pull request, please also refer to the detaile
     git fetch upstream
     git rebase upstream/master
 
-- Make sure code style check pass by typing ``make lint``, and all the existing test-cases pass.
+- Make sure code style check pass by typing the following command, and all the existing test-cases pass.
+  - ``docker/bash.sh tvmai/ci-lint ./tests/scripts/task_lint.sh``  
+     (Note: You must install docker beforehand so you can run a docker image.)
 - Add test-cases to cover the new features or bugfix the patch introduces.
 - Document the code you wrote, see more at :ref:`doc_guide`
 - Send the pull request,  fix the problems reported by automatic checks.

From 62089b29daf0de673ee2918ad524e9708af54f5d Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Thu, 18 Oct 2018 04:03:50 +0530
Subject: [PATCH 235/529] =?UTF-8?q?[NNVM/TOPI][OP]=20Split=20:=20default?=
 =?UTF-8?q?=20axis=20to=200=20and=20allow=20negative=20values=20-=20nump?=
 =?UTF-8?q?=E2=80=A6=20(#1883)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 nnvm/include/nnvm/top/tensor.h                |   2 +-
 nnvm/src/top/tensor/transform.cc              |  27 +++--
 .../tests/python/unittest/test_infer_shape.py |   4 +
 topi/python/topi/transform.py                 | 114 ++----------------
 topi/tests/python/test_topi_transform.py      |   2 +-
 5 files changed, 33 insertions(+), 116 deletions(-)

diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h
index 53ed5b3b0a22..18b937dbb7b0 100644
--- a/nnvm/include/nnvm/top/tensor.h
+++ b/nnvm/include/nnvm/top/tensor.h
@@ -43,7 +43,7 @@ struct SplitParam : public dmlc::Parameter<SplitParam> {
   DMLC_DECLARE_PARAMETER(SplitParam) {
     DMLC_DECLARE_FIELD(indices_or_sections)
         .describe("Number of outputs to be splitted");
-    DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1)
+    DMLC_DECLARE_FIELD(axis).set_default(1)
         .describe("the axis to be splitted.");
   }
 };
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 270172856a75..a8159b539410 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -344,14 +344,23 @@ inline bool SplitInferShape(const NodeAttrs& attrs,
   const TShape& dshape = (*in_shape)[0];
   if (dshape.ndim() == 0) return false;
 
+  auto axis = param.axis;
+  if (axis < 0) {
+    axis += dshape.ndim();
+  }
+  CHECK_LT(axis, dshape.ndim())
+    << "axis should be within input dimension range but got " <<  axis;
+  CHECK_GT(axis, -1)
+    << "axis should be within input dimension range but got " <<  axis;
+
   if (param.equal_split) {
     int num_outputs = param.indices_or_sections[0];
     CHECK_EQ(out_shape->size(), static_cast<size_t>(num_outputs));
-    CHECK_LT(param.axis, dshape.ndim());
     TShape oshape = dshape;
-    CHECK_EQ(oshape[param.axis] % num_outputs, 0)
-        << "indices_or_sections need to be able to divide input.shape[axis]";
-    oshape[param.axis] /= num_outputs;
+    CHECK_EQ(oshape[axis] % num_outputs, 0)
+        << "indices_or_sections need to be able to divide input.shape[axis] got sections "
+        << num_outputs << " and dimension " << oshape[axis];
+    oshape[axis] /= num_outputs;
 
     for (size_t i = 0; i < out_shape->size(); ++i) {
       NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, i, oshape);
@@ -359,19 +368,19 @@ inline bool SplitInferShape(const NodeAttrs& attrs,
   } else {
     dim_t num_outputs = param.indices_or_sections.ndim() + 1;
     CHECK_EQ(out_shape->size(), static_cast<size_t>(num_outputs));
-    CHECK_LT(param.axis, dshape.ndim());
     TShape oshape = dshape;
     dim_t begin = 0;
     for (dim_t i = 0; i < num_outputs - 1; ++i) {
       CHECK_GT(param.indices_or_sections[i], begin)
-          << "indices_or_sections need to be a sorted ascending list";
-      oshape[param.axis] = param.indices_or_sections[i] - begin;
+          << "indices_or_sections need to be a sorted ascending list got "
+          << param.indices_or_sections;
+      oshape[axis] = param.indices_or_sections[i] - begin;
       begin = param.indices_or_sections[i];
       NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, i, oshape);
     }
-    CHECK_LT(begin, dshape[param.axis])
+    CHECK_LT(begin, dshape[axis])
         << "The sum of sections must match the input.shape[axis]";
-    oshape[param.axis] = dshape[param.axis] - begin;
+    oshape[axis] = dshape[axis] - begin;
     NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, num_outputs - 1, oshape);
   }
   return true;
diff --git a/nnvm/tests/python/unittest/test_infer_shape.py b/nnvm/tests/python/unittest/test_infer_shape.py
index 51e0e9576781..eee8c3bdcacb 100644
--- a/nnvm/tests/python/unittest/test_infer_shape.py
+++ b/nnvm/tests/python/unittest/test_infer_shape.py
@@ -84,6 +84,10 @@ def test_split():
     sdict = infer_shape(z)
     assert(sdict["y"][0] == [10, 10])
     assert(sdict["y"][1] == [10, 10])
+    z = sym.split(x1, indices_or_sections=[6], axis=-1, name="y")
+    sdict = infer_shape(z)
+    assert(sdict["y"][0] == [10, 6])
+    assert(sdict["y"][1] == [10, 14])
 
 
 def test_batchnorm():
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index eb3f9bad1095..311b0facabdb 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -4,7 +4,6 @@
 import tvm
 import topi
 from . import tag
-from .util import ravel_index, unravel_index, get_const_int, get_const_tuple
 from . import cpp
 
 @tvm.tag_scope(tag=tag.BROADCAST)
@@ -23,12 +22,7 @@ def expand_dims(a, axis, num_newaxis=1):
     -------
     ret : tvm.Tensor
     """
-    axis = len(a.shape) + axis + 1 if axis < 0 else axis
-    new_shape = a.shape[:axis] + ([1] * num_newaxis) + a.shape[axis:]
-    def _compute(*indices):
-        idx = indices[:axis] + indices[axis + num_newaxis:]
-        return a(*idx)
-    return tvm.compute(new_shape, _compute)
+    return cpp.expand_dims(a, axis, num_newaxis)
 
 
 @tvm.tag_scope(tag=tag.BROADCAST)
@@ -101,15 +95,8 @@ def transpose(a, axes=None):
     -------
     ret : tvm.Tensor
     """
-    ndim = len(a.shape)
-    axes = axes if axes else tuple(reversed(range(ndim)))
-    new_shape = [a.shape[x] for x in axes]
-    def _compute(*indices):
-        idx = [1] * len(axes)
-        for i, k in enumerate(axes):
-            idx[k] = indices[i]
-        return a(*idx)
-    return tvm.compute(new_shape, _compute)
+    return cpp.transpose(a, axes)
+
 
 def flip(a, axis=0):
     """Flip/reverse elements of an array in a particular axis.
@@ -153,6 +140,7 @@ def strided_slice(a, begin, end, strides=None):
     """
     return cpp.strided_slice(a, begin, end, strides)
 
+
 @tvm.tag_scope(tag=tag.INJECTIVE)
 def reshape(a, newshape):
     """Reshape the array
@@ -168,10 +156,7 @@ def reshape(a, newshape):
     -------
     ret : tvm.Tensor
     """
-    ndim = len(a.shape)
-    a_shape = [a.shape[i] for i in range(ndim)]
-    return tvm.compute(newshape,
-                       lambda *indices: a(*unravel_index(ravel_index(indices, newshape), a_shape)))
+    return cpp.reshape(a, newshape)
 
 
 @tvm.tag_scope(tag=tag.INJECTIVE)
@@ -190,41 +175,7 @@ def squeeze(a, axis=None):
     -------
     squeezed : tvm.Tensor
     """
-    a_ndim = len(a.shape)
-    a_shape = get_const_tuple(a.shape)
-    if axis is None:
-        axis = []
-        for i, ele in enumerate(a_shape):
-            if ele == 1:
-                axis.append(i)
-    else:
-        if isinstance(axis, int):
-            axis = axis + a_ndim if axis < 0 else axis
-            assert a_shape[axis] == 1
-            axis = [axis]
-        else:
-            axis = [ele + a_ndim if ele < 0 else ele for ele in axis]
-            for ele in axis:
-                assert a_shape[ele] == 1
-    out_shape = []
-    search_axis = set(axis)
-    for i, a_dim in enumerate(a_shape):
-        if i not in search_axis:
-            out_shape.append(a_dim)
-    if not out_shape:
-        out_shape.append(1)
-    def _compute(*indices):
-        real_indices = []
-        flag = 0
-        for i in range(a_ndim):
-            if i not in search_axis:
-                real_indices.append(indices[i - flag])
-            else:
-                real_indices.append(0)
-                flag += 1
-        return a(*real_indices)
-
-    return tvm.compute(out_shape, _compute)
+    return cpp.squeeze(a, axis)
 
 
 @tvm.tag_scope(tag=tag.INJECTIVE)
@@ -243,25 +194,7 @@ def concatenate(a_tuple, axis=0):
     -------
     ret : tvm.Tensor
     """
-    assert isinstance(a_tuple, (list, tuple))
-    if axis < 0:
-        axis += len(a_tuple[0].shape)
-    assert axis < len(a_tuple[0].shape)
-    axis_sizes = [a_tuple[i].shape[axis] for i in range(len(a_tuple))]
-    out_shape = [a_tuple[0].shape[i] for i in range(0, axis)] + [sum(axis_sizes)]\
-                + [a_tuple[0].shape[i] for i in range(axis + 1, len(a_tuple[0].shape))]
-    out_shape[axis] = tvm.ir_pass.Simplify(out_shape[axis])
-
-    def _compute(*indices):
-        ret = a_tuple[0](*indices)
-        ind = indices[axis]
-        for i in range(len(a_tuple) - 1):
-            ind -= axis_sizes[i]
-            ret = tvm.select(ind >= 0,
-                             a_tuple[i + 1](*(indices[0:axis] + (ind,) + indices[axis + 1:])),
-                             ret)
-        return ret
-    return tvm.compute(out_shape, _compute)
+    return cpp.concatenate(a_tuple, axis)
 
 
 @tvm.tag_scope(tag=tag.INJECTIVE)
@@ -280,37 +213,7 @@ def split(ary, indices_or_sections, axis=0):
     -------
     ret : tuple of tvm.Tensor
     """
-    def _compute(begin, *indices):
-        real_indices = indices[:axis] + (indices[axis] + begin, ) + indices[axis + 1:]
-        return ary(*real_indices)
-
-    if axis < 0:
-        axis += len(ary.shape)
-    src_axis_size = get_const_int(ary.shape[axis])
-    if isinstance(indices_or_sections, int):
-        assert indices_or_sections > 0
-        assert src_axis_size % indices_or_sections == 0
-        seg_size = src_axis_size // indices_or_sections
-        begin_ids = [seg_size * i for i in range(indices_or_sections)]
-    elif isinstance(indices_or_sections, (tuple, list)):
-        assert tuple(indices_or_sections) == tuple(sorted(indices_or_sections)),\
-            "Should be sorted, recieved %s" % str(indices_or_sections)
-        begin_ids = [0] + list(indices_or_sections)
-    else:
-        raise NotImplementedError()
-    out_shapes = []
-    for i in range(len(begin_ids)):
-        if i == len(begin_ids) - 1:
-            out_axis_size = src_axis_size - begin_ids[i]
-        else:
-            out_axis_size = begin_ids[i + 1] - begin_ids[i]
-        out_shapes.append([ary.shape[i] for i in range(axis)] + [out_axis_size] +\
-                          [ary.shape[i] for i in range(axis + 1, len(ary.shape))])
-    # pylint: disable=cell-var-from-loop
-    return [tvm.compute(out_shape,
-                        lambda *indices: _compute(begin_id, *indices), name="s%d" %i)
-            for i, (out_shape, begin_id) in enumerate(zip(out_shapes, begin_ids))]
-    # pylint: enable=cell-var-from-loop
+    return cpp.split(ary, indices_or_sections, axis)
 
 
 def take(a, indices, axis=None):
@@ -336,6 +239,7 @@ def take(a, indices, axis=None):
         return cpp.take(a, indices)
     return cpp.take(a, indices, int(axis))
 
+
 def matmul(a, b, transp_a=False, transp_b=False):
     """
     Creates an operation that calculates a matrix multiplication (row-major notation):
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index ce2505e0d8f6..8da7f0828c2f 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -139,7 +139,7 @@ def check_device(device):
         with tvm.target.create(device):
             s = topi.generic.schedule_injective(tensor_l)
 
-        foo = tvm.build(s, [A] + tensor_l, device, name="split")
+        foo = tvm.build(s, [A] + list(tensor_l), device, name="split")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npys = np.split(data_npy, indices_or_sections, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)

From 6308ff53d302fa559b50f75f0f5e7b19f105f87f Mon Sep 17 00:00:00 2001
From: Takeo Imai <takeo.bono@gmail.com>
Date: Thu, 18 Oct 2018 07:34:41 +0900
Subject: [PATCH 236/529] Fix docstring of ConcatRel (#1912)

---
 src/relay/op/type_relations.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h
index 2c34c8bc8949..f6e94e24caa9 100644
--- a/src/relay/op/type_relations.h
+++ b/src/relay/op/type_relations.h
@@ -73,13 +73,10 @@ bool BroadcastCompRel(const Array<Type>& types,
                       const TypeReporter& reporter);
 
 /*!
- * \brief The The concat relation, implements the broadcasting
- *  rule over the two input types producing the broadcasted type.
- *
- * This differs from BroadcastRel in the return dtype,
- * it instead returns bool(uint8), for use in comparsion operators
- * such as equal, not_equal, lt, and so on.
- *
+ * \brief The concat type relation, implements the concatenating
+ *  rule over the list of input types producing one concatenated
+ *  type.
+ * 
  * \param types The input and output types to the relation.
  * \param num_inputs The number of input arguments.
  * \param attrs The attributes

From 9013d2f4b286b59cf6a5e2a4b7ee085e8df80c8f Mon Sep 17 00:00:00 2001
From: Nick Hynes <nhynes@berkeley.edu>
Date: Wed, 17 Oct 2018 15:34:58 -0700
Subject: [PATCH 237/529] Update Jenkinsfile (#1893)

---
 Jenkinsfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index e12ff3558ed1..f63e7d0f396e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -139,6 +139,7 @@ stage('Build') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_rust.sh"
         }
       }
     }

From 99755d6feba0a092fef11167a25a6e3db67e4a0a Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Wed, 17 Oct 2018 18:20:40 -0700
Subject: [PATCH 238/529] AutoTVM x86 (#1772)

* AutoTVM for x86 conv2d

* Add ApplyGraphBest dispatch context

* Fix tutorial

* Fix conv2d

* Improve tutorial

* Fix default schedule

* Fix 1x1 default schedule loading

* Fix workload type

* Change gridsearch to random

* Add reference to autotvm arm

* Merge conv2d common and 1x1 decl

* Fix lint

* Minor fix
---
 python/tvm/autotvm/__init__.py            |   3 +-
 python/tvm/autotvm/task/__init__.py       |   2 +-
 python/tvm/autotvm/task/dispatcher.py     |  80 ++++
 topi/python/topi/x86/conv2d.py            | 512 +++++++++++++++++-----
 topi/python/topi/x86/conv2d_avx_1x1.py    | 153 +++----
 topi/python/topi/x86/conv2d_avx_common.py | 163 +++----
 tutorials/autotvm/tune_nnvm_x86.py        | 220 ++++++++++
 7 files changed, 815 insertions(+), 318 deletions(-)
 create mode 100644 tutorials/autotvm/tune_nnvm_x86.py

diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 7170dbdd8565..08cfbb2a95da 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -27,5 +27,6 @@
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
     register_topi_compute, register_topi_schedule, \
-    DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best
+    DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best, \
+    ApplyGraphBest as apply_graph_best
 from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 8efb0e61b518..04bcec92fd57 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -10,7 +10,7 @@
 from .space import ConfigSpace, ConfigEntity
 from .code_hash import attach_code_hash, attach_code_hash_to_arg
 from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \
-    FallbackContext, clear_fallback_cache
+    FallbackContext, clear_fallback_cache, ApplyGraphBest
 
 from .topi_integration import register_topi_compute, register_topi_schedule
 from .nnvm_integration import extract_from_graph, extract_from_multiple_graph
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 8e159cc412c9..164877e3b451 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -345,3 +345,83 @@ def clear_fallback_cache(target, workload):
     while not isinstance(context, FallbackContext):
         context = context._old_ctx
     context.clear_cache(target, workload)
+
+class ApplyGraphBest(DispatchContext):
+    """Load the graph level tuning optimal schedules.
+
+    The input records should be in the ascending order of
+    node index for target operator. Usually this can be obtained
+    with graph tuner.
+
+    This context maintains an internal counter to indicate the current
+    node index.
+    """
+    def __init__(self, records):
+        """
+        Parameters
+        ----------
+        records : str or iterator of (MeasureInput, MeasureResult)
+            Collection of tuning records.
+            If is str, then it should be the filename of a records log file.
+                   Each row of this file is an encoded record pair.
+            Otherwise, it is an iterator.
+        """
+        from ..record import load_from_file
+
+        super(ApplyGraphBest, self).__init__()
+        if isinstance(records, str):
+            records = load_from_file(records)
+        self._records = list(records)
+        self._counter = 0
+        self._global_cfg_dict = {}
+
+    def _query_inside(self, target, workload):
+        """
+        Query the context to get config from records.
+
+        Parameters
+        ----------
+        target : Target
+            The current target
+        workload : Workload
+            The current workload.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        cfg = self._records[self._counter][0].config
+        self._counter += 1
+        return cfg
+
+    def query_global_dict(self, key):
+        """
+        Query the context to get config from global
+        config dictionary.
+
+        Parameters
+        ----------
+        key : str
+            Key to query the config.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        return self._global_cfg_dict[key]
+
+    def update_global_dict(self, key, val):
+        """
+        Update the global config dictionary.
+
+        Parameters
+        ----------
+        key : str
+            Key of config.
+
+        val : ConfigSpace
+            Value of config.
+        """
+        self._global_cfg_dict[key] = val
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 6fe59a909510..f766d827686d 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -1,12 +1,17 @@
-# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
 """Conv2D schedule on x86"""
 import tvm
+from tvm import autotvm
+from tvm.autotvm.task.dispatcher import ApplyGraphBest
+from tvm.autotvm.task.nnvm_integration import deserialize_args
+from tvm.autotvm.task import register, get_config
 from .. import generic, tag
 from .. import nn
-from ..nn.util import infer_pad, infer_stride
+from ..util import get_const_tuple
 from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \
-    _get_workload, _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \
+    _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \
     _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload
+from ..nn.pad import pad
 
 from . import conv2d_avx_1x1, conv2d_avx_common
 from .conv2d_avx_common import AVXConvCommonFwd
@@ -194,103 +199,164 @@ def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout):
 def _get_alter_layout_schedule_x86(wkl):
     return _get_schedule_conv(wkl)
 
-@conv2d.register("cpu")
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    _AVX_SCH_TO_DECL_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._declaration_conv,
-        AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv
-    }
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-    target = tvm.target.current_target(allow_none=False)
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    if layout == 'NCHW':
-        sch = _get_schedule(wkl)
-        return _AVX_SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype)
-    elif layout == 'HWCN':
-        return nn.conv2d_hwcn(data, kernel, stride, padding, out_dtype)
-    elif layout == 'NHWC':
-        return nn.conv2d_nhwc(data, kernel, stride, padding, out_dtype)
+
+def _get_fp32_len():
+    fp32_vec_len = 8
+    target = tvm.target.current_target()
+    if target is not None:
+        for opt in target.options:
+            if opt == '-mcpu=skylake-avx512':
+                fp32_vec_len = 16
+    return fp32_vec_len
+
+
+def _get_default_sch(workload):
+    fp32_vec_len = _get_fp32_len()
+    _, _, kh, kw, _ = workload[2]
+    is_kernel_1x1 = kh == 1 and kw == 1
+    if is_kernel_1x1:
+        cfg = conv2d_avx_1x1._fallback_schedule(workload, fp32_vec_len)
     else:
-        raise ValueError("not support this layout {} yet".format(layout))
+        cfg = conv2d_avx_common._fallback_schedule(workload, fp32_vec_len)
+    return cfg
 
 
-@conv2d_alter_layout.register("cpu")
-def _alter_conv2d_layout(attrs, inputs, tinfos):
-    import nnvm.symbol as sym
-    copy_inputs = [s for s in inputs]
-    new_attrs = {k : attrs[k] for k in attrs.keys()}
-    # only optimize for NCHW, groups=1 conv
-    if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
-        return None
+def _create_schedule_template(cfg, data, kernel, strides, padding, layout):
+    """Create schedule configuration from input arguments"""
+    dshape = get_const_tuple(data.shape)
+    kshape = get_const_tuple(kernel.shape)
+    if layout == 'NCHW':
+        n, ic, h, w = dshape
+        oc, _, kh, kw = kshape
+    else:
+        raise ValueError("Not support this layout {} with "
+                         "schedule template.".format(layout))
+    is_kernel_1x1 = kh == 1 and kw == 1
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    oh = (h - kh + 2 * ph) // sh + 1
+    ow = (w - kw + 2 * pw) // sw + 1
+
+    # Create schedule config
+    cfg.define_split("tile_ic", ic, num_outputs=2)
+    cfg.define_split("tile_oc", oc, num_outputs=2)
+    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    if is_kernel_1x1:
+        cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
+    else:
+        cfg.define_knob("unroll_kw", [True, False])
 
-    data = tinfos[0]
-    kernel = tinfos[1]
 
-    import ast
-    padding = ast.literal_eval(attrs['padding'])
-    stride = ast.literal_eval(attrs['strides'])
+def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
+    """convert argument to workload"""
+    if len(kernel.shape) == 4:
+        raw_kernel = kernel
+    else:  # the input kernel is transformed by alter_op_layout
+        shape = get_const_tuple(kernel.shape)
+        raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
+                                     dtype=kernel.dtype)
+    return ('conv2d', ) + autotvm.task.args_to_workload(
+        [data, raw_kernel, strides, padding, layout, out_dtype])
 
-    wkl = _get_workload(data, kernel, stride, padding, data.dtype)
-    sch = _get_alter_layout_schedule(wkl)
-    is_kernel_1x1 = isinstance(sch, AVXConv1x1Fwd)
-    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn
 
-    new_attrs['layout'] = 'NCHW%dc' % ic_bn
-    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+@conv2d.register("cpu")
+@autotvm.task.dispatcher
+def conv2d_x86(data, kernel, strides, padding, layout, out_dtype):
+    """x86 conv2d declaration."""
+    return conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
-    if is_kernel_1x1:
-        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
-        new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn)
+
+@conv2d_x86.register(["direct"])
+def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype):
+    out_dtype = data.dtype if out_dtype is None else out_dtype
+    padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    if layout == 'NCHW':
+        _create_schedule_template(cfg, data, kernel, strides, padding, layout)
+        if cfg.is_fallback:
+            workload = conv_arg_to_workload(data, kernel, strides, padding,
+                                            layout, out_dtype)
+            cfg = _get_default_sch(workload)
+        args = [cfg, data, kernel, strides, padding, layout, out_dtype]
+        return _declaration_conv_impl(*args)
+    elif layout == 'HWCN':
+        return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype)
+    elif layout == 'NHWC':
+        return nn.conv2d_nhwc(data, kernel, strides, padding, out_dtype)
     else:
-        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+        raise ValueError("not support this layout {} yet".format(layout))
 
-    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
+def _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtype):
+    out_dtype = data.dtype if out_dtype is None else out_dtype
+    assert layout == 'NCHW', "only support NCHW convolution for AVX"
 
+    HPAD, WPAD = padding
+    HSTR, WSTR = strides
 
-@conv2d_NCHWc.register("cpu")
-def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride,
-                            padding, layout, out_layout, out_dtype):
-    _AVX_SCH_TO_DECL_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc,
-        AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc
-    }
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
 
-    # Use int8 schedules if the input data is of int8 dtype
-    if data.dtype == 'uint8':
-        _AVX_SCH_TO_DECL_FUNC = {
-            AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc_int8,
-            AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc_int8
-        }
+    pad_height = in_height + 2 * HPAD
+    pad_width = in_width + 2 * WPAD
 
-    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
-    ic = ic_chunk * ic_block
-    kh, kw = kernel_size
-    if data.dtype == 'uint8':
-        wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
-                                 tvm.placeholder((num_filter, ic, kh, kw),
-                                                 dtype=kernel.dtype),
-                                 stride, padding, out_dtype)
-        sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
-    else:
-        wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
-                            tvm.placeholder((num_filter, ic, kh, kw),
-                                            dtype=kernel.dtype),
-                            stride, padding, out_dtype)
-        sch = _get_schedule_NCHWc(wkl, layout, out_layout)
-    return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel)
+    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
+    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
 
-
-@generic.schedule_conv2d_nchw.register(["cpu"])
-def schedule_conv2d(outs):
+    # pack data
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
+    else:
+        data_pad = data
+
+    # fetch schedule
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+
+    shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width)
+    data_vec = tvm.compute(shape,
+                           lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w],
+                           name='data_vec')
+
+    # pack kernel
+    shape = (num_filter//oc_bn, in_channel//ic_bn,
+             kernel_height, kernel_width, ic_bn, oc_bn)
+    kernel_vec = tvm.compute(shape,
+                             lambda CO, CI, h, w, ci, co:
+                             kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w],
+                             name='kernel_vec')
+
+    # convolution
+    oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, num_filter, out_height, out_width)
+
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh, ic%ic_bn,
+                                        ow*WSTR+kw].astype(out_dtype) *
+                               kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn,
+                                          oc_block].astype(out_dtype),
+                               axis=[ic, kh, kw]), name='conv')
+
+    unpack = tvm.compute(unpack_shape,
+                         lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn]
+                         .astype(out_dtype),
+                         name='output_unpack',
+                         tag='conv2d_nchw',
+                         attrs={'workload':
+                                    conv_arg_to_workload(data, kernel, strides,
+                                                         padding, layout,
+                                                         out_dtype)})
+    return unpack
+
+
+@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct'])
+def schedule_conv2d(cfg, outs):
     """Create schedule for tensors"""
-    _AVX_SCH_TO_SCH_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._schedule_conv,
-        AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv
-    }
     s = tvm.create_schedule([x.op for x in outs])
-    target = tvm.target.current_target(allow_none=False)
     scheduled_ops = []
 
     def traverse(op):
@@ -316,16 +382,25 @@ def traverse(op):
             if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
-            padding = infer_pad(data, data_pad)
-            if data_pad is None:
-                stride = infer_stride(data, kernel, output)
-            else:
-                stride = infer_stride(data_pad, kernel, output)
 
-            wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-            sch = _get_schedule(wkl)
-            _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec,
-                                            kernel, kernel_vec, conv_out, output, outs[0])
+            _, _, kh, kw = get_const_tuple(kernel.shape)
+            is_kernel_1x1 = kh == 1 and kw == 1
+            current_cfg = cfg
+            if cfg.is_fallback:
+                workload_attr = op.attrs["workload"]
+                strides = (int(workload_attr[3][0].value), int(workload_attr[3][1].value))
+                padding = (int(workload_attr[4][0].value), int(workload_attr[4][1].value))
+                layout = workload_attr[5].value
+                out_dtype = workload_attr[6].value
+                workload = conv_arg_to_workload(data, kernel, strides, padding,
+                                                layout, out_dtype)
+                current_cfg = _get_default_sch(workload)
+            args = [s, current_cfg, data, data_pad, data_vec, kernel_vec, conv_out,
+                    output, outs[0]]
+            if is_kernel_1x1:
+                conv2d_avx_1x1._schedule_conv(*args)
+            else:
+                conv2d_avx_common._schedule_conv(*args)
 
         scheduled_ops.append(op)
 
@@ -333,7 +408,7 @@ def traverse(op):
     return s
 
 
-@generic.schedule_conv2d_nhwc.register(["cpu"])
+@generic.schedule_conv2d_nhwc.register("cpu")
 def schedule_conv2d_nhwc(outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -388,12 +463,223 @@ def traverse(op):
     return s
 
 
-@generic.schedule_conv2d_NCHWc.register(["cpu"])
-def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding,
+# Define template function for autotvm task
+# We define schedule template in this function instead of
+# declaration function since actual input arguments need
+# to be altered by the schedule selected.
+@register("topi_x86_conv2d_NCHWc")
+def _topi_nn_conv2d_NCHWc(*args, **kwargs):
+    assert not kwargs, "Do not support kwargs in template function call"
+    args = deserialize_args(args)
+    data, kernel = args[:2]
+    strides = args[4]
+    padding = args[5]
+    layout = args[6]
+    raw_data_shape = get_const_tuple(data.shape)
+    raw_kernel_shape = get_const_tuple(kernel.shape)
+
+    # get config here
+    cfg = get_config()
+    _create_schedule_template(cfg, data, kernel, strides, padding, layout)
+
+    # change shape with the value in config
+    ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                           cfg["tile_ow"].size[-1])
+    new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn,
+                      raw_data_shape[2], raw_data_shape[3], ic_bn)
+    data_layout = "NCHW%dc" % ic_bn
+    out_layout = "NCHW%dc" % oc_bn
+    new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn,
+                        raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn)
+    args[0] = tvm.placeholder(new_data_shape, data.dtype)
+    args[1] = tvm.placeholder(new_kernel_shape, kernel.dtype)
+    args[6] = data_layout
+    args[7] = out_layout
+
+    C = _declaration_conv_NCHWc(cfg, *args, **kwargs)
+    s = _schedule_conv2d_NCHWc(cfg, args[2], args[3], args[4], args[5],
+                               args[6], args[7], [C])
+    return s, [args[0], args[1], C]
+
+
+def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
+                               padding, layout, out_layout, out_dtype):
+    """convert argument to workload"""
+    dshape = get_const_tuple(data.shape)
+    kshape = get_const_tuple(kernel.shape)
+    if len(dshape) > 4:
+        raw_data = tvm.placeholder((dshape[0], dshape[1] * dshape[4], dshape[2],
+                                    dshape[3]), dtype=kernel.dtype)
+    else:
+        raw_data = data
+    if len(kshape) > 4:
+        raw_kernel = tvm.placeholder((kshape[0] * kshape[5], kshape[1] * kshape[4],
+                                      kshape[2], kshape[3]), dtype=kernel.dtype)
+    else:
+        raw_kernel = kernel
+    return ('conv2d_NCHWc', ) + autotvm.task.args_to_workload(
+        [raw_data, raw_kernel, strides, padding, layout, out_layout,
+         out_dtype])
+
+
+def _query_dispatcher(workload, in_alter_op=False):
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    if isinstance(dispatch_ctx, ApplyGraphBest):
+        if in_alter_op:
+            cfg = dispatch_ctx.query(None, None)
+        else:
+            cfg = dispatch_ctx.query_global_dict(workload)
+    else:
+        target = tvm.target.current_target()
+        cfg = dispatch_ctx.query(target, workload)
+        if cfg.is_fallback:
+            cfg = _get_default_sch(workload)
+    return cfg
+
+
+@conv2d_alter_layout.register("cpu")
+def _alter_conv2d_layout(attrs, inputs, tinfo):
+    import nnvm.symbol as sym
+    copy_inputs = [s for s in inputs]
+    new_attrs = {k : attrs[k] for k in attrs.keys()}
+    data, kernel = tinfo[0], tinfo[1]
+    # only optimize for NCHW, groups=1 conv
+    if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
+        return None
+
+    kernel_size = attrs.get_int_tuple("kernel_size")
+    padding = attrs.get_int_tuple("padding")
+    strides = attrs.get_int_tuple("strides")
+    layout = attrs['layout']
+    out_layout = layout if attrs["out_layout"] == "__undef__" else attrs["out_layout"]
+
+    dtype = data.dtype
+    out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
+    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
+                                          padding, layout, out_layout, out_dtype)
+    cfg = _query_dispatcher(workload, True)
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    new_attrs['layout'] = 'NCHW%dc' % ic_bn
+    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+
+    # Store global schedule dictionary for ApplyGraphBest dispatcher
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    if isinstance(dispatch_ctx, ApplyGraphBest):
+        workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
+                                              padding, new_attrs['layout'],
+                                              new_attrs['out_layout'], out_dtype)
+        global_dict_key = workload
+        dispatch_ctx.update_global_dict(global_dict_key, cfg)
+
+    # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
+    new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+
+    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
+
+
+@conv2d_NCHWc.register("cpu")
+def conv2d_NCHWc_cpu(data, kernel, num_filter, kernel_size, strides,
+                     padding, layout, out_layout, out_dtype):
+    """x86 conv2d_NCHWc declaration."""
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    if not isinstance(dispatch_ctx, ApplyGraphBest):
+        layout = out_layout = "NCHW"
+    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
+                                          padding, layout, out_layout, out_dtype)
+    cfg = _query_dispatcher(workload)
+    return _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides,
+                                   padding, layout, out_layout, out_dtype)
+
+
+def _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides,
+                            padding, layout, out_layout, out_dtype):
+    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
+    ic = ic_chunk * ic_block
+    kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \
+        (kernel_size, kernel_size)
+    is_kernel_1x1 = kh == 1 and kw == 1
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+
+    if data.dtype == 'uint8':
+        wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
+                                 tvm.placeholder((num_filter, ic, kh, kw),
+                                                 dtype=kernel.dtype),
+                                 strides, padding, out_dtype)
+        sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
+        return conv2d_avx_1x1._declaration_conv_NCHWc_int8(wkl, sch, data, kernel) \
+            if is_kernel_1x1 \
+            else conv2d_avx_common._declaration_conv_NCHWc_int8(wkl, sch, data, kernel)
+
+    args = [cfg, data, kernel, (kh, kw), (sh, sw), (ph, pw), layout, out_layout, out_dtype]
+    return _declaration_conv_NCHWc_impl(*args)
+
+
+def _declaration_conv_NCHWc_impl(cfg, data, kernel, kernel_size, strides, padding, layout,
+                                 out_layout, out_dtype):
+    HPAD, WPAD = padding
+    HSTR, WSTR = strides
+
+    n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape)
+    ic = ic_chunk * ic_block
+    kh, kw = kernel_size
+    oc_chunk, _, _, _, _, oc_block = get_const_tuple(kernel.shape)
+    oc = oc_chunk * oc_block
+    oh = (ih + 2 * HPAD - kh) // HSTR + 1
+    ow = (iw + 2 * WPAD - kw) // WSTR + 1
+
+    # DOPAD
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
+    else:
+        data_pad = data
+
+    # fetch schedule
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    if ic_bn != ic_block:
+        raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d."
+                           % (ic_bn, ic_block))
+    if oc_bn != oc_block:
+        raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d."
+                           % (oc_bn, oc_block))
+
+    # convolution
+    oshape = (n, oc//oc_bn, oh, ow, oc_bn)
+
+    ic = tvm.reduce_axis((0, ic), name='ic')
+    kh = tvm.reduce_axis((0, kernel_size[0]), name='kh')
+    kw = tvm.reduce_axis((0, kernel_size[1]), name='kw')
+
+    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size,
+                                          strides, padding, layout,
+                                          out_layout, out_dtype),
+    attrs = {'workload': workload}
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw,
+                                        ic%ic_bn].astype(out_dtype) *
+                               kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block],
+                               axis=[ic, kh, kw]),
+                       name='conv2d_NCHWc', tag="conv2d_NCHWc", attrs=attrs)
+    return conv
+
+
+@generic.schedule_conv2d_NCHWc.register("cpu")
+def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, padding,
                           layout, out_layout, outs):
+    """x86 conv2d_NCHWc schedule"""
+    return _schedule_conv2d_NCHWc(None, num_filter, kernel_size, strides, padding,
+                                  layout, out_layout, outs)
+
+
+def _schedule_conv2d_NCHWc(cfg, num_filter, kernel_size, strides, padding,
+                           layout, out_layout, outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    if not isinstance(dispatch_ctx, ApplyGraphBest):
+        layout = out_layout = "NCHW"
 
     def traverse(op):
         """Traverse operators from computation graph"""
@@ -416,18 +702,9 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            _AVX_SCH_TO_SCH_FUNC = {
-                AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc,
-                AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc
-            }
-
-            # Use int8 schedules if the input data is of int8 dtype
-            if data.dtype == 'uint8':
-                _AVX_SCH_TO_SCH_FUNC = {
-                    AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8,
-                    AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8
-                }
-
+            kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \
+                (kernel_size, kernel_size)
+            is_kernel_1x1 = kh == 1 and kw == 1
             n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
             ic = ic_chunk * ic_block
             original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype)
@@ -435,16 +712,27 @@ def traverse(op):
             kh, kw = kernel_size
             original_kernel = tvm.placeholder((num_filter, ic, kh, kw),
                                               dtype=kernel.dtype)
-
             if data.dtype == 'uint8':
                 wkl = _get_workload_int8(original_data, original_kernel,
-                                         stride, padding, conv_out.dtype)
+                                         strides, padding, conv_out.dtype)
                 sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
+                args = [s, wkl, sch, data_vec, kernel, conv_out, outs[0]]
+                if is_kernel_1x1:
+                    conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args)
+                else:
+                    conv2d_avx_common._schedule_conv_NCHWc_int8(*args)
             else:
-                wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype)
-                sch = _get_schedule_NCHWc(wkl, layout, out_layout)
-            _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec,
-                                            kernel, conv_out, outs[0])
+                current_cfg = cfg
+                if current_cfg is None:
+                    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
+                                                          padding, layout, out_layout,
+                                                          conv_out.dtype)
+                    current_cfg = _query_dispatcher(workload)
+                args = [s, current_cfg, data_vec, conv_out, outs[0]]
+                if is_kernel_1x1:
+                    conv2d_avx_1x1._schedule_conv_NCHWc(*args)
+                else:
+                    conv2d_avx_common._schedule_conv_NCHWc(*args)
 
         scheduled_ops.append(op)
 
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index bace7451d665..96affc7b9d23 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -3,11 +3,11 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
+from tvm.autotvm.task import ConfigEntity
+
 import topi
 
-from ..util import get_const_tuple
-from ..nn.conv2d import _get_schedule, _get_workload
-from ..nn.util import infer_pad, infer_stride
+from ..nn.util import infer_pad
 from ..nn.pad import pad
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
@@ -42,62 +42,51 @@ def _get_default_schedule(wkl, simd_width):
     raise ValueError("cannot decide default schedule for workload: {}".format(wkl))
 
 
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    assert layout == 'NCHW', "only support NCHW convolution for AVX"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
+def _fallback_schedule(wkl, simd_width):
+    batch_size, in_channel, height, width, _ = wkl[1]
+    out_channel, _, hkernel, wkernel, _ = wkl[2]
+    HPAD, WPAD = wkl[4]
+    HSTR, WSTR = wkl[3]
+    out_height = (height + 2 * HPAD - hkernel) // HSTR + 1
+    out_width = (width + 2 * WPAD - wkernel) // WSTR + 1
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + 2 * HPAD
-    pad_width = in_width + 2 * WPAD
-
-    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
-    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if out_channel % bn == 0:
+            oc_bn = bn
+            break
 
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn)
-    data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w])
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if in_channel % bn == 0:
+            ic_bn = bn
+            break
 
-    shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn, sch.oc_bn, 1, 1)
-    kernel_vec = tvm.compute(shape, lambda CO, CI, ci, co, h, w:
-                             kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w],
-                             name='kernel_vec')
+    for ow_factor in range(out_width, 0, -1):
+        if out_width % ow_factor == 0:
+            for oh_factor in range(out_height, 0, -1):
+                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
+                    cfg_dict = {"i": -1,
+                                "c": None,
+                                "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]],
+                                      ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]],
+                                      ["tile_oh", "ot", oh_factor],
+                                      ["tile_ow", "sp", [out_width // ow_factor,
+                                                         ow_factor]],],
+                                "t": ""}
+                    return ConfigEntity.from_json_dict(cfg_dict)
 
-    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn)
-    ic = tvm.reduce_axis((0, in_channel), name='ic')
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn] *
-                               kernel_vec[oc_chunk, ic//sch.ic_bn, ic%sch.ic_bn, oc_block, 0, 0],
-                               axis=[ic]), name='conv')
+    raise ValueError("cannot decide default schedule for workload: {}".format(wkl))
 
-    oshape = (batch_size, num_filter, out_height, out_width)
-    unpack = tvm.compute(oshape, lambda n, oc, oh, ow:
-                         conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn],
-                         tag='conv2d_nchw')
-    return unpack
 
+def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
+    # fetch schedule
+    ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                          cfg["tile_oh"].val, cfg["tile_ow"].size[-1])
 
-def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last):
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    if data_pad is None:
-        stride = infer_stride(data, kernel, output)
-    else:
-        stride = infer_stride(data_pad, kernel, output)
-
-    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HPAD, WPAD = padding
     DOPAD = (HPAD != 0 or WPAD != 0)
 
     A, W = data, kernel_vec
@@ -112,7 +101,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     # schedule kernel pack
     oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
     s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-    if sch.oc_bn > 1:
+    if oc_bn > 1:
         s[W].vectorize(oc_block)
     parallel_axis = s[W].fuse(oc_chunk, oh)
     s[W].parallel(parallel_axis)
@@ -121,17 +110,17 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     CC = s.cache_write(C, 'global')
 
     batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
     s[C].vectorize(oc_block)
 
     s[CC].compute_at(s[C], oh_outer)
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, = s[CC].op.reduce_axis
+    ic, _, _ = s[CC].op.reduce_axis
 
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
 
     s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
     s[CC].vectorize(oc_block)
@@ -143,9 +132,9 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
         s[O0].compute_inline()
     batch, oc, oh, ow = s[O].op.axis
 
-    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
-    oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
+    oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
     s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
 
     parallel_axis = s[O].fuse(oc_chunk, oh_outer)
@@ -157,33 +146,11 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     return s
 
 
-def _declaration_conv_NCHWc(wkl, sch, data, kernel):
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
-    else:
-        data_pad = data
-
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-    ic = tvm.reduce_axis((0, wkl.in_filter), name='ic')
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn]
-                               .astype(out_dtype) *
-                               kernel[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0],
-                               axis=[ic]), name='conv2d_NCHWc', tag='conv2d_NCHWc')
-
-    return conv
-
+def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
+    # fetch schedule
+    ic_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oh"].val,
+                                   cfg["tile_ow"].size[-1])
 
-def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     # schedule data
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
@@ -195,8 +162,8 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
     s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
     s[C].vectorize(oc_block)
 
@@ -206,12 +173,12 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
         s[C].parallel(parallel_axis)
 
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, = s[CC].op.reduce_axis
+    ic, _, _ = s[CC].op.reduce_axis
 
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
 
     s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
     s[CC].fuse(oc_chunk, oh_outer)
@@ -222,8 +189,8 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
-        ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+        oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+        ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
         s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
 
         parallel_axis = s[O].fuse(oc_chunk, oh_outer)
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index 0d7aba23d236..eaa3d15e64b0 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -3,10 +3,9 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
+from tvm.autotvm.task import ConfigEntity
 
-from ..util import get_const_tuple
-from ..nn.conv2d import _get_schedule, _get_workload
-from ..nn.util import infer_pad, infer_stride
+from ..nn.util import infer_pad
 from ..nn.pad import pad
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
@@ -17,7 +16,6 @@
 def _get_default_schedule(wkl, simd_width):
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
     out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
 
     oc_bn = 1
@@ -41,78 +39,49 @@ def _get_default_schedule(wkl, simd_width):
     return AVXConvCommonFwd(ic_bn, oc_bn, reg_n, False)
 
 
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-    assert layout == 'NCHW', "only support NCHW convolution for AVX"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + 2 * HPAD
-    pad_width = in_width + 2 * WPAD
+def _fallback_schedule(wkl, simd_width):
+    batch_size, in_channel, height, width, _ = wkl[1]
+    out_channel, _, hkernel, wkernel, _ = wkl[2]
+    HPAD, WPAD = wkl[4]
+    HSTR, WSTR = wkl[3]
+    out_width = (width + 2 * WPAD - wkernel) // WSTR + 1
 
-    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
-    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if out_channel % bn == 0:
+            oc_bn = bn
+            break
 
-    # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if in_channel % bn == 0:
+            ic_bn = bn
+            break
 
-    shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width)
-    data_vec = tvm.compute(shape,
-                           lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w],
-                           name='data_vec')
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
 
-    # pack kernel
-    shape = (num_filter//sch.oc_bn, in_channel//sch.ic_bn,
-             kernel_height, kernel_width, sch.ic_bn, sch.oc_bn)
-    kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co:
-                             kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w],
-                             name='kernel_vec')
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]],
+                      ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]],
+                      ["tile_ow", "sp", [out_width // reg_n, reg_n]],
+                      ["unroll_kw", "ot", False]],
+                "t": ""}
+    return ConfigEntity.from_json_dict(cfg_dict)
 
-    # convolution
-    oshape = (batch_size, num_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-    unpack_shape = (batch_size, num_filter, out_height, out_width)
 
-    ic = tvm.reduce_axis((0, in_channel), name='ic')
-    kh = tvm.reduce_axis((0, kernel_height), name='kh')
-    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
+    # fetch schedule
+    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR+kh, ic%sch.ic_bn, ow*WSTR+kw]
-                               .astype(out_dtype) *
-                               kernel_vec[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block]
-                               .astype(out_dtype),
-                               axis=[ic, kh, kw]),
-                       name='conv')
-
-    unpack = tvm.compute(unpack_shape,
-                         lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn]
-                         .astype(out_dtype),
-                         name='output_unpack',
-                         tag='conv2d_nchw')
-    return unpack
-
-
-def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last):
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    if data_pad is None:
-        stride = infer_stride(data, kernel, output)
-    else:
-        stride = infer_stride(data_pad, kernel, output)
-    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HPAD, WPAD = padding
     DOPAD = (HPAD != 0 or WPAD != 0)
 
     A, W = data, kernel_vec
@@ -128,7 +97,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     # schedule kernel pack
     oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
     s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-    if sch.oc_bn > 1:
+    if oc_bn > 1:
         s[W].vectorize(oc_block)
     parallel_axis = s[W].fuse(oc_chunk, oh)
     s[W].parallel(parallel_axis)
@@ -138,7 +107,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     CC = s.cache_write(C, 'global')
 
     _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
     s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     s[C].fuse(oc_chunk, oh)
     s[C].vectorize(oc_block)
@@ -147,10 +116,10 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
     ic, kh, kw = s[CC].op.reduce_axis
 
-    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    if sch.unroll_kw:
+    if unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
         s[CC].unroll(kw)
     else:
@@ -164,8 +133,8 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
         s[O0].compute_inline()
 
     batch, oc, oh, ow = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
+    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
     s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     parallel_axis = s[O].fuse(oc_chunk, oh)
     s[C].compute_at(s[O], parallel_axis)
@@ -176,39 +145,11 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     return s
 
 
-def _declaration_conv_NCHWc(wkl, sch, data, kernel):
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-
-    # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
-    else:
-        data_pad = data
-
-    # convolution
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-
-    ic = tvm.reduce_axis((0, wkl.in_filter), name='ic')
-    kh = tvm.reduce_axis((0, wkl.hkernel), name='kh')
-    kw = tvm.reduce_axis((0, wkl.wkernel), name='kw')
-
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR+kh, ow*WSTR+kw, ic%sch.ic_bn]
-                               .astype(out_dtype) *
-                               kernel[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block],
-                               axis=[ic, kh, kw]), name='conv2d_NCHWc', tag="conv2d_NCHWc")
+def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
+    # fetch schedule
+    ic_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_ow"].size[-1],
+                               cfg["unroll_kw"].val)
 
-    return conv
-
-
-def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     # schedule data
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
@@ -221,7 +162,7 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
     s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     parallel_axis = s[C].fuse(oc_chunk, oh)
     s[C].vectorize(oc_block)
@@ -232,10 +173,10 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
     ic, kh, kw = s[CC].op.reduce_axis
 
-    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    if sch.unroll_kw:
+    if unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
         s[CC].unroll(kw)
     else:
@@ -246,7 +187,7 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
+        ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
         s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
         parallel_axis = s[O].fuse(oc_chunk, oh)
         s[C].compute_at(s[O], parallel_axis)
diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py
new file mode 100644
index 000000000000..ddd91f584c08
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_x86.py
@@ -0,0 +1,220 @@
+"""
+Auto-tuning a convolutional network for x86 CPU
+====================================================
+**Author**: `Yao Wang <https://github.com/kevinthesun>`_
+
+This is a tutorial about how to tune convolution neural network
+for x86 cpu.
+"""
+import os
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from topi.x86.conv2d import conv_NCHWc_arg_to_workload
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+#
+# In this tutorial, we choose resnet-18 as tuning example.
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+# Replace "llvm" with the correct target of your cpu.
+# For example, for AWS EC2 c5 instance with Intel Xeon
+# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512".
+# For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be
+# "llvm -mcpu=core-avx2".
+target = "llvm"
+
+batch_size = 1
+dtype = "float32"
+model_name = "resnet-18"
+log_file = "%s.log" % model_name
+
+# Set number of threads used for tuning based on the number of
+# physical cpu cores on your machine.
+num_threads = 1
+os.environ["TVM_NUM_THREADS"] = str(num_threads)
+
+
+#################################################################
+# Configure tensor tuning settings and create tasks
+# -------------------------------------------------
+# To get better kernel execution performance on x86 cpu,
+# we need to change data layout of convolution kernel from
+# "NCHW" to "NCHWc". To deal with this situation, we define
+# conv2d_NCHWc operator in topi. We will tune this operator
+# instead of plain conv2d.
+#
+# We will use local mode for tuning configuration. RPC tracker
+# mode can be setup similarly to the approach in
+# :ref:`tune_nnvm_arm` tutorial.
+
+tuning_option = {
+    'log_filename': log_file,
+    'tuner': 'random',
+    'early_stopping': None,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=10, repeat=1,
+                                   min_repeat_ms=1000),
+    ),
+}
+
+# You can skip the implementation of this function for this tutorial.
+def tune_kernels(tasks,
+                 measure_option,
+                 tuner='gridsearch',
+                 early_stopping=None,
+                 log_filename='tuning.log'):
+
+    for i, tsk in enumerate(tasks):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # converting conv2d tasks to conv2d_NCHWc tasks
+        data, kernel, strides, padding, layout, dtype = tsk.args
+        kernel_size = (kernel[1][2], kernel[1][3])
+        data_plc = tvm.placeholder(data[1], name="data")
+        kernel_plc = tvm.placeholder(kernel[1], name="kernel")
+        args = [data_plc, kernel_plc, data[1][1], kernel_size, strides,
+                padding, layout, layout, dtype]
+        args = autotvm.task.nnvm_integration.serialize_args(args)
+        task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=args, target=target)
+        task.workload = conv_NCHWc_arg_to_workload(data_plc, kernel_plc, kernel_size,
+                                                   strides, padding, layout, layout, dtype)
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(task, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(task, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(task)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(task)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        # do tuning
+        n_trial=len(task.config_space)
+        tuner_obj.tune(n_trial=n_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(log_filename)])
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, data_shape, out_shape = get_network(model_name, batch_size)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': data_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_kernels(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': data_shape}, params=params, dtype=dtype)
+
+        # upload parameters to device
+        ctx = tvm.cpu()
+        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
+        module = runtime.create(graph, lib, ctx)
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended.
+# One sample output is listed below.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/12]  Current/Best:  598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done.
+#    [Task  2/12]  Current/Best:  522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done.
+#    [Task  3/12]  Current/Best:  447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done.
+#    [Task  4/12]  Current/Best:  481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done.
+#    [Task  5/12]  Current/Best:  414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done.
+#    [Task  6/12]  Current/Best:  508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done.
+#    [Task  7/12]  Current/Best:  469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done.
+#    [Task  8/12]  Current/Best:  230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done.
+#    [Task  9/12]  Current/Best:  487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done.
+#    [Task 10/12]  Current/Best:  182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done.
+#    [Task 11/12]  Current/Best:  372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done.
+#    [Task 12/12]  Current/Best:  215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done.
+#    Compile...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 3.16 ms (0.03 ms)

From 825ffcd1fd77c0e4bfe5185e678ebe82a3b180fe Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 18 Oct 2018 21:19:20 +0530
Subject: [PATCH 239/529] [DOCS]Update debugger in docs.tvm.ai (#1924)

Update debugger in index to reflect in docs.tvm.ai under the Design and Developer Guide
---
 docs/dev/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index cfd79ccde468..c7a52c6de13b 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -8,6 +8,7 @@ In this part of documentation, we share the rationale for the specific choices m
    :maxdepth: 2
 
    runtime
+   debugger
    nnvm_json_spec
    nnvm_overview
    hybrid_script

From 7946ee2443485d054c984e2f7eb5422db94a1a5b Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Thu, 18 Oct 2018 08:52:04 -0700
Subject: [PATCH 240/529] Update submodule dmlc-core (#1920)

---
 3rdparty/dmlc-core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 946a54012d0c..4d49691f1a9d 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 946a54012d0c390675ab5b46cd990838d4183d6f
+Subproject commit 4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8

From 4f4ad9d75fa0349d8e544aa3cc9a362f343e2032 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 18 Oct 2018 21:23:51 +0530
Subject: [PATCH 241/529] [RELAY][OP]Reduction operator framework, argmax,
 argmin (#1865)

---
 docs/langref/relay_op.rst            |   4 +
 include/tvm/relay/type.h             |   7 +
 python/tvm/relay/__init__.py         |   1 +
 python/tvm/relay/op/__init__.py      |   1 +
 python/tvm/relay/op/reduce.py        |  64 ++++++++
 src/relay/op/tensor/reduce.cc        | 217 +++++++++++++++++++++++++++
 src/relay/pass/type_solver.cc        |   7 +
 tests/python/relay/test_op_level4.py |  90 +++++++++++
 8 files changed, 391 insertions(+)
 create mode 100644 python/tvm/relay/op/reduce.py
 create mode 100644 src/relay/op/tensor/reduce.cc

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index d40346a9e836..56558272f2a3 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -106,6 +106,8 @@ This level enables additional math and transform operators.
    tvm.relay.minimum
    tvm.relay.pow
    tvm.relay.where
+   tvm.relay.argmax
+   tvm.relay.argmin
 
 
 **Level 5: Vision/Image Operators**
@@ -183,6 +185,8 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.minimum
 .. autofunction:: tvm.relay.pow
 .. autofunction:: tvm.relay.where
+.. autofunction:: tvm.relay.argmax
+.. autofunction:: tvm.relay.argmin
 
 
 Level 5 Definitions
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 4a187824f7f7..9a91bd09c70e 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -269,6 +269,13 @@ class TypeReporterNode : public Node {
    *  But it is possible for the solver to resolve src by dst as well.
    */
   TVM_DLL virtual void Assign(const Type& dst, const Type& src) = 0;
+  /*!
+   * \brief assert shape expression comparison.
+   * \param cond The condition of operation.
+   * \return false if assertation can be proven to have failed
+   *      true if solver can still proceed.
+   */
+  TVM_DLL virtual bool Assert(const IndexExpr& cond)= 0;
   /*!
    * \brief assert shape expression equals each other.
    * \param lhs The left operand.
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index b1085be2e1e2..55c1befc3186 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -9,6 +9,7 @@
 
 # Root operators
 from .op import Op
+from .op.reduce import *
 from .op.tensor import *
 from .op.transform import *
 from . import nn
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index bfd368356d89..c0af986be4f7 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -4,6 +4,7 @@
 from .op import get, register, Op
 
 # Operators
+from .reduce import *
 from .tensor import *
 from .transform import *
 from . import nn
diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
new file mode 100644
index 000000000000..a2a4519512ea
--- /dev/null
+++ b/python/tvm/relay/op/reduce.py
@@ -0,0 +1,64 @@
+"""Reduce operators."""
+# pylint: disable=redefined-builtin
+
+from . import _make
+
+def argmax(data, axis=None, keepdims=False, exclude=False):
+    """Returns the indices of the maximum values along an axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of maximum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+
+    return _make.argmax(data, axis, keepdims, exclude)
+
+def argmin(data, axis=None, keepdims=False, exclude=False):
+    """Returns the indices of the minimum values along an axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+
+    return _make.argmin(data, axis, keepdims, exclude)
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
new file mode 100644
index 000000000000..d2ec24688633
--- /dev/null
+++ b/src/relay/op/tensor/reduce.cc
@@ -0,0 +1,217 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file reduce.cc
+ * \brief Reduction operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <numeric>
+#include <limits>
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes for Reduce operators */
+struct ReduceAttrs : public tvm::AttrsNode<ReduceAttrs> {
+  Array<IndexExpr> axis;
+  bool keepdims;
+  bool exclude;
+
+  TVM_DECLARE_ATTRS(ReduceAttrs, "relay.attrs.ReduceAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(Array<IndexExpr>({}))
+        .describe(R"code(The axis or axes along which to perform the reduction.
+
+      The default, `axis=()`, will compute over all elements into a
+      scalar array with shape `(1,)`.
+
+      If `axis` is int, a reduction is performed on a particular axis.
+
+      If `axis` is a tuple of ints, a reduction is performed on all the axes
+      specified in the tuple.
+
+      If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.)code");
+
+    TVM_ATTR_FIELD(keepdims).set_default(false)
+      .describe("If this is set to `True`, the reduced axes are left "
+                "in the result as dimension with size one.");
+    TVM_ATTR_FIELD(exclude).set_default(false)
+      .describe("Whether to perform reduction on axis that are NOT in axis instead.");
+  }
+};
+
+/*!
+* \brief GetReduceAxes, get the new axis from indim and other arguments
+* \param indim Number of dimensions of input data.
+* \param axis The input axis vector.
+* \param exclude Whether 'axis' input given is the excluded axis.
+* \return r_axes The new reduced axes of the output.
+*/
+inline std::vector<int64_t> GetReduceAxes(const uint32_t indim,
+                                          const Array<IndexExpr>& inaxis,
+                                          bool exclude) {
+  if (!inaxis.defined()) {
+    std::vector<int64_t> r_axes(indim);
+    std::iota(r_axes.begin(), r_axes.end(), 0);
+    return r_axes;
+  }
+
+  std::vector<int64_t> in_axes;
+  for (auto i : inaxis) {
+    const int64_t* k = as_const_int(i);
+    CHECK(k != nullptr) << "Reduce axis need to be constant, cannot be symbolic";
+    int64_t axis = k[0];
+    if (axis < 0) {
+      axis = axis + indim;
+    }
+
+    // Check out of bounds error
+    CHECK(axis >= 0)
+      << "Axis out of bounds in reduce operator.";
+    CHECK(axis < indim)
+      << "Axis out of bounds in reduce operator.";
+    in_axes.push_back(axis);
+  }
+
+  CHECK(in_axes[in_axes.size() - 1] < indim)
+    << "Reduction axis " << in_axes[in_axes.size() - 1]
+    << " exceeds input dimensions " << indim;
+
+  std::sort(in_axes.begin(), in_axes.end());
+
+  if (!exclude) {
+    return in_axes;
+  }
+
+  auto r_size = indim - in_axes.size();
+  std::vector<int64_t> r_axes(r_size);
+  for (uint32_t i = 0, j = 0, k = 0; i < indim; ++i) {
+    if (j < in_axes.size() && in_axes[j] == i) {
+        ++j;
+        continue;
+    }
+    r_axes[k++] = i;
+  }
+  return r_axes;
+}
+
+/*!
+* \brief ReduceShapeImpl get the outshape for the reduction operator
+* \param in_shape Shape of input data.
+* \param param ReduceAttrs details.
+* \param reporter The reporter to report solution to.
+* \return oshape Output shape inferred.
+*/
+inline std::vector<IndexExpr> ReduceShapeImpl(const std::vector<IndexExpr> &in_shape,
+                                              const ReduceAttrs* param,
+                                              const TypeReporter& reporter) {
+  uint32_t indim = in_shape.size();
+  auto r_axes = GetReduceAxes(indim, param->axis, param->exclude);
+  if (!r_axes.size()) {
+    return in_shape;
+  }
+
+  auto max_shape = make_const(Int(64), 1);
+  for (int64_t axis : r_axes) {
+    max_shape *= in_shape[axis];
+  }
+  CHECK(reporter->Assert(max_shape < make_const(Int(64), std::numeric_limits<int32_t>::max())))
+    << "The maximum possible index of reduced shape cannot be more than int32 max.";
+
+  if (param->keepdims) {
+    std::vector<IndexExpr> oshape(in_shape);
+    for (unsigned i = 0, j = 0; i < indim; ++i) {
+      if (j >= r_axes.size() || !(r_axes[j] == i)) {
+        continue;
+      }
+      oshape[i] = 1;
+      ++j;
+    }
+    return oshape;
+  } else {
+    auto osize = indim - r_axes.size();
+    std::vector<IndexExpr> oshape(osize);
+    for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) {
+      if (j < r_axes.size() && (r_axes[j] == i)) {
+        ++j;
+        continue;
+      }
+      oshape[k++] = in_shape[i];
+    }
+    return oshape;
+  }
+}
+
+/*!
+* \brief ArgReduceRel Output type and shape relation evaluation function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return false if This relation cannot be resolved. true if this relation has been resolved.
+*/
+bool ArgReduceRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+  std::vector<IndexExpr> in_shape;
+  for (auto i : data->shape) {
+    in_shape.push_back(i);
+  }
+
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+
+  // assign output type and shape
+  auto oshape = ReduceShapeImpl(in_shape, param, reporter);
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, Int(32)));
+  return true;
+}
+
+
+#define RELAY_REGISTER_REDUCE_OP(OpName)                           \
+  TVM_REGISTER_API("relay.op._make." OpName)                       \
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {             \
+    auto make_func = [](Expr data,                                 \
+                        Array<IndexExpr> axis,                     \
+                        bool keepdims,                             \
+                        bool exclude) {                            \
+      auto attrs = make_node<ReduceAttrs>();                       \
+      attrs->axis = std::move(axis);                               \
+      attrs->keepdims = keepdims;                                  \
+      attrs->exclude = exclude;                                    \
+      static const Op& op = Op::Get(OpName);                       \
+      return CallNode::make(op, {data}, Attrs(attrs), {});         \
+    };                                                             \
+    runtime::detail::unpack_call<Expr, 4>(make_func, args, rv);    \
+    });                                                            \
+  RELAY_REGISTER_OP(OpName)                                        \
+  .set_num_inputs(1)                                               \
+  .add_argument("data", "Tensor", "The input tensor.")
+
+
+RELAY_REGISTER_REDUCE_OP("argmax")
+.describe(R"code(Creates an operation that finds the indices of the maximum
+values over a given axis.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_support_level(4)
+.add_type_rel("ArgReduce", ArgReduceRel);
+
+
+RELAY_REGISTER_REDUCE_OP("argmin")
+.describe(R"code(Creates an operation that finds the indices of the minimum
+values over a given axis.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_support_level(4)
+.add_type_rel("ArgReduce", ArgReduceRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index f21f6a67acf8..67378c5d14a6 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -18,6 +18,13 @@ class TypeSolver::Reporter : public TypeReporterNode {
     solver_->Unify(dst, src);
   }
 
+  bool Assert(const IndexExpr& cond) final {
+    if (const uint64_t* pdiff = as_const_uint(cond)) {
+      return pdiff[0];
+    }
+    return true;
+  }
+
   bool AssertEQ(const IndexExpr& lhs, const IndexExpr& rhs) final {
     // early warning constant case.
     IndexExpr diff = lhs - rhs;
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 11c0be67ca73..dea300422e45 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -93,6 +93,94 @@ def test_binary_broadcast():
         ftype = func.checked_type
         assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
 
+def test_argmax():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmax(x, axis=(1,)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, h, w), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmax(x, axis=(2,), keepdims=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c , 1, w), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmax(x, axis=(2,), keepdims=True, exclude=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
+
+def test_argmin():
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmax(x, axis=(1,)))
+    ib.ret(func)
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, h, w), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmin(x, axis=(2,), keepdims=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((n, c , 1, w), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmin(x, axis=(2,), keepdims=True, exclude=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmin(x, axis=(2,1), keepdims=True, exclude=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((1, c , h, 1), "int32")
+
+    ib = relay.ir_builder.IRBuilder()
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
+    with ib.function(x) as func:
+        ib.ret(relay.argmin(x, axis=None, keepdims=True, exclude=True))
+    ib.ret(func)
+
+    func = relay.ir_pass.infer_type(ib.env, func.to_func())
+    ftype = func.checked_type
+    assert ftype.ret_type == relay.ty.TensorType((1, 1 , 1, 1), "int32")
+
 def test_where():
     ib = relay.ir_builder.IRBuilder()
     cond = ib.param("cond", relay.TensorType((3, 4), "float32"))
@@ -113,3 +201,5 @@ def test_where():
     test_binary_broadcast()
     test_where()
     test_multibox_prior()
+    test_argmax()
+    test_argmin()

From 2316496e34efa4fa183fac90f0f35cff20322013 Mon Sep 17 00:00:00 2001
From: suexu1025 <suexu1025@gmail.com>
Date: Thu, 18 Oct 2018 11:51:03 -0700
Subject: [PATCH 242/529] fix build issue for MSVC 2017 15.8.0 and above
 (#1928)

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7bd76bbd7906..66c690314a42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,6 +63,7 @@ if(MSVC)
   add_definitions(-DWIN32_LEAN_AND_MEAN)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
   add_definitions(-D_SCL_SECURE_NO_WARNINGS)
+  add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
   add_definitions(-DHalide_SHARED)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")

From af817760e51ccc66114648e8f5fef90d326d4589 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 18 Oct 2018 13:06:48 -0700
Subject: [PATCH 243/529] [RELAY][Refactor] TextPrinter, move ret_type after
 body in Function. (#1918)

---
 include/tvm/attrs.h                           |  72 +-
 include/tvm/relay/attrs/nn.h                  |  10 +-
 include/tvm/relay/attrs/transform.h           |   2 +-
 include/tvm/relay/expr.h                      |   8 +-
 include/tvm/relay/op.h                        |   6 +
 python/tvm/_ffi/base.py                       |   6 +-
 python/tvm/relay/__init__.py                  |   9 +-
 python/tvm/relay/base.py                      |  15 +-
 python/tvm/relay/env.py                       |   5 +-
 python/tvm/relay/expr.py                      | 131 ++-
 python/tvm/relay/ir_builder.py                |  30 +-
 python/tvm/relay/op/nn/nn.py                  |   6 +-
 python/tvm/relay/ty.py                        |  30 +-
 src/lang/attr_functor.h                       |  26 +-
 src/lang/attrs.cc                             |  48 +-
 src/lang/reflection.cc                        |   2 +
 src/relay/ir/base.cc                          |   2 +
 src/relay/ir/debug_printer.cc                 | 305 -------
 src/relay/ir/doc.h                            | 514 ------------
 src/relay/ir/environment.cc                   |   2 +
 src/relay/ir/expr.cc                          |  22 +-
 src/relay/ir/expr_functor.cc                  |   2 +-
 src/relay/ir/text_printer.cc                  | 749 ++++++++++++++++++
 src/relay/ir/type.cc                          |  16 +-
 src/relay/op/nn/convolution.cc                |   8 +-
 src/relay/op/nn/nn.cc                         |  19 +-
 src/relay/op/nn/pad.cc                        |   2 +-
 src/relay/op/tensor/transform.cc              |   4 +
 src/relay/pass/dead_code.cc                   |   5 +-
 src/relay/pass/type_functor.h                 |   1 +
 tests/python/relay/test_ir_debug_printer.py   |  90 ---
 tests/python/relay/test_ir_text_printer.py    |  95 +++
 tests/python/relay/test_ir_well_formed.py     |   2 +-
 tests/python/relay/test_pass_alpha_equal.py   |  32 +-
 .../relay/test_pass_dead_code_elimination.py  |   2 +-
 tests/python/relay/test_pass_free_vars.py     |   2 +-
 36 files changed, 1222 insertions(+), 1058 deletions(-)
 delete mode 100644 src/relay/ir/debug_printer.cc
 delete mode 100644 src/relay/ir/doc.h
 create mode 100644 src/relay/ir/text_printer.cc
 delete mode 100644 tests/python/relay/test_ir_debug_printer.py
 create mode 100644 tests/python/relay/test_ir_text_printer.py

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index fbb067dd6dcf..3b7beaa37838 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -130,6 +130,13 @@ class BaseAttrsNode : public Node {
    * \param os the stream to print the docstring to.
    */
   inline void PrintDocString(std::ostream &os) const;  // NOLINT(*)
+  /*!
+   * \brief Visit attributes that do not equal the default value.
+   *
+   * \note This is useful to extract fields for concise printing.
+   * \param v The visitor
+   */
+  TVM_DLL virtual void VisitNonDefaultAttrs(AttrVisitor* v) = 0;
   /*!
    * \brief Get the field information
    * \return The fields in the Attrs.
@@ -199,6 +206,7 @@ class DictAttrsNode : public BaseAttrsNode {
   TVM_DLL static Attrs make(Map<std::string, NodeRef> dict);
   // implementations
   void VisitAttrs(AttrVisitor* v) final;
+  void VisitNonDefaultAttrs(AttrVisitor* v) final;
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
   Array<AttrFieldInfo> ListFieldInfo() const final;
   bool ContentEqual(const Node* other) const final;
@@ -300,15 +308,15 @@ struct AttrNopEntry {
     return *this;
   }
   template<typename T>
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
     return *this;
   }
   template<typename T>
-  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
     return *this;
   }
   template<typename T>
-  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
     return *this;
   }
 };
@@ -603,7 +611,7 @@ class AttrDocEntry {
     return *this;
   }
   template<typename T>
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
     std::ostringstream os;
     os << info_->type_info << ", default=" << value;
     info_->type_info = os.str();
@@ -649,6 +657,57 @@ class AttrExistVisitor {
     return AttrNopEntry();
   }
 };
+
+template<typename T>
+struct AttrTriggerNonDefaultEntry {
+  using TSelf = AttrTriggerNonDefaultEntry<T>;
+  // constructor
+  AttrTriggerNonDefaultEntry(
+      AttrVisitor* visitor, const char* key, T* data)
+      : visitor_(visitor), key_(key), data_(data) {}
+
+  ~AttrTriggerNonDefaultEntry() DMLC_THROW_EXCEPTION {
+    if (trigger_) {
+      visitor_->Visit(key_, data_);
+    }
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+  TSelf& set_default(const T& value) {
+    if (AttrsEqual()(value, *data_)) {
+      trigger_ = false;
+    }
+    return *this;
+  }
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    return *this;
+  }
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    return *this;
+  }
+
+ private:
+  AttrVisitor* visitor_;
+  const char * key_;
+  T *data_;
+  bool trigger_{true};
+};
+
+class AttrNonDefaultVisitor {
+ public:
+  explicit AttrNonDefaultVisitor(AttrVisitor* visitor)
+      : visitor_(visitor) {
+  }
+  template<typename T>
+  AttrTriggerNonDefaultEntry<T>
+  operator()(const char* key, T* value) {
+    return AttrTriggerNonDefaultEntry<T>(visitor_, key, value);
+  }
+
+ private:
+  AttrVisitor* visitor_;
+};
 }  // namespace detail
 
 /*!
@@ -665,6 +724,11 @@ class AttrsNode : public BaseAttrsNode {
     self()->__VisitAttrs__(vis);
   }
 
+  void VisitNonDefaultAttrs(AttrVisitor* v) final {
+    detail::AttrNonDefaultVisitor vis(v);
+    self()->__VisitAttrs__(vis);
+  }
+
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
     CHECK_EQ(args.size() % 2, 0);
     const int kLinearSearchBound = 16;
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index c7b8695d1da5..5dbaecdc3e78 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -13,7 +13,7 @@ namespace tvm {
 namespace relay {
 
 /*! \brief Attributes used in convolution operators */
-struct ConvAttrs : public tvm::AttrsNode<ConvAttrs> {
+struct Conv2DAttrs : public tvm::AttrsNode<Conv2DAttrs> {
   Array<IndexExpr> strides;
   Array<IndexExpr> padding;
   Array<IndexExpr> dilation;
@@ -25,7 +25,7 @@ struct ConvAttrs : public tvm::AttrsNode<ConvAttrs> {
   std::string out_layout;
   DataType out_dtype;
 
-  TVM_DECLARE_ATTRS(ConvAttrs, "relay.attrs.ConvAttrs") {
+  TVM_DECLARE_ATTRS(Conv2DAttrs, "relay.attrs.Conv2DAttrs") {
     TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
         .describe("Specifies the strides of the convolution.");
     TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
@@ -55,14 +55,14 @@ struct ConvAttrs : public tvm::AttrsNode<ConvAttrs> {
         .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
                   "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                   "dimensions respectively.");
-    TVM_ATTR_FIELD(out_layout).set_default("__undef__")
+    TVM_ATTR_FIELD(out_layout).set_default("")
         .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
                   "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                   "dimensions respectively. Default to be same as input layout.");
 
     // use 0 bits to indicate none.
     TVM_ATTR_FIELD(out_dtype)
-        .set_default(Int(0))
+        .set_default(NullValue<DataType>())
         .describe("Output data type, set to explicit type under mixed precision setting");
   }
 };
@@ -123,7 +123,7 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
                 "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                 "dimensions respectively.");
     TVM_ATTR_FIELD(out_dtype)
-        .set_default(Int(0))
+        .set_default(NullValue<DataType>())
         .describe("Output data type, set to explicit type under mixed precision setting");
   }
 };
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index d304a59567ea..8e2b741091b3 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -78,7 +78,7 @@ struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> {
       .describe("Target shape.");
     TVM_ATTR_FIELD(dtype)
       .describe("Target data type.")
-      .set_default(Int(0));
+      .set_default(NullValue<DataType>());
   }
 };  // struct InitOpAttrs
 
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 5e50cfc05e67..743dc085d035 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -181,8 +181,6 @@ class FunctionNode : public ExprNode {
  public:
   /*! \brief Function parameters */
   tvm::Array<Var> params;
-  /*! \brief User annotated return type of the function. */
-  Type ret_type;
   /*!
    * \brief
    * The expression which represents the computation of the function,
@@ -190,6 +188,8 @@ class FunctionNode : public ExprNode {
    * or sub-expressions may reference the type variables.
    */
   Expr body;
+  /*! \brief User annotated return type of the function. */
+  Type ret_type;
   /*!
    * \brief Type parameters of the function.
    *  Enables the function to vary its type based on these.
@@ -201,8 +201,8 @@ class FunctionNode : public ExprNode {
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("params", &params);
-    v->Visit("ret_type", &ret_type);
     v->Visit("body", &body);
+    v->Visit("ret_type", &ret_type);
     v->Visit("type_params", &type_params);
     v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
@@ -217,8 +217,8 @@ class FunctionNode : public ExprNode {
   TVM_DLL FuncType func_type_annotation() const;
 
   TVM_DLL static Function make(tvm::Array<Var> params,
-                               Type ret_type,
                                Expr body,
+                               Type ret_type,
                                tvm::Array<TypeParam> ty_params);
 
   static constexpr const char* _type_key = "relay.Function";
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index 5735a935f6c2..4dcff22b84e8 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -47,6 +47,11 @@ class OpNode : public relay::ExprNode {
    *  This can be empty, in which case it defaults to anything.
    */
   std::string attrs_type_key;
+  /*!
+   * \brief attribute type index,
+   * this field varies in each run and is not exposed to frontend.
+   */
+  uint32_t attrs_type_index{0};
   /*!
    * \brief number of input arguments to the operator,
    * -1 means it is variable length
@@ -416,6 +421,7 @@ inline OpRegistry& OpRegistry::set_num_inputs(int32_t n) {  // NOLINT(*)
 inline OpRegistry& OpRegistry::set_attrs_type_key(  // NOLINT(*)
     const std::string& type_key) {
   get()->attrs_type_key = type_key;
+  get()->attrs_type_index = Node::TypeKey2Index(type_key.c_str());
   return *this;
 }
 
diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 4c1e979cb684..0a91c41127e4 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -14,13 +14,15 @@
 #----------------------------
 if sys.version_info[0] == 3:
     string_types = (str,)
-    numeric_types = (float, int, np.float32, np.int32)
+    integer_types = (int, np.int32)
+    numeric_types = integer_types + (float, np.float32)
     # this function is needed for python3
     # to convert ctypes.char_p .value back to python str
     py_str = lambda x: x.decode('utf-8')
 else:
     string_types = (basestring,)
-    numeric_types = (float, int, long, np.float32, np.int32)
+    integer_types = (int, long, np.int32)
+    numeric_types = integer_types + (float, np.float32)
     py_str = lambda x: x
 
 
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 55c1befc3186..d6ecdb7855d8 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -1,4 +1,4 @@
-# pylint: disable=wildcard-import, redefined-builtin
+# pylint: disable=wildcard-import, redefined-builtin, invalid-name
 """The Relay IR namespace containing the IR definition and compiler."""
 from . import base
 from . import ty
@@ -19,6 +19,9 @@
 # Span
 Span = base.Span
 
+# Env
+Environment = env.Environment
+
 # Type
 Type = ty.Type
 TupleType = ty.TupleType
@@ -40,3 +43,7 @@
 Let = expr.Let
 If = expr.If
 TupleGetItem = expr.TupleGetItem
+
+# helper functions
+var = expr.var
+const = expr.const
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index d683c96739cd..5a92eb57d209 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -3,6 +3,7 @@
 from __future__ import absolute_import as _abs
 from .._ffi.node import NodeBase, register_node as _register_tvm_node
 from . import _make
+from . import _expr
 
 NodeBase = NodeBase
 
@@ -20,7 +21,19 @@ def register_relay_node(type_key=None):
     return _register_tvm_node(type_key)
 
 
+class RelayNode(NodeBase):
+    def astext(self):
+        """Get the text format of the expression.
+
+        Returns
+        -------
+        text : str
+            The text format of the expression.
+        """
+        return _expr._text_print(self)
+
+
 @register_relay_node
-class Span(NodeBase):
+class Span(RelayNode):
     def __init__(self, source, lineno, col_offset):
         self.__init_handle_by_constructor__(_make.Span, source, lineno, col_offset)
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/env.py
index 8dd95d39b327..8c226e509a12 100644
--- a/python/tvm/relay/env.py
+++ b/python/tvm/relay/env.py
@@ -1,12 +1,11 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, wildcard-import
 """A global environment storing everything needed to interpret or compile a Relay program."""
-from .base import register_relay_node, NodeBase
+from .base import register_relay_node, RelayNode
 from . import _make
 from . import _env
 
-
 @register_relay_node
-class Environment(NodeBase):
+class Environment(RelayNode):
     """The global Relay environment containing functions,
     options and more.
     """
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index a71fd329ed5b..9807fab45089 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -1,13 +1,17 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
 """The expression nodes of Relay."""
 from __future__ import absolute_import
-from .base import NodeBase, register_relay_node
-from . import _expr
+
+import numpy as _np
+from .base import RelayNode, register_relay_node
 from . import _make
+from . import ty as _ty
+from .._ffi import base as _base, node as _node
+from .. import nd as _nd
 from .. import convert
 
 
-class Expr(NodeBase):
+class Expr(RelayNode):
     """The base type for all Relay expressions."""
     @property
     def checked_type(self):
@@ -56,7 +60,7 @@ def __init__(self, fields):
 
 @register_relay_node
 class Var(Expr):
-    """A local variable in Tvm.Relay.
+    """A local variable in Relay.
 
     Local variable can be used to declare input
     arguments to a function, or intermediate variables.
@@ -101,26 +105,26 @@ class Function(Expr):
     params: List[tvm.relay.Var]
         List of input parameters to the function.
 
-    ret_type: tvm.relay.Type
-        The return type annotation of the function.
-
     body: tvm.relay.Expr
         The body of the function.
 
+    ret_type: Optional[tvm.relay.Type]
+        The return type annotation of the function.
+
     type_params: Optional[List[tvm.relay.TypeParam]]
         The additional type parameters, this is only
         used in advanced usecase of template functions.
     """
     def __init__(self,
                  params,
-                 ret_type,
                  body,
+                 ret_type=None,
                  type_params=None):
         if type_params is None:
             type_params = convert([])
 
         self.__init_handle_by_constructor__(
-            _make.Function, params, ret_type, body, type_params)
+            _make.Function, params, body, ret_type, type_params)
 
 
 @register_relay_node
@@ -158,7 +162,7 @@ class Let(Expr):
 
     Parameters
     ----------
-    var: tvm.relay.Var
+    variable: tvm.relay.Var
         The local variable to be bound.
 
     value: tvm.relay.Expr
@@ -167,9 +171,9 @@ class Let(Expr):
     body: tvm.relay.Expr
         The body of the let binding.
     """
-    def __init__(self, var, value, body):
+    def __init__(self, variable, value, body):
         self.__init_handle_by_constructor__(
-            _make.Let, var, value, body)
+            _make.Let, variable, value, body)
 
 
 @register_relay_node
@@ -208,4 +212,105 @@ def __init__(self, tuple_value, index):
         self.__init_handle_by_constructor__(
             _make.TupleGetItem, tuple_value, index)
 
-debug_print = _expr._debug_print
+
+class TupleWrapper(_node.NodeGeneric):
+    """TupleWrapper.
+
+    This class is a Python wrapper for a Relay tuple of known size.
+    It allows for accessing the fields of the Relay tuple as though
+    it were a Python tuple.
+
+    Parameters
+    ----------
+    tuple_value: tvm.relay.Expr
+        The input tuple
+
+    size: int
+        The size of the tuple.
+    """
+    def __init__(self, tuple_value, size):
+        self.tuple_value = tuple_value
+        self.size = size
+
+    def asnode(self):
+        """Returns the underlying Relay tuple if this wrapper is passed
+        as an argument to an FFI function."""
+
+        return self.tuple_value
+
+    def __getitem__(self, key):
+        return self.tuple_value.fields[key]
+
+    def __len__(self):
+        return len(self.tuple_value.fields)
+
+
+def var(name_hint,
+        type_annotation=None,
+        shape=None,
+        dtype="float32"):
+    """Create a new tvm.relay.Var.
+
+    This is a simple wrapper function that allows specify
+    shape and dtype directly.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+        This name only acts as a hint, and is not used
+        for equality.
+
+    type_annotation: Optional[tvm.relay.Type, str]
+        The type annotation on the variable.
+        When type_annotation is a str, we will create a scalar variable.
+
+    shape: Optional[List[tvm.Expr]]
+        The shape of the tensor type.
+
+    dtype: str, optional
+        The data type of the tensor.
+
+    Examples
+    --------
+    .. code-block:: python
+
+      # The following 4 lines are equivalent to each other
+      x = tvm.relay.Var("x", tvm.relay.TensorType([1, 2]))
+      x = tvm.relay.var("x", tvm.relay.TensorType([1, 2]))
+      x = tvm.relay.var("x", shape=[1, 2])
+      x = tvm.relay.var("x", shape=[1, 2], dtype="float32")
+
+      # The following 2 lines are equivalent to each other.
+      y = tvm.relay.var("x", "float32")
+      y = tvm.relay.var("x", shape=(), dtype="float32")
+    """
+    if type_annotation is not None and shape is not None:
+        raise ValueError("Can only specify either type_annotation or shape.")
+    if shape is not None:
+        type_annotation = _ty.TensorType(shape, dtype)
+    elif isinstance(type_annotation, str):
+        type_annotation = _ty.TensorType((), type_annotation)
+    return Var(name_hint, type_annotation)
+
+
+def const(value, dtype=None):
+    """Create a constant value.
+
+    Parameters
+    ----------
+    value: Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray]
+        The constant value.
+
+    dtype: str, optional
+        The data type of the value.
+    """
+    if isinstance(value, _base.numeric_types):
+        value = _np.array(value, dtype=dtype)
+    elif isinstance(value, (bool, list)):
+        value = _np.array(value, dtype=dtype)
+    if isinstance(value, (_np.ndarray, _np.generic)):
+        value = _nd.array(value)
+    if not isinstance(value, _nd.NDArray):
+        raise ValueError("value has to be scalar or NDArray")
+    return Constant(value)
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
index 42a29b29b7d7..d2771926e58f 100644
--- a/python/tvm/relay/ir_builder.py
+++ b/python/tvm/relay/ir_builder.py
@@ -11,32 +11,6 @@
 from .env import Environment
 
 
-class TupleWrapper(tvm._ffi.node.NodeGeneric):
-    """TupleWrapper.
-
-    This class is a Python wrapper for a Relay tuple of known size.
-    It allows for accessing the fields of the Relay tuple as though
-    it were a Python tuple.
-    """
-
-    def __init__(self, tuple_value, size):
-        self.tuple_value = tuple_value
-        self.size = size
-
-
-    def asnode(self):
-        """Returns the underlying Relay tuple if this wrapper is passed
-        as an argument to an FFI function."""
-
-        return self.tuple_value
-
-    def __getitem__(self, key):
-        return self.tuple_value.fields[key]
-
-    def __len__(self):
-        return len(self.tuple_value.fields)
-
-
 def _convert_to_value(arg, ctxt=tvm.cpu(0)):
     # type: (Any, tvm.Context) -> tvm.nd.NDArray
     """Convert Python values into the appropriate types
@@ -132,8 +106,8 @@ def to_func(self):
         """Converts a PartialFunc into a :py:class:`~relay.Function`."""
         return Function(
             self.params,
-            self.ret_type,
             self.body,
+            self.ret_type,
             self.type_params)
 
 #pylint: disable=invalid-name
@@ -325,7 +299,7 @@ def decl(self, name, *params, **kwargs):
         def _on_exit():
             bindings, _, _, ret_value = self.exit_scope()
             exp = _mk_let(bindings, ret_value)
-            self.env.add(name, Function(params, ret_type, exp))
+            self.env.add(name, Function(params, exp, ret_type))
 
         return WithScope(10, _on_exit)
 
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 51acd4bc38b6..8a5357e4a2df 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -1,6 +1,6 @@
 """Neural network operations."""
 from __future__ import absolute_import as _abs
-from tvm.relay.ir_builder import TupleWrapper
+from ...expr import TupleWrapper
 from . import _make
 
 
@@ -145,7 +145,7 @@ def conv2d_transpose(data,
                                   weight_layout, output_padding, out_dtype)
 
 
-def softmax(data, axis):
+def softmax(data, axis=1):
     r"""Computes softmax.
 
     .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
@@ -158,7 +158,7 @@ def softmax(data, axis):
     data: relay.Expr
         The input data to the operator.
 
-    axis: int
+    axis: int, optional
         The axis to sum over when computing softmax
 
     Returns
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index a6ac1857bfa8..34bd60ea08bb 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -1,11 +1,11 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
 """The type nodes of the Relay language."""
 from enum import IntEnum
-from .base import NodeBase, register_relay_node
+from .base import RelayNode, register_relay_node
 from . import _make
 
 
-class Type(NodeBase):
+class Type(RelayNode):
     """The base type for all Relay types."""
 
     def __eq__(self, other):
@@ -21,27 +21,25 @@ def same_as(self, other):
         """Compares two Relay types by referential equality."""
         return super().__eq__(other)
 
+
 @register_relay_node
 class TensorType(Type):
-    """A concrete TensorType in Relay, see tvm/relay/type.h for more details.
+    """A concrete TensorType in Relay.
 
     This is the type assigned to tensor's with a known dype and shape. For
     example a tensor of `float32` and `(5, 5)`.
-    """
-
-    def __init__(self, shape, dtype):
-        """Construct a tensor type.
 
-        Parameters
-        ----------
-        shape: list of tvm.Expr
-        dtype: str
+    Parameters
+    ----------
+    shape: List[tvm.Expr]
+        The shape of the Tensor
 
-        Returns
-        -------
-        tensor_type: The TensorType
-        """
-        self.__init_handle_by_constructor__(_make.TensorType, shape, dtype)
+    dtype: str, optional
+        The content data type.
+    """
+    def __init__(self, shape, dtype="float32"):
+        self.__init_handle_by_constructor__(
+            _make.TensorType, shape, dtype)
 
 
 class Kind(IntEnum):
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index 0cb748bbd496..8aa39a774315 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -17,11 +17,15 @@ namespace tvm {
 template <typename FType>
 class AttrFunctor;
 
+#define ATTR_FUNCTOR_DEFAULT                                        \
+  { return VisitAttrDefault_(op, std::forward<Args>(args)...); }
+
+
 #define ATTR_FUNCTOR_DISPATCH(OP)                                       \
   vtable.template set_dispatch<OP>(                                     \
       [](const NodeRef& n, TSelf* self, Args... args) {                 \
-        return self->Visit_(static_cast<const OP*>(n.node_.get()),      \
-                            std::forward<Args>(args)...);               \
+        return self->VisitAttr_(static_cast<const OP*>(n.node_.get()),  \
+                                std::forward<Args>(args)...);           \
       });                                                               \
 
 // A functor for common attribute information.
@@ -40,21 +44,21 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
    * \param args Additional arguments.
    * \return The result of the call
    */
-  virtual R Visit(const NodeRef& n, Args... args) {
+  virtual R VisitAttr(const NodeRef& n, Args... args) {
     static FType vtable = InitVTable();
     if (vtable.can_dispatch(n)) {
       return vtable(n, this, std::forward<Args>(args)...);
     } else {
-      return VisitDefault_(n, std::forward<Args>(args)...);
+      return VisitAttrDefault_(n.get(), std::forward<Args>(args)...);
     }
   }
-  virtual R Visit_(const ArrayNode* op, Args... args) = 0;
-  virtual R Visit_(const StrMapNode* op, Args... args) = 0;
-  virtual R Visit_(const ir::IntImm* op, Args... args) = 0;
-  virtual R Visit_(const ir::UIntImm* op, Args... args) = 0;
-  virtual R Visit_(const ir::FloatImm* op, Args... args) = 0;
-  virtual R Visit_(const ir::StringImm* op, Args... args) = 0;
-  virtual R VisitDefault_(const NodeRef& n, Args... args) = 0;
+  virtual R VisitAttr_(const ArrayNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const StrMapNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::IntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::UIntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::FloatImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::StringImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttrDefault_(const Node* node, Args... args) = 0;
 
  private:
   // initialize the vtable.
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 091ecd2700d8..e467018add11 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -11,6 +11,10 @@ void DictAttrsNode::VisitAttrs(AttrVisitor* v)  {
   v->Visit("__dict__", &dict);
 }
 
+void DictAttrsNode::VisitNonDefaultAttrs(AttrVisitor* v) {
+  v->Visit("__dict__", &dict);
+}
+
 void DictAttrsNode::InitByPackedArgs(
     const runtime::TVMArgs& args, bool allow_unknown) {
   for (int i = 0; i < args.size(); i += 2) {
@@ -55,48 +59,48 @@ class AttrsEqualChecker :
     if (!equal_) return false;
     if (lhs.same_as(rhs)) return true;
     if (!lhs.defined() || !rhs.defined()) return false;
-    if (!this->Visit(lhs, rhs)) {
+    if (!this->VisitAttr(lhs, rhs)) {
       equal_ = false;
     }
     return equal_;
   }
 
-  bool VisitDefault_(const NodeRef& lhs, const NodeRef& other) final {
+  bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final {
     if (lhs->derived_from<BaseAttrsNode>()) {
-      return static_cast<const BaseAttrsNode*>(lhs.get())->ContentEqual(other.get());
+      return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(other.get());
     }
-    return lhs.same_as(other);
+    return lhs == other.get();
   }
 
-  bool Visit_(const IntImm* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const IntImm* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<IntImm>()) {
       return lhs->value == rhs->value;
     }
     return false;
   }
 
-  bool Visit_(const UIntImm* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const UIntImm* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<UIntImm>()) {
       return lhs->value == rhs->value;
     }
     return false;
   }
 
-  bool Visit_(const FloatImm* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const FloatImm* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<FloatImm>()) {
       return lhs->value == rhs->value;
     }
     return false;
   }
 
-  bool Visit_(const StringImm* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const StringImm* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<StringImm>()) {
       return lhs->value == rhs->value;
     }
     return false;
   }
 
-  bool Visit_(const ArrayNode* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<ArrayNode>()) {
       if (rhs->data.size() != lhs->data.size()) return false;
       for (size_t  i = 0; i < lhs->data.size(); ++i) {
@@ -106,7 +110,7 @@ class AttrsEqualChecker :
     return true;
   }
 
-  bool Visit_(const StrMapNode* lhs, const NodeRef& other) final {
+  bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final {
     if (const auto* rhs = other.as<StrMapNode>()) {
       if (rhs->data.size() != lhs->data.size()) return false;
       for (const auto& kv : lhs->data) {
@@ -127,38 +131,38 @@ class AttrContentHasher :
  public:
   size_t result_{0};
 
-  void VisitDefault_(const NodeRef& value) final {
+  void VisitAttrDefault_(const Node* value) final {
     if (value->derived_from<BaseAttrsNode>()) {
-      Update(static_cast<const BaseAttrsNode*>(value.get())->ContentHash());
+      Update(static_cast<const BaseAttrsNode*>(value)->ContentHash());
     } else {
-      Update(NodeHash()(value));
+      Update(NodeHash()(GetRef<NodeRef>(value)));
     }
   }
 
-  void Visit_(const IntImm* op) final {
+  void VisitAttr_(const IntImm* op) final {
     Update(std::hash<int64_t>()(op->value));
   }
 
-  void Visit_(const UIntImm* op) final {
+  void VisitAttr_(const UIntImm* op) final {
     Update(std::hash<uint64_t>()(op->value));
   }
 
-  void Visit_(const FloatImm* op) final {
+  void VisitAttr_(const FloatImm* op) final {
     Update(std::hash<double>()(op->value));
   }
 
-  void Visit_(const StringImm* op) final {
+  void VisitAttr_(const StringImm* op) final {
     Update(std::hash<std::string>()(op->value));
   }
 
-  void Visit_(const ArrayNode* op) final {
+  void VisitAttr_(const ArrayNode* op) final {
     Update(op->data.size());
     for (size_t  i = 0; i < op->data.size(); ++i) {
-      this->Visit(NodeRef(op->data[i]));
+      this->VisitAttr(NodeRef(op->data[i]));
     }
   }
 
-  void Visit_(const StrMapNode* lhs) final {
+  void VisitAttr_(const StrMapNode* lhs) final {
     using Entry = std::pair<std::string, NodePtr<Node> >;
     std::vector<Entry> data(lhs->data.begin(), lhs->data.end());
     std::sort(data.begin(), data.end(), [](const Entry& a, const Entry& b) {
@@ -166,7 +170,7 @@ class AttrContentHasher :
       });
     for (const Entry& kv : data) {
       Update(std::hash<std::string>()(kv.first));
-      this->Visit(NodeRef(kv.second));
+      this->VisitAttr(NodeRef(kv.second));
     }
   }
 
@@ -184,7 +188,7 @@ bool AttrsEqual::Equal(const NodeRef& lhs, const NodeRef& rhs) {
 size_t AttrsHash::Hash(const NodeRef& node) {
   if (!node.defined()) return 0;
   AttrContentHasher hasher;
-  hasher.Visit(node);
+  hasher.VisitAttr(node);
   return hasher.result_;
 }
 
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 497ec24f4129..5197645026eb 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -208,6 +208,8 @@ class JSONAttrGetter : public AttrVisitor {
     node_->type_key = node->type_key();
     // sepcially handle global object
     auto* f = dmlc::Registry<NodeFactoryReg>::Find(node_->type_key);
+    CHECK(f != nullptr)
+        << "Node type \'" << node_->type_key << "\' is not registered in TVM";
     if (f->fglobal_key != nullptr) {
       node_->global_key = f->fglobal_key(node);
       return;
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
index 97ac9e52a4c2..4e71444bf1ae 100644
--- a/src/relay/ir/base.cc
+++ b/src/relay/ir/base.cc
@@ -51,6 +51,8 @@ Span SpanNode::make(SourceName source, int lineno, int col_offset) {
   return Span(n);
 }
 
+TVM_REGISTER_NODE_TYPE(SpanNode);
+
 TVM_REGISTER_API("relay._make.Span")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = SpanNode::make(args[0], args[1], args[2]);
diff --git a/src/relay/ir/debug_printer.cc b/src/relay/ir/debug_printer.cc
deleted file mode 100644
index cb463ef6975a..000000000000
--- a/src/relay/ir/debug_printer.cc
+++ /dev/null
@@ -1,305 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file debug_printer.cc
- * \brief A pretty printer for the Relay IR.
- * As we had not determined a formal syntax yet, right now it is only for debug purpose.
- */
-
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/environment.h>
-#include <tvm/relay/error.h>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <unordered_map>
-#include <string>
-#include <vector>
-#include <iostream>
-#include "../pass/type_functor.h"
-#include "doc.h"
-
-namespace tvm {
-namespace relay {
-
-using namespace tvm::runtime;
-
-Doc KindDocify(TypeParamNode::Kind k) {
-  switch (k) {
-    case TypeParamNode::kShapeVar:
-      return DocOfStr("ShapeVar");
-    case TypeParamNode::kShape:
-      return DocOfStr("Shape");
-    case TypeParamNode::kBaseType:
-      return DocOfStr("BaseType");
-    case TypeParamNode::kType:
-      return DocOfStr("Type");
-    default:
-      LOG(FATAL) << "unreachable code: case not handle in kind";
-      throw;  // log fatal throw but compiler doesnt know
-  }
-}
-
-template<typename T>
-std::vector<Doc> MapDocify(const tvm::Array<T>& arr, const std::function<Doc(const T&)>& f) {
-  std::vector<Doc> vec;
-  for (size_t i = 0; i < arr.size(); ++i) {
-    vec.push_back(f(arr[i]));
-  }
-  return vec;
-}
-
-template<typename T, typename Hash = std::hash<T>, typename Eq = std::equal_to<T>>
-class Counter {
-  std::unordered_map<T, size_t, Hash, Eq> cnt_;
-
- public:
-  Counter() = default;
-  Counter(const Counter&) = delete;
-  size_t operator()(const T& t) {
-    auto v = cnt_.count(t) == 0 ? 0 : cnt_.at(t) + 1;
-    cnt_[t] = v;
-    return v;
-  }
-};
-
-std::string Mangle(const std::string& str, size_t s) {
-  return str + "_" + std::to_string(s);
-  // return s == 0 ? str : str + "_" + std::to_string(s - 1);
-  // the above line look prettier but is dangerous:
-  // suppose we have x, x, x_0. mangling will give x, x_0, x_0!
-  // the save approach give x_0, x_1, x_0_1, and in fact never clash:
-  // stripping _([0-9]*) is invert of mangle under all circumstances.
-  // another problem is we need to prevent Var/TypeParam/GlobalVar clashing each other.
-}
-
-constexpr size_t indent = 2;
-
-struct TypeParamName {
-  bool operator==(const TypeParamName&) const {
-    return true;
-  }
-};
-
-struct mhash {
-  size_t operator()(const ::tvm::relay::TypeParamName&) const noexcept {
-    return 0;
-  }
-};
-
-class TypeDocifier : private TypeFunctor<Doc(const Type& n)> {
-  Environment env;
-  Counter<TypeParamName, mhash> cnt;
-  std::unordered_map<TypeParam, Doc, NodeHash, NodeEqual> map;
-
-  std::vector<Doc> DocifyTypeArray(const tvm::Array<Type>& arr) {
-    return MapDocify<Type>(arr, [=](const Type& t) { return Docify(t); });
-  }
-
-  std::vector<Doc> DocifyTypeParam(const tvm::Array<TypeParam>& arr) {
-    return MapDocify<TypeParam>(arr, [=](const TypeParam& tp) {
-        return Docify(tp);
-      });
-  }
-
-  std::vector<Doc> DocifyTypeConstraint(const tvm::Array<TypeConstraint>& arr) {
-    return MapDocify<TypeConstraint>(arr, [=](const TypeConstraint& tc) { return Docify(tc); });
-  }
-
-  Doc VisitType_(const TensorTypeNode* t) final {
-    return DocOfStr("tensor");
-  }
-
-  Doc VisitType_(const TypeParamNode* p) final {
-    auto tp = GetRef<TypeParam>(p);
-    if (map.count(tp) == 0) {
-      auto name =
-        DocOfStr(Mangle("tp", cnt(TypeParamName())) +
-                 std::string(":")) +
-        KindDocify(p->kind);
-      map.insert(std::pair<TypeParam, Doc>(tp, name));
-    }
-    return map.at(tp);
-  }
-
-  Doc Quantify(const tvm::Array<TypeParam>& tp, const Doc& d) {
-    if (tp.size() == 0) {
-      return d;
-    }
-    return Seq("forall", DocifyTypeParam(tp), ",") + Sep() + d;
-  }
-
-  Doc Constraint(const tvm::Array<TypeConstraint>& tc, const Doc& d) {
-    if (tc.size() == 0) {
-      return d;
-    }
-    return Seq("(", DocifyTypeConstraint(tc), ") =>") + Sep() + d;
-  }
-
-  Doc VisitType_(const FuncTypeNode* f) final {
-    auto inner = Seq("<", DocifyTypeArray(f->arg_types), ">") + Sep() +
-                 DocOfStr("->") + Sep() + Docify(f->ret_type);
-    return Group(Quantify(f->type_params,
-                          Constraint(f->type_constraints, inner)));
-  }
-
-  Doc VisitType_(const TypeRelationNode* r) final {
-    return DocOfStr("Relation") + Seq("(", DocifyTypeArray(r->args), ")");
-  }
-
-  Doc VisitType_(const TupleTypeNode* t) final {
-    return Seq("<", DocifyTypeArray(t->fields), ">");
-  }
-
-  Doc VisitType_(const IncompleteTypeNode* i) final {
-    return DocOfStr("_");
-  }
-
- public:
-  TypeDocifier(const Environment& env) : env(env) { }
-
-  Doc Docify(const Type& t) { return t.get() ? (*this)(t) : DocOfStr("_"); }
-};
-
-class ExprDocifier : private ExprFunctor<Doc(const Expr& n)> {
-  Environment env;
-  Counter<std::string> cnt;
-  std::unordered_map<Var, std::string, NodeHash, NodeEqual> map;
-  TypeDocifier td;
-
-  std::string VarName(const Var& v) {
-    if (map.count(v) == 0) {
-      map.insert(std::pair<Var, std::string>(v, Mangle(v->name_hint, cnt(v->name_hint))));
-    }
-    return map.at(v);
-  }
-
-  Doc TypeAnnotation(const Doc& d, const Type& t) {
-    // test for t being null. probably shouldnt has null. should talk to jared.
-    if (!t.get() || t.as<IncompleteTypeNode>()) {
-      return d;
-    } else {
-      return d + DocOfStr(":") + td.Docify(t);
-    }
-  }
-
-  std::vector<Doc> DocifyExprArray(const tvm::Array<Expr>& arr) {
-    std::vector<Doc> vec;
-    for (size_t i = 0; i < arr.size(); ++i) {
-      vec.push_back(Docify(arr[i]));
-    }
-    return vec;
-  }
-
-  std::vector<Doc> DocifyParamArray(const tvm::Array<Var>& arr) {
-    std::vector<Doc> vec;
-    for (Var param : arr) {
-      vec.emplace_back(TypeAnnotation(DocOfStr(VarName(param)),
-                                      param->type_annotation));
-    }
-    return vec;
-  }
-
-  Doc VisitExpr_(const ConstantNode* c) final {
-    return DocOfStr("some_constant");
-  }
-
-  Doc VisitExpr_(const TupleNode* t) final {
-    return Seq("<", DocifyExprArray(t->fields), ">");
-  }
-
-  Doc VisitExpr_(const VarNode* v) final {
-    return DocOfStr(VarName(GetRef<Var>(v)));
-  }
-
-  Doc VisitExpr_(const GlobalVarNode* g) final {
-    return DocOfStr(g->name_hint);
-  }
-
-  Doc VisitExpr_(const FunctionNode* f) final {
-    return Group(TypeAnnotation(Seq("(", DocifyParamArray(f->params), ")"), f->ret_type) + Sep() +
-                 DocOfStr("=>") + Sep() +
-                 Block(indent, "{", Docify(f->body), "}"));
-  }
-
-  Doc VisitExpr_(const CallNode* c) final {
-    return Docify(c->op) + Seq("<", DocifyExprArray(c->args), ">");
-  }
-
-  Doc VisitExpr_(const LetNode* l) final {
-    return Group(DocOfStr("let") + Sep() +
-                 TypeAnnotation(Docify(l->var), l->var->type_annotation) + Sep() +
-                 DocOfStr("=") + Sep() + Docify(l->value) + DocOfStr(";") + Endl() +
-                 Docify(l->body));
-  }
-
-  Doc VisitExpr_(const IfNode* i) final {
-    return Group(DocOfStr("if") + Sep() + Docify(i->cond) + Sep() +
-                 Block(indent, "{", Docify(i->true_branch), "}") + Sep() +
-                 DocOfStr("else") + Sep() +
-                 Block(indent, "{", Docify(i->false_branch), "}"));
-  }
-
-  Doc VisitExpr_(const OpNode* o) final {
-    return DocOfStr(o->name);
-  }
-
-  Doc VisitExpr_(const TupleGetItemNode* g) final {
-    return Docify(g->tuple) + DocOfStr(std::string(".") + std::to_string(g->index));
-  }
-
- public:
-  ExprDocifier(const Environment& env) : env(env), td(env) { }
-
-  Doc Docify(const Expr& e) { return (*this)(e); }
-};
-
-Doc DocOfExpr(const Environment& env, const Expr& expr) {
-  ExprDocifier d(env);
-  return d.Docify(expr);
-}
-
-Doc DocOfType(const Environment& env, const Type& expr) {
-  TypeDocifier d(env);
-  return d.Docify(expr);
-}
-
-RDoc ExprRDoc(const Environment& env, const Expr& expr) {
-  return Layout(DocOfExpr(env, expr));
-}
-
-RDoc TypeRDoc(const Environment& env, const Type& expr) {
-  return Layout(DocOfType(env, expr));
-}
-
-std::ostream & DebugPrint(const Environment& env, const Expr& e, std::ostream& os) {
-  return os << ExprRDoc(env, e);
-}
-
-std::ostream & DebugPrint(const Environment& env, const Type& t, std::ostream& os) {
-  return os << TypeRDoc(env, t);
-}
-
-std::string PrintExpr(const Environment& env, const Expr& e) {
-  std::stringstream ss;
-  ss << ExprRDoc(env, e);
-  return ss.str();
-}
-
-std::string PrintType(const Environment& env, const Type& t) {
-  std::stringstream ss;
-  ss << TypeRDoc(env, t);
-  return ss.str();
-}
-
-TVM_REGISTER_API("relay._expr._debug_print")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-    NodeRef x = args[1];
-    if (x.as<TypeNode>()) {
-      *ret = PrintType(args[0], Downcast<Type>(x));
-    } else {
-      *ret = PrintExpr(args[0], Downcast<Expr>(x));
-    }
-  });
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/ir/doc.h b/src/relay/ir/doc.h
deleted file mode 100644
index 1837eedd6006..000000000000
--- a/src/relay/ir/doc.h
+++ /dev/null
@@ -1,514 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file doc.h
- * \brief A pretty printer DSL for constructing (Doc) and formatting (RDoc) documents.
- *        It is based heavily on Philip Wadler's "A prettier printer."
- *        See https://homepages.inf.ed.ac.uk/wadler/papers/prettier/prettier.pdf
- *        for more details.
- *
- * Since the original paper uses call by value for efficiency, everything doc function is maximally lazy.
- * You can probably yank speed by doing strict analysis and removing some Lazy (if this is bottleneck).
- */
-#ifndef TVM_RELAY_IR_DOC_H_
-#define TVM_RELAY_IR_DOC_H_
-
-#include <tvm/relay/error.h>
-#include <unordered_map>
-#include <utility>
-#include <string>
-#include <functional>
-#include <vector>
-#include <memory>
-#include <ostream>
-#include <map>
-
-namespace tvm {
-namespace relay {
-
-/*! \brief A Document represent structured text.
- * beside having unstructured string, it capture different ways to compose them -
- * line break, space, indentation, representation choice.
- */
-struct Doc;
-
-/*! \brief RDoc represent rendered document.
- * all the high level detail on the document, such as indentation, choice, has been removed.
- * there is only one single, straight forward way to print it.
- */
-struct RDoc;
-
-//! \brief Empty document
-inline Doc Nil();
-
-//! \brief Concatenate two documents
-inline Doc App(const Doc& l, const Doc& r);
-
-//! \brief Indent a document
-inline Doc Nest(size_t width, const Doc& doc);
-
-//! \brief Lift string to a document
-inline Doc DocOfStr(const std::string& text);
-
-//! \brief New line
-inline Doc Endl();
-
-//! \brief Remove all line break from the Document.
-inline Doc Flatten(const Doc& d);
-
-/*! \brief Choose between two possible layouts.
- * assume Flatten(l) == Flatten(r), and l need to be more compact.
- */
-inline Doc Choose(const Doc& l, const Doc& r);
-
-//! \brief Use a single line if possible
-inline Doc Group(const Doc& d);
-
-//! \brief print an RDoc
-inline std::ostream& operator<<(std::ostream& os, const RDoc& rdoc);
-
-/*! \brief Joins a vector of documents with a given separator document
- *  \example Join(["a", "b, "c"], ", ") => "a, b, c"
- *  \param vec the vector of documents
- *  \param sep the separator between documents
- */
-inline Doc Join(const std::vector<Doc>& vec, const Doc& sep);
-
-/*! \brief Creates an indented block.
- *  \param indent the indentation size
- *  \param open the opening string
- *  \param body the body of the block
- *  \param close the closing string
- */
-inline Doc Block(size_t indent, const std::string& open,
-                 const Doc& body, const std::string& close);
-
-/*! \brief Creates a comma-separated sequence with opening and closing strings.
- *  \param open the opening string
- *  \param body the body of the Block
- *  \param close the closing string
- */
-inline Doc Seq(const std::string& open,
-               const std::vector<Doc>& body, const std::string& close);
-
-//! \brief Either a space or a new line
-inline Doc Sep();
-
-/*! \brief Layout a document to a given width
- *  \param d the document to render
- *  \param width the line width
- */
-inline RDoc Layout(const Doc& d, size_t width = 80);
-
-// end of API, start of implementation
-
-template<typename T>
-struct LazyNode {
-  mutable std::function<T()> thunk;
-  explicit LazyNode(const std::function<T()>& thunk) : thunk(thunk) { }
-};
-
-//! \brief denote a value that will be computed (at most once) on need.
-template<typename T>
-struct Lazy {
-  std::shared_ptr<LazyNode<T> > lazy_node;
-  explicit Lazy(const std::function<T()>& thunk) :
-    lazy_node(std::make_shared<LazyNode<T>>(thunk)) { }
-  explicit Lazy(const T& value) : Lazy([=]() { return value; }) { }
-  explicit Lazy(const Lazy<Lazy<T>>& thunk) : Lazy([=]() { return thunk.get().get(); }) { }
-  // calculate the result.
-  // memoize it by replacing the thunk with a constant function which immediate return.
-  T get() const {
-    T res = lazy_node->thunk();
-    lazy_node->thunk = [=]() { return res; };
-    return res;
-  }
-  template<typename R>
-  Lazy<R> map(const std::function<R(const T&)>& func) const {
-    Lazy<T> self(*this);
-    return Lazy<R>([=]() -> R { return func(self.get()); });
-  }
-};
-
-struct NilNode;
-struct AppNode;
-struct NestNode;
-struct TextNode;
-struct LineNode;
-struct ChoiceNode;
-
-/*! \brief The inner representation of Doc.
- * a doc represent structured text,
- * and can be rendered onto screen while keeping the structure.
- */
-struct DocNode {
-  /* a docnode is a union of the below node.
-   * exactly one of them will be non null.
-   * their meaning is denoted by the construction function of the same name.
-   * so for example, the meaning of AppNode is exactly a node construct by App.
-   */
-  std::shared_ptr<NilNode> nil;
-  std::shared_ptr<AppNode> app;
-  std::shared_ptr<NestNode> nest;
-  std::shared_ptr<TextNode> text;  // construct by DocOfStr
-  std::shared_ptr<LineNode> line;
-  std::shared_ptr<ChoiceNode> choice;
-  DocNode(std::shared_ptr<NilNode> nil,
-           std::shared_ptr<AppNode> app,
-           std::shared_ptr<NestNode> nest,
-           std::shared_ptr<TextNode> text,
-           std::shared_ptr<LineNode> line,
-           std::shared_ptr<ChoiceNode> choice) :
-    nil(nil),
-    app(app),
-    nest(nest),
-    text(text),
-    line(line),
-    choice(choice) { }
-};
-
-struct Doc {
-  Lazy<DocNode> doc;
-  explicit Doc(const DocNode& ed) : doc(ed) { }
-  explicit Doc(const Lazy<Doc>& ldoc) :
-    doc(ldoc.map<Lazy<DocNode> >([](const Doc& d){ return d.doc; })) { }
-
-  Doc operator+(const Doc& r) const {
-    return App(*this, r);
-  }
-
-  template<typename T>
-  Lazy<T> Match(
-    const std::function<T()>& nilf,
-    const std::function<T(const Doc&, const Doc&)>& appf,
-    const std::function<T(size_t, const Doc&)>& nestf,
-    const std::function<T(const std::string&)>& textf,
-    const std::function<T()>& linef,
-    const std::function<T(const Doc&, const Doc&)>& choicef) const;
-};
-
-struct NilNode { };
-
-struct AppNode {
-  Doc left, right;
-  AppNode(const Doc& left, const Doc& right) : left(left), right(right) { }
-};
-
-struct NestNode {
-  size_t space;
-  Doc doc;
-  NestNode(size_t space, const Doc& doc) : space(space), doc(doc) { }
-};
-
-struct TextNode {
-  std::string text;
-  explicit TextNode(const std::string& text) : text(text) { }
-};
-
-struct LineNode { };
-
-struct ChoiceNode {
-  Doc left, right;
-  ChoiceNode(const Doc& left, const Doc& right) : left(left), right(right) { }
-};
-
-template<typename T>
-Lazy<T> Doc::Match(
-    const std::function<T()>& nilf,
-    const std::function<T(const Doc&, const Doc&)>& appf,
-    const std::function<T(size_t, const Doc&)>& nestf,
-    const std::function<T(const std::string&)>& textf,
-    const std::function<T()>& linef,
-    const std::function<T(const Doc&, const Doc&)>& choicef) const {
-    return doc.map<T>([=](const DocNode& d) {
-      if (d.nil) {
-        return nilf();
-      } else if (d.app) {
-        return appf(d.app->left, d.app->right);
-      } else if (d.nest) {
-        return nestf(d.nest->space, d.nest->doc);
-      } else if (d.text) {
-        return textf(d.text->text);
-      } else if (d.line) {
-        return linef();
-      } else {
-        return choicef(d.choice->left, d.choice->right);
-      }
-    });
-}
-
-//! \brief Empty document
-inline Doc Nil() {
-  return Doc(DocNode(std::make_shared<NilNode>(), nullptr, nullptr, nullptr, nullptr, nullptr));
-}
-
-//! \brief Concatenate two documents
-inline Doc App(const Doc& l, const Doc& r) {
-  return Doc(DocNode(
-    nullptr,
-    std::make_shared<AppNode>(l, r),
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr));
-}
-
-//! \brief Indent a document
-inline Doc Nest(size_t width, const Doc& doc) {
-  auto x = std::make_shared<NestNode>(width, doc);
-  return Doc(DocNode(
-    nullptr,
-    nullptr,
-    std::make_shared<NestNode>(width, doc),
-    nullptr,
-    nullptr,
-    nullptr));
-}
-
-//! \brief Lift string to a document
-inline Doc DocOfStr(const std::string& text) {
-  return Doc(DocNode(nullptr, nullptr, nullptr,
-    std::make_shared<TextNode>(text), nullptr, nullptr));
-}
-
-//! \brief New line
-inline Doc Endl() {
-  return Doc(DocNode(nullptr, nullptr, nullptr, nullptr, std::make_shared<LineNode>(), nullptr));
-}
-
-/*! \brief Choose between two possible layouts.
- * assume Flatten(l) == Flatten(r), and l need to be more compact.
- */
-inline Doc Choose(const Doc& l, const Doc& r) {
-  return Doc(DocNode(
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    std::make_shared<ChoiceNode>(l, r)));
-}
-
-//! \brief Remove new line from the whole document.
-inline Doc Flatten(const Doc& d) {
-  return Doc(d.Match<Doc>(
-    []() { return Nil(); },
-    [](const Doc& l, const Doc& r) { return Flatten(l) + Flatten(r); },
-    [](size_t space, const Doc& doc) { return Flatten(doc); },
-    [](const std::string& str) { return DocOfStr(str); },
-    []() { return DocOfStr(" "); },
-    [](const Doc& l, const Doc& r) { return Flatten(l); }));
-}
-
-//! \brief Use a single line if possible
-inline Doc Group(const Doc& d) {
-  return Choose(Flatten(d), d);
-}
-
-struct RNilNode;
-struct RTextNode;
-struct RLineNode;
-
-struct RDocNode {
-  std::shared_ptr<RNilNode> rnil;
-  std::shared_ptr<RTextNode> rtext;
-  std::shared_ptr<RLineNode> rline;
-  RDocNode(const std::shared_ptr<RNilNode>& rnil,
-           const std::shared_ptr<RTextNode>& rtext,
-           const std::shared_ptr<RLineNode>& rline) :
-    rnil(rnil), rtext(rtext), rline(rline) { }
-};
-
-/*! \brief RDoc represent rendered document.
- * all the high level detail on the document, such as indentation, alternative, has been removed.
- * there is only one single, straight forward way to print it.
- */
-struct RDoc {
-  Lazy<RDocNode> doc;
-  explicit RDoc(const RDocNode& d) : doc(d) { }
-  explicit RDoc(const Lazy<RDoc>& ldoc) :
-    doc(ldoc.map<Lazy<RDocNode>>([](const RDoc& d){ return d.doc; })) { }
-  template<typename T>
-  Lazy<T> Match(
-    const std::function<T()> &rnilf,
-    const std::function<T(const std::string&, const RDoc&)>& rtextf,
-    const std::function<T(size_t, const RDoc&)>& rlinef) const;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const RDoc& rdoc) {
-  return *rdoc.Match<std::ostream*>(
-    [&]() { return & os; },
-    [&](const std::string& text, const RDoc& r) {
-      return & (os << text << r);
-    },
-    [&](size_t space, const RDoc& r) {
-      return & (os << std::endl << std::string(space, ' ') << r);
-    }).get();
-}
-
-struct RNilNode { };
-
-struct RTextNode {
-  std::string text;
-  RDoc rest;
-  RTextNode(const std::string& text, const RDoc& rest) : text(text), rest(rest) { }
-};
-
-struct RLineNode {
-  size_t space;
-  RDoc rest;
-  RLineNode(size_t space, const RDoc& rest) : space(space), rest(rest) { }
-};
-
-//! \brief Empty RDoc
-inline RDoc RNil() { return RDoc(RDocNode(std::make_shared<RNilNode>(), nullptr, nullptr)); }
-
-//! \brief RDoc that begin with std::string
-inline RDoc RText(const std::string& text, const RDoc& rest) {
-  return RDoc(RDocNode(nullptr, std::make_shared<RTextNode>(text, rest), nullptr));
-}
-
-//! \brief RDoc that begin with a new line, followed by space
-inline RDoc RLine(size_t space, const RDoc& rest) {
-  return RDoc(RDocNode(nullptr, nullptr, std::make_shared<RLineNode>(space, rest)));
-}
-
-template<typename T>
-Lazy<T> RDoc::Match(
-  const std::function<T()>& rnilf,
-  const std::function<T(const std::string&, const RDoc&)>& rtextf,
-  const std::function<T(size_t, const RDoc&)>& rlinef) const {
-  return doc.map<T>([=](const RDocNode& rdoc) {
-    if (rdoc.rnil) {
-      return rnilf();
-    } else if (rdoc.rtext) {
-      return rtextf(rdoc.rtext->text, rdoc.rtext->rest);
-    } else {
-      return rlinef(rdoc.rline->space, rdoc.rline->rest);
-    }
-  });
-}
-
-template<typename T>
-struct List;
-
-template<typename T>
-struct EagerList {
-  const std::shared_ptr<std::pair<T, List<T>>> cons;
-};
-
-//! \brief lazy list
-template<typename T>
-struct List {
-  Lazy<EagerList<T> > l;
-  List() : l([]() { return EagerList<T>({nullptr}); }) { }
-  List(const T& t, const List<T>& l) :
-    l([=]() { return EagerList<T>({std::make_shared<std::pair<T, List<T>>>(t, l)}); }) { }
-  template<typename R>
-  Lazy<R> Match(const std::function<R()>& nullf,
-                const std::function<R(const T&, const List<T>&)>& consf) const {
-    return l.template map<R>([=](const EagerList<T>& l) {
-        if (l.cons) {
-          return consf(l.cons->first, l.cons->second);
-        } else {
-          return nullf();
-        }
-    });
-  }
-};
-
-//! \brief Does x fit into line of size w?
-inline bool Fits(int w, const RDoc& x) {
-  return (w >= 0) && x.Match<bool>(
-    []() { return true; },
-    [=](const std::string& s, const RDoc& x) { return Fits(w - s.size(), x); },
-    [](size_t space, const RDoc& x) { return true; }).get();
-}
-
-//! \brief Choose the one that fits best.
-inline RDoc Better(size_t w, size_t k, const RDoc& x, const RDoc& y) {
-  return Fits(w-k, x) ? x : y;
-}
-
-typedef std::pair<size_t/*indent size*/, Doc> best_arg;
-inline RDoc Best(size_t w/*wrap width*/, size_t k/*space used*/,
-  const List<best_arg>& l/*to be rendered*/) {
-  return RDoc(l.Match<RDoc>(
-    []() { return RNil(); },
-    [=](const best_arg& p, const List<best_arg>& z) {
-      return RDoc(p.second.Match<RDoc>(
-        [=]() { return Best(w, k, z); },
-        [=](const Doc& x, const Doc& y) {
-          return Best(
-            w,
-            k,
-            List<best_arg>(best_arg(p.first, x), List<best_arg>(best_arg(p.first, y), z))); },
-        [=](size_t j, const Doc& x) {
-          return Best(w, k, List<best_arg>(best_arg(p.first + j, x), z)); },
-        [=](const std::string& text) { return RText(text, Best(w, k + text.size(), z)); },
-        [=]() { return RLine(p.first, Best(w, p.first, z)); },
-        [=](const Doc& x, const Doc& y) {
-          return Better(
-            w,
-            k,
-            Best(w, k, List<best_arg>(best_arg(p.first, x), z)),
-            Best(w, k, List<best_arg>(best_arg(p.first, y), z))); }));
-    }));
-}
-
-/*! \brief Joins a vector of documents with a given separator document
- *  \example Join(["a", "b, "c"], ", ") => "a, b, c"
- *  \param vec the vector of documents
- *  \param sep the separator between documents
- */
-inline Doc Join(const std::vector<Doc>& vec, const Doc& sep) {
-  // https://www.safaribooksonline.com/library/view/c-cookbook/0596007612/ch04s09.html
-  Doc output = Nil();
-  for (auto p = vec.begin(); p != vec.end(); ++p) {
-    output = output + *p;
-    if (p != vec.end() - 1) {
-      output = output + sep;
-    }
-  }
-
-  return output;
-}
-
-/*! \brief Creates an indented block.
- *  \param indent the indentation size
- *  \param open the opening string
- *  \param body the body of the block
- *  \param close the closing string
- */
-inline Doc Block(size_t indent, const std::string& open,
-  const Doc& body, const std::string& close) {
-  return DocOfStr(open) + Nest(indent, Endl() + body) + Endl() + DocOfStr(close);
-}
-
-/*! \brief Creates a comma-separated sequence with opening and closing strings.
- *  \param open the opening string
- *  \param body the body of the Block
- *  \param close the closing string
- */
-inline Doc Seq(const std::string& open,
-  const std::vector<Doc>& body, const std::string& close) {
-  return Group(DocOfStr(open) +
-               Nest(open.size(), Join(body, DocOfStr(",") + Endl())) +
-               DocOfStr(close));
-}
-
-//! \brief Either a space or a new line
-inline Doc Sep() {
-  return Choose(DocOfStr(" "), Endl());
-}
-
-/*! \brief Layout a document to a given width
- *  \param d the document to render
- *  \param width the line width
- */
-inline RDoc Layout(const Doc& d, size_t width) {
-  return Best(width, 0, List<best_arg>(best_arg(0, d), List<best_arg>()));
-}
-
-}  // namespace relay
-}  // namespace tvm
-#endif  // TVM_RELAY_IR_DOC_H_
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index eeebbb32a9fe..8bda7587f217 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -100,6 +100,8 @@ void EnvironmentNode::Merge(const Environment &env) {
   }
 }
 
+TVM_REGISTER_NODE_TYPE(EnvironmentNode);
+
 TVM_REGISTER_API("relay._make.Environment")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = EnvironmentNode::make(args[0]);
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index c248ad0de6f7..a1d274e3a78e 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -17,6 +17,8 @@ Constant ConstantNode::make(runtime::NDArray data) {
   return Constant(n);
 }
 
+TVM_REGISTER_NODE_TYPE(ConstantNode);
+
 TVM_REGISTER_API("relay._make.Constant")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = ConstantNode::make(args[0]);
@@ -44,6 +46,8 @@ Tuple TupleNode::make(tvm::Array<relay::Expr> fields) {
   return Tuple(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TupleNode);
+
 TVM_REGISTER_API("relay._make.Tuple")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = TupleNode::make(args[0]);
@@ -61,6 +65,8 @@ Var VarNode::make(std::string name_hint, Type type_annotation) {
   return Var(n);
 }
 
+TVM_REGISTER_NODE_TYPE(VarNode);
+
 TVM_REGISTER_API("relay._make.Var")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = VarNode::make(args[0], args[1]);
@@ -82,6 +88,8 @@ GlobalVar GlobalVarNode::make(std::string name_hint) {
   return GlobalVar(n);
 }
 
+TVM_REGISTER_NODE_TYPE(GlobalVarNode);
+
 TVM_REGISTER_API("relay._make.GlobalVar")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = GlobalVarNode::make(args[0]);
@@ -94,13 +102,13 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 
 
 Function FunctionNode::make(tvm::Array<Var> params,
-                            Type ret_type,
                             Expr body,
+                            Type ret_type,
                             tvm::Array<TypeParam> type_params) {
   NodePtr<FunctionNode> n = make_node<FunctionNode>();
   n->params = std::move(params);
-  n->ret_type = std::move(ret_type);
   n->body = std::move(body);
+  n->ret_type = std::move(ret_type);
   n->type_params = std::move(type_params);
   return Function(n);
 }
@@ -113,6 +121,8 @@ FuncType FunctionNode::func_type_annotation() const {
   return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
 }
 
+TVM_REGISTER_NODE_TYPE(FunctionNode);
+
 TVM_REGISTER_API("relay._make.Function")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   *ret = FunctionNode::make(args[0], args[1], args[2], args[3]);
@@ -135,6 +145,8 @@ Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
   return Call(n);
 }
 
+TVM_REGISTER_NODE_TYPE(CallNode);
+
 TVM_REGISTER_API("relay._make.Call")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   *ret = CallNode::make(args[0], args[1], args[2], args[3]);
@@ -154,6 +166,8 @@ Let LetNode::make(Var var, Expr value, Expr body) {
   return Let(n);
 }
 
+TVM_REGISTER_NODE_TYPE(LetNode);
+
 TVM_REGISTER_API("relay._make.Let")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = LetNode::make(args[0], args[1], args[2]);
@@ -173,6 +187,8 @@ If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
   return If(n);
 }
 
+TVM_REGISTER_NODE_TYPE(IfNode);
+
 TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue *ret) {
   *ret = IfNode::make(args[0], args[1], args[2]);
 });
@@ -190,6 +206,8 @@ TupleGetItem TupleGetItemNode::make(Expr tuple, int index) {
   return TupleGetItem(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TupleGetItemNode);
+
 TVM_REGISTER_API("relay._make.TupleGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
   *ret = TupleGetItemNode::make(args[0], args[1]);
 });
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index c55e4d672b6c..26d9939aae10 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -92,7 +92,7 @@ Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
       body.same_as(op->body)) {
     return GetRef<Expr>(op);
   } else {
-    return FunctionNode::make(params, ret_type, body, ty_params);
+    return FunctionNode::make(params, body, ret_type, ty_params);
   }
 }
 
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
new file mode 100644
index 000000000000..6e3c3454e97b
--- /dev/null
+++ b/src/relay/ir/text_printer.cc
@@ -0,0 +1,749 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file text_printer.cc
+ * \brief Text printer to print relay in text form.
+ */
+#include <tvm/relay/environment.h>
+#include <tvm/relay/expr_functor.h>
+#include <sstream>
+#include "../pass/type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief the text value used in text printer.
+ * Defined as a struct for future compatibility reason
+ */
+struct TextValue {
+  /*! \brief The str representation */
+  std::string name;
+  // constructor
+  TextValue() {}
+  // constructor
+  explicit TextValue(std::string name) : name(name) {}
+};
+
+// operator overloading
+inline std::ostream& operator<<(std::ostream& os, const TextValue& val) {  // NOLINT(*)
+  return os << val.name;
+}
+
+/*!
+ * \brief Meta data context for TextPrinter.
+ *
+ * This is an important part to enable bi-directional serializability.
+ * We use tvm's Node system to build the current IR.
+ * It can be hard to design a text format for all the possible nodes
+ * as the set of nodes can grow when we do more extensions.
+ *
+ * Instead of trying to design readable text format for every nodes,
+ * we support a meta-data section in the text format.
+ * We allow the text format to refer to a node in the meta-data section.
+ *
+ * The meta-data section is a json serialized string of an Array<NodeRef>.
+ * Each element in the meta-data section can be referenced by the text format.
+ * Each meta data node is printed in the following format.
+ *
+ * meta.<type-key-of-node>(<index-in-meta-section>)
+ *
+ * Specifically, consider the following IR(constructed by python).
+ *
+ * \code
+ *
+ * n = tvm.var("n")
+ * x = tvm.relay.var("x", shape=(n, 1))
+ * f = tvm.relay.Function([x], x)
+ * print(f.astext())
+ *
+ * \endcode
+ *
+ * The corresponding text format is shown in the following code block.
+ *
+ * \code
+ *
+ * function(%x: Tensor[(meta.Variable(id=0),), float32]) {
+ *   %x
+ * }
+ * # Meta data section is a json-serialized string
+ * # of the following array.
+ * # [tvm.var("n")]
+ *
+ * \endcode
+ *
+ * Note that we store tvm.var("n") in the meta data section.
+ * Since it is stored in the index-0 in the meta-data seciton,
+ * we print it as meta.Variable(0).
+ *
+ * The text parser can recover this object by loading from the corresponding
+ * location in the meta data section.
+ *
+ * This is is a design trade-off.
+ * It allows us to embedded any meta-data in the text format,
+ * while still being able to tweak the text part of the printed IR easily.
+ */
+class TextMetaDataContext {
+ public:
+  /*!
+   * \brief Get text representation of meta node.
+   * \param node The node to be converted to meta node.
+   * \return A string representation of the meta node.
+   */
+  std::string GetMetaNode(const NodeRef& node) {
+    std::ostringstream os;
+    auto it = meta_index_.find(node);
+    int64_t index;
+    if (it != meta_index_.end()) {
+      index = it->second;
+    } else {
+      index = static_cast<int64_t>(meta_data_.size());
+      meta_data_.push_back(node);
+      meta_index_[node] = index;
+    }
+    os << "meta." << node->type_key() << "(id=" << index << ")";
+    return os.str();
+  }
+  /*!
+   * \brief Get the metadata section in json format.
+   * \return the meta datastring.
+   */
+  std::string GetMetaSection() const {
+    if (meta_data_.size() == 0) return std::string();
+    return SaveJSON(Array<NodeRef>(meta_data_));
+  }
+
+ private:
+  /*! \brief additional metadata stored in TVM json format */
+  std::vector<NodeRef> meta_data_;
+  /*! \brief map from meta data into its index */
+  std::unordered_map<NodeRef, int64_t, NodeHash, NodeEqual> meta_index_;
+};
+
+class TextPrinter :
+    public ExprFunctor<TextValue(const Expr&)> ,
+    public TypeFunctor<void (const Type&, std::ostream& os)>,  // NOLINT(*)
+    public AttrFunctor<void (const NodeRef&, std::ostream& os)> { // NOLINT(*)
+ public:
+  /*!
+   * \brief Print a node to string.
+   * \param node.
+   * \return The string representation.
+   */
+  std::string Print(const NodeRef& node) {
+    if (node.as<FunctionNode>()) {
+      this->PrintFunc(Downcast<Function>(node));
+    } else if (node.as<EnvironmentNode>()) {
+      this->PrintEnv(Downcast<Environment>(node));
+    } else if (node.as_derived<TypeNode>()) {
+      this->PrintType(Downcast<Type>(node), stream_);
+    } else if (node.as_derived<ExprNode>()) {
+      this->PrintExpr(Downcast<Expr>(node));
+    } else {
+      stream_ << node;
+    }
+    std::string meta_json = meta_.GetMetaSection();
+    if (meta_json.length() != 0) {
+      // append meta data in the end.
+      stream_ << "# meta data\n"
+              << "r\"\"\"\n"
+              << meta_json << "\n"
+              << "\"\"\"";
+    }
+    return stream_.str();
+  }
+
+  void PrintFunc(const Function& func) {
+    this->PrintFuncInternal("function", func);
+    stream_ << "\n";
+  }
+
+  void PrintEnv(const Environment& env) {
+    int counter = 0;
+    for (const auto& kv : env->functions) {
+      std::ostringstream os;
+      if (counter++ != 0) {
+        stream_ << "\n";
+      }
+      os << "def @" << kv.first->name_hint;
+      this->PrintFuncInternal(os.str(), kv.second);
+      stream_ << "\n";
+    }
+  }
+
+  void PrintExpr(const Expr& expr) {
+    TextValue val = GetValue(expr);
+    stream_ << val << "\n";
+  }
+
+  /*!
+   * \brief Get text representation of expr.
+   *
+   * This function may generate additional instructions
+   * in order to compute the final result id of expr.
+   *
+   * When trying to recursively print out an Expr.
+   * The caller should always call GetValue of its children first.
+   * Then the caller can print out to stream_ using the obtained value.
+   *
+   * This is to avoid the call of subsequent GetValue print out
+   * additional instructions which get mixed with the partial instruction
+   * printed by the caller.
+   *
+   * \param expr The input expression.
+   * \return The text value of Expr.
+   */
+  TextValue GetValue(const Expr& expr) {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) return it->second;
+    TextValue val = this->VisitExpr(expr);
+    memo_[expr] = val;
+    return val;
+  }
+  //------------------------------------
+  // Overload of Expr printing functions
+  //------------------------------------
+  TextValue VisitExpr_(const ConstantNode* op) final {
+    // Print out simple scalar directly.
+    if (op->is_scalar()) {
+      std::ostringstream os;
+      DataType dtype = TVMType2Type(op->data->dtype);
+      CHECK_EQ(op->data->ctx.device_type, kDLCPU);
+      if (dtype == Int(32)) {
+        return ConstScalar(dtype, static_cast<const int32_t*>(op->data->data));
+      } else if (dtype == Int(64)) {
+        return ConstScalar(dtype, static_cast<const int64_t*>(op->data->data));
+      } else if (dtype == Float(32)) {
+        return ConstScalar(dtype, static_cast<const float*>(op->data->data));
+      } else if (dtype == Float(64)) {
+        return ConstScalar(dtype, static_cast<const double*>(op->data->data));
+      }
+    }
+    // default fall-back, record it as meta node.
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = " << meta_.GetMetaNode(GetRef<NodeRef>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const TupleNode* op) final {
+    std::vector<TextValue> fields;
+    for (Expr field : op->fields) {
+      fields.push_back(GetValue(field));
+    }
+    // NOTE: always recursively visit to get ids,
+    // before print out the current line
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = (";
+    for (size_t i = 0; i < fields.size(); ++i) {
+      stream_ << fields[i];
+      if (i + 1 != fields.size()) {
+        stream_ << ", ";
+      }
+    }
+    stream_ << ')';
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const VarNode* op) final {
+    Var var = GetRef<Var>(op);
+    // This is an unbounded var.
+    TextValue val = AllocVarName(var);
+    this->PrintIndent();
+    stream_ << "free_var ";
+    this->PrintVarDecl(var, stream_);
+    this->PrintEndInst("\n");
+    return val;
+  }
+
+  TextValue VisitExpr_(const GlobalVarNode* op) final {
+    return TextValue('@' + op->name_hint);
+  }
+
+  TextValue VisitExpr_(const FunctionNode* op) final {
+    TextValue id = AllocTempVar();
+    std::ostringstream os;
+    os << id << " = function";
+    this->PrintFuncInternal(os.str(), GetRef<Function>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const CallNode* op) final {
+    // TODO(tqchen, M.K.): support generic call
+    // possibly through meta-data
+    CHECK_EQ(op->type_args.size(), 0U)
+        << "generic call not yet supported";
+    TextValue call_op = GetValue(op->op);
+    std::vector<TextValue> args;
+    for (Expr arg : op->args) {
+      args.emplace_back(GetValue(arg));
+    }
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = " << call_op << "(";
+    for (size_t i = 0; i < args.size(); ++i) {
+      stream_ << args[i];
+      if (i + 1 != args.size()) {
+        stream_ << ", ";
+      }
+    }
+    this->PrintCallAttrs(op->op, op->attrs, stream_);
+    stream_ << ")";
+    this->PrintEndInst("");
+    this->PrintOptionalInfo(GetRef<Expr>(op));
+    stream_ << '\n';
+    return id;
+  }
+
+  TextValue VisitExpr_(const LetNode* op) final {
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = ";
+    this->PrintScope(GetRef<Expr>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const IfNode* op) final {
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = ";
+    this->PrintScope(GetRef<Expr>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const OpNode* op) final {
+    return TextValue(op->name);
+  }
+
+  TextValue VisitExpr_(const TupleGetItemNode* op) final {
+    TextValue tuple = GetValue(op->tuple);
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = " << tuple << "[" << op->index << "]";
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  /*!
+   * \brief Print the type to os
+   * \param type The type to be printed.
+   * \param os The output type.
+   */
+  void PrintType(const Type& type, std::ostream& os) {  // NOLINT(*)
+    this->VisitType(type, os);
+  }
+  //------------------------------------
+  // Overload of Expr printing functions
+  //------------------------------------
+  void VisitType_(const TensorTypeNode* node, std::ostream& os) final {  // NOLINT(*)
+    // scalar type
+    if (node->shape.size() == 0) {
+      os << runtime::TVMType2String(Type2TVMType(node->dtype));
+      return;
+    }
+    os << "Tensor[(";
+    for (size_t i = 0; i < node->shape.size(); ++i) {
+      this->PrintAttr(node->shape[i], os);
+      if (i + 1 != node->shape.size()) {
+        os << ", ";
+      }
+    }
+    // conform to python tuple format (1,)
+    if (node->shape.size() == 1) {
+      os << ",";
+    }
+    os << "), " << runtime::TVMType2String(Type2TVMType(node->dtype)) << "]";
+  }
+
+  void VisitTypeDefault_(const Node* node, std::ostream& os) final {  // NOLINT(*)
+    // by default always print as meta-data
+    os << meta_.GetMetaNode(GetRef<NodeRef>(node));
+  }
+
+  /*!
+   * \brief Print an attribute value to os.
+   * \param value The value to be printed.
+   * \param os The output type.
+   */
+  void PrintAttr(const NodeRef& value, std::ostream& os) {  // NOLINT(*)
+    this->VisitAttr(value, os);
+  }
+  //------------------------------------
+  // Overload of Attr printing functions
+  //------------------------------------
+  void VisitAttr_(const ArrayNode* op, std::ostream& os) final {  // NOLINT(*)
+    os << "[";
+    for (size_t i = 0; i < op->data.size(); ++i) {
+      this->PrintAttr(NodeRef(op->data[i]), os);
+      if (i + 1 != op->data.size()) {
+        os << ", ";
+      }
+    }
+    os << "]";
+  }
+  void VisitAttrDefault_(const Node* op, std::ostream& os) final { // NOLINT(*)
+    os << meta_.GetMetaNode(GetRef<NodeRef>(op));
+  }
+
+  void VisitAttr_(const ir::IntImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::UIntImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::FloatImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::StringImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintString(op->value, os);
+  }
+
+ protected:
+  /*!
+   * \brief Print attributes after call.
+   * \param op The operator to be called.
+   * \param attrs The attributes.
+   * \param os The output stream.
+   */
+  void PrintCallAttrs(const Expr& op, const Attrs& attrs, std::ostream& os);  // NOLINT(*)
+
+  /*!
+   * \brief Print the a new scopr.
+   * \param body The body.
+   */
+  void PrintScope(Expr body) {
+    stream_ << "{\n";
+    int sid = this->BeginScope();
+    this->PrintScopeBody(body);
+    this->EndScope(sid);
+    this->PrintIndent();
+    stream_ << "}";
+  }
+  /*!
+   * \brief Print the body of a new scope without {}
+   *
+   * This function will keep printing continuous sequence
+   * of let/if scope without introducing a new scope in the text.
+   *
+   * \param body The body.
+   */
+  void PrintScopeBody(Expr body) {
+    if (const LetNode* let = body.as<LetNode>()) {
+      TextValue value = GetValue(let->value);
+      AllocVarName(let->var);
+      // let var = value;
+      this->PrintIndent();
+      stream_ << "let ";
+      this->PrintVarDecl(let->var, stream_);
+      stream_ << " = " << value;
+      this->PrintEndInst("\n");
+      this->PrintScopeBody(let->body);
+    } else if (const IfNode* ifnode = body.as<IfNode>()) {
+      TextValue cond = GetValue(ifnode->cond);
+      this->PrintIndent();
+      stream_ << "if (" << cond << ") ";
+      this->PrintScope(ifnode->true_branch);
+      this->PrintIndent();
+      stream_ << "else ";
+      this->PrintScope(ifnode->false_branch);
+      this->PrintEndInst("\n");
+    } else {
+      TextValue value = GetValue(body);
+      this->PrintIndent();
+      stream_ << value;
+      this->PrintEndInst("\n");
+    }
+  }
+
+  /*!
+   * \brief Internal function to print a function argument list and its body.
+   * \param prefix The prefix before argument list.
+   * \param fn The function to be printed.
+   */
+  void PrintFuncInternal(std::string prefix, const Function& fn) {
+    // TODO(tqchen, M.K.) support generic function
+    // Possibly through meta-data
+    CHECK_EQ(fn->type_params.size(), 0U)
+        << "generic fn not yet supported";
+    this->PrintIndent();
+    stream_ << prefix << "(";
+    size_t decl_indent = prefix.length() + 1;
+    for (size_t i = 0; i < fn->params.size(); ++i) {
+      if (i != 0) {
+        this->PrintIndent(decl_indent);
+      }
+      AllocVarName(fn->params[i]);
+      this->PrintVarDecl(fn->params[i], stream_);
+      if (i + 1 != fn->params.size()) {
+        stream_ << ",\n";
+      }
+    }
+    stream_ << ") ";
+    if (fn->ret_type.defined()) {
+      stream_ << " -> ";
+      this->PrintType(fn->ret_type, stream_);
+    }
+    this->PrintScope(fn->body);
+  }
+  /*!
+   * \brief Print additional info about expr in comment.
+   * \param expr The expression.
+   */
+  void PrintOptionalInfo(const Expr& expr) {
+    // additional information in comment.
+    if (expr->checked_type_.defined()) {
+      stream_ << " # ty=";
+      this->PrintType(expr->checked_type(), stream_);
+    }
+  }
+  /*!
+   * \brief print var_name[:type]
+   * \param var The variable to be printed
+   * \param os The output stream
+   */
+  void PrintVarDecl(const Var& var, std::ostream& os) {  // NOLINT(*)
+    TextValue v = GetValue(var);
+    os << v;
+    if (var->type_annotation.defined()) {
+      os << ": ";
+      this->PrintType(var->type_annotation, os);
+    }
+  }
+  /*!
+   * \brief Get a constant scalar value.
+   * \param dtype The data type.
+   * \param data The pointer to the data.
+   * \tparam T the content data type holding the data.
+   */
+  template<typename T>
+  TextValue ConstScalar(DataType dtype, const T* data) {
+    std::ostringstream os;
+    PrintConstScalar(dtype, data, os);
+    return TextValue(os.str());
+  }
+  /*!
+   * \brief special method to print out const scalar
+   * \param dtype The data type
+   * \param data The pointer to hold the data.
+   * \param os The output stream.
+   */
+  template<typename T>
+  void PrintConstScalar(DataType dtype, const T* data, std::ostream& os) {  // NOLINT(*)
+    if (dtype == Int(32)) {
+      os << data[0];
+    } else if (dtype == Float(32)) {
+      os << data[0] << 'f';
+    } else if (dtype == Bool()) {
+      PrintBool(data[0] != 0, os);
+    } else {
+      os << dtype << "(" << data[0] << ")";
+    }
+  }
+  /*!
+   * \brief Print constant bool value.
+   * \param value The value to be printed.
+   * \param os The output stream
+   */
+  void PrintBool(bool value, std::ostream& os) { // NOLINT(*)
+    if (value) {
+      os << "True";
+    } else {
+      os << "False";
+    }
+  }
+  /*!
+   * \brief Print constant string.
+   * \param value The value to be printed.
+   * \param os The output stream
+   */
+  void PrintString(const std::string& value, std::ostream& os) { // NOLINT(*)
+    // TODO(M.K.): add escape.
+    os << "\"" << value << "\"";
+  }
+  /*!
+   * \brief get a unique name with the corresponding prefix
+   * \param prefix The prefix of the name
+   * \return The returned name.
+   */
+  std::string GetUniqueName(std::string prefix) {
+    auto it = name_alloc_map_.find(prefix);
+    if (it != name_alloc_map_.end()) {
+      while (true) {
+        std::ostringstream os;
+        os << prefix << (++it->second);
+        std::string name = os.str();
+        if (name_alloc_map_.count(name) == 0) {
+          prefix = name;
+          break;
+        }
+      }
+    }
+    name_alloc_map_[prefix] = 0;
+    return prefix;
+  }
+  /*!
+   * \brief mark the beginning of a new scope
+   * \return The scope id.
+   */
+  int BeginScope() {
+    int sid = static_cast<int>(scope_valid_.size());
+    scope_valid_.push_back(true);
+    indent_ += 2;
+    return sid;
+  }
+  /*!
+   * \brief mark the end of an old scope.
+   * \param scope_id The scope id to be ended.
+   */
+  void EndScope(int scope_id) {
+    scope_valid_[scope_id] = false;
+    indent_ -= 2;
+  }
+  /*!
+   * \brief Print the indent to the stream.
+   * \param more_indent More indentation besides the current one.
+   */
+  void PrintIndent(int64_t more_indent = 0) {
+    for (int i = 0; i < indent_ + more_indent; ++i) {
+      stream_ << ' ';
+    }
+  }
+  /*!
+   * \brief print end of the line.
+   */
+  void PrintEndInst(const char* suffix) {
+    stream_ << suffix;
+  }
+  /*!
+   * \brief Allocate temporary value
+   * \return A new text value.
+   */
+  TextValue AllocTempVar() {
+    std::ostringstream os;
+    os << '%' << temp_var_counter_++;
+    return TextValue(os.str());
+  }
+  /*!
+   * \brief Allocate name to a variable.
+   * \param var The input variable.
+   * \return The corresponding name.
+   */
+  TextValue AllocVarName(const Var& var) {
+    std::string name = GetUniqueName('%' + var->name_hint);
+    TextValue val(name);
+    CHECK(!memo_.count(var));
+    memo_[var] = val;
+    return val;
+  }
+
+ private:
+  class AttrPrinter;
+  friend class AttrPrinter;
+  /*! \brief meta data context */
+  TextMetaDataContext meta_;
+  /*! \brief Check whether scope is still valid */
+  std::vector<bool> scope_valid_;
+  /*! \brief The current indentation value */
+  int indent_{0};
+  /*! \brief name allocation map */
+  std::unordered_map<std::string, int> name_alloc_map_;
+  /*! \brief Map from expression to its text value */
+  std::unordered_map<Expr, TextValue, NodeHash, NodeEqual> memo_;
+  /*! \brief counter of temporary variable */
+  int64_t temp_var_counter_{0};
+  /*! \brief Output stream */
+  std::ostringstream stream_;
+};
+
+/*!
+ * \brief Attribute printer which prints the attributes in the call.
+ */
+class TextPrinter::AttrPrinter: public AttrVisitor {
+ public:
+  AttrPrinter(std::ostream& stream, TextPrinter* parent)  // NOLINT(*)
+      : stream_(stream), parent_(parent) {}
+
+  void Visit(const char* key, double* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, int64_t* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, uint64_t* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, int* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, bool* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintBool(value[0], stream_);
+  }
+  void Visit(const char* key, std::string* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintString(value[0], stream_);
+  }
+  void Visit(const char* key, void** value) final {
+    LOG(FATAL) << "do not allow void as argument";
+  }
+  void Visit(const char* key, DataType* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintString(runtime::TVMType2String(Type2TVMType(value[0])), stream_);
+  }
+  void Visit(const char* key, NodeRef* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintAttr(value[0], stream_);
+  }
+  void Visit(const char* key, runtime::NDArray* value) final {
+    LOG(FATAL) << "do not allow NDarray as argument";
+  }
+
+ private:
+  void PrintSep() {
+    stream_ << ", ";
+  }
+  std::ostream& stream_;  // NOLINT(*)
+  TextPrinter* parent_;
+};
+
+void TextPrinter::PrintCallAttrs(const Expr& op,
+                                 const Attrs& attrs,
+                                 std::ostream& os) {  // NOLINT(*)
+  if (!attrs.defined()) return;
+  if (const auto* op_node = op.as<OpNode>()) {
+    if (attrs->type_index() == op_node->attrs_type_index) {
+      AttrPrinter printer(os, this);
+      const_cast<BaseAttrsNode*>(attrs.operator->())
+          ->VisitNonDefaultAttrs(&printer);
+      return;
+    }
+  }
+  os << ", " << meta_.GetMetaNode(attrs);
+}
+
+std::string RelayPrint(const NodeRef& node) {
+  return TextPrinter().Print(node);
+}
+
+TVM_REGISTER_API("relay._expr._text_print")
+.set_body_typed<std::string(const NodeRef&)>(RelayPrint);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index a5af539947f0..f45ab3b4c9a7 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -22,6 +22,8 @@ TensorType TensorTypeNode::Scalar(DataType dtype) {
   return TensorTypeNode::make({}, dtype);
 }
 
+TVM_REGISTER_NODE_TYPE(TensorTypeNode);
+
 TVM_REGISTER_API("relay._make.TensorType")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   Array<IndexExpr> shape = args[0];
@@ -30,8 +32,8 @@ TVM_REGISTER_API("relay._make.TensorType")
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<TensorTypeNode>([](const TensorTypeNode *node,
-                                     tvm::IRPrinter *p) {
-  p->stream << "TensorTypeNode(" << node->dtype << ", " << node->shape << ")";
+                                 tvm::IRPrinter *p) {
+  p->stream << "TensorType(" << node->shape << ", " << node->dtype << ")";
 });
 
 TypeParam TypeParamNode::make(std::string name, TypeParamNode::Kind kind) {
@@ -41,6 +43,8 @@ TypeParam TypeParamNode::make(std::string name, TypeParamNode::Kind kind) {
   return TypeParam(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TypeParamNode);
+
 TVM_REGISTER_API("relay._make.TypeParam")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   int kind = args[1];
@@ -61,6 +65,8 @@ IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
   return IncompleteType(n);
 }
 
+TVM_REGISTER_NODE_TYPE(IncompleteTypeNode);
+
 TVM_REGISTER_API("relay._make.IncompleteType")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     int kind = args[0];
@@ -86,6 +92,8 @@ FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
   return FuncType(n);
 }
 
+TVM_REGISTER_NODE_TYPE(FuncTypeNode);
+
 TVM_REGISTER_API("relay._make.FuncType")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   *ret = FuncTypeNode::make(args[0], args[1], args[2], args[3]);
@@ -111,6 +119,8 @@ TypeRelation TypeRelationNode::make(TypeRelationFn func,
   return TypeRelation(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TypeRelationNode);
+
 TVM_REGISTER_API("relay._make.TypeRelation")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = TypeRelationNode::make(args[0], args[1], args[2], args[3]);
@@ -129,6 +139,8 @@ TupleType TupleTypeNode::make(Array<Type> fields) {
   return TupleType(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TupleTypeNode);
+
 TVM_REGISTER_API("relay._make.TupleType")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = TupleTypeNode::make(args[0]);
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index 4717e3fe0803..b573a2981c39 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -11,7 +11,7 @@
 namespace tvm {
 namespace relay {
 
-TVM_REGISTER_NODE_TYPE(ConvAttrs);
+TVM_REGISTER_NODE_TYPE(Conv2DAttrs);
 
 bool Conv2DRel(const Array<Type>& types,
                int num_inputs,
@@ -25,7 +25,7 @@ bool Conv2DRel(const Array<Type>& types,
   static const Layout kNCHW("NCHW");
   static const Layout kOIHW("OIHW");
 
-  const ConvAttrs* param = attrs.as<ConvAttrs>();
+  const Conv2DAttrs* param = attrs.as<Conv2DAttrs>();
   CHECK(param != nullptr);
   const Layout in_layout(param->data_layout);
   const Layout kernel_layout(param->weight_layout);
@@ -113,7 +113,7 @@ Expr MakeConv2D(Expr data,
                 std::string weight_layout,
                 std::string out_layout,
                 DataType out_dtype) {
-  auto attrs = make_node<ConvAttrs>();
+  auto attrs = make_node<Conv2DAttrs>();
   attrs->strides = std::move(strides);
   attrs->padding = std::move(padding);
   attrs->dilation = std::move(dilation);
@@ -148,6 +148,7 @@ with the layer input to produce a tensor of outputs.
             (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.Conv2DAttrs")
 .set_num_inputs(2)
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("weight", "Tensor", "The weight tensor.")
@@ -296,6 +297,7 @@ v            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
                 out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1]
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.Conv2DTransposeAttrs")
 .set_num_inputs(2)
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("weight", "Tensor", "The weight tensor.")
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index dc5ce2e567d0..4a8df2c80ec3 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -78,6 +78,7 @@ RELAY_REGISTER_OP("nn.dense")
 - **out**: `(x1, x2, ..., xn, units)`.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.DenseAttrs")
 .set_num_inputs(2)
 .add_argument("data", "nD Tensor", "Input data.")
 .add_argument("weight", "2D Tensor", "Weight matrix.")
@@ -107,6 +108,7 @@ RELAY_REGISTER_OP("nn.leaky_relu")
 `y = x > 0 ? x : alpha * x`
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LeakyReluAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "Input data.")
 .set_support_level(3)
@@ -135,6 +137,7 @@ RELAY_REGISTER_OP("nn.softmax")
 
 - **data**: The input data
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SoftmaxAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
@@ -163,6 +166,7 @@ RELAY_REGISTER_OP("nn.log_softmax")
 
 - **data**: The input data
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SoftmaxAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
@@ -171,9 +175,9 @@ RELAY_REGISTER_OP("nn.log_softmax")
 
 // BatchFlatten
 bool BatchFlattenRel(const Array<Type>& types,
-               int num_inputs,
-               const Attrs& attrs,
-               const TypeReporter& reporter) {
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;
@@ -278,6 +282,7 @@ centered at that value (zero padding is added where necessary).
 
 - **data**: The input tensor.
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LRNAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
@@ -296,12 +301,12 @@ Expr MakeL2Normalize(Expr data,
 }
 
 TVM_REGISTER_API("relay.op.nn._make.l2_normalize")
-  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 3>(MakeL2Normalize, args, rv);
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeL2Normalize, args, rv);
   });
 
 RELAY_REGISTER_OP("nn.l2_normalize")
-    .describe(R"code(L2 Normalization layer.
+.describe(R"code(L2 Normalization layer.
 
 Normalizes along dimension axis using an L2 norm
 
@@ -352,6 +357,7 @@ During training, each element of the input is set to zero with probability ``p``
 The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input unchanged.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.DropoutAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "Input to which dropout will be applied.")
 .set_support_level(1)
@@ -478,6 +484,7 @@ axis to be the last item in the input shape.
 .. note::
     This operator can be optimized away for inference.
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.BatchNormAttrs")
 .set_num_inputs(5)
 .add_argument("data", "Tensor", "Input to which batch_norm will be applied.")
 .add_argument("gamma", "Tensor", "The gamma scale factor.")
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index 667a5be90fc8..b67bb96c64a9 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -60,7 +60,7 @@ bool PadRel(const Array<Type>& types,
 }
 
 // Handler to create a call to the padding op used by front-end FFI
-  Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
+Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
   auto attrs = make_node<PadAttrs>();
   attrs->pad_value = pad_value;
   attrs->pad_width = std::move(pad_width);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 956883476d09..ea67199f4760 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -76,6 +76,7 @@ RELAY_REGISTER_OP("expand_dims")
 
 )code" TVM_ADD_FILELINE)
 .set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ExpandDimsAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
 .add_type_rel("ExpandDims", ExpandDimsRel);
@@ -481,6 +482,7 @@ RELAY_REGISTER_OP("zeros")
 .describe(R"code(Fill array with zeros.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
 .set_num_inputs(0)
 .set_support_level(3)
 .add_type_rel("InitOp", InitOpRel);
@@ -503,6 +505,7 @@ RELAY_REGISTER_OP("ones")
 .describe(R"code(Fill array with ones.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
 .set_num_inputs(0)
 .set_support_level(3)
 .add_type_rel("InitOp", InitOpRel);
@@ -697,6 +700,7 @@ RELAY_REGISTER_OP("squeeze")
 
 )code" TVM_ADD_FILELINE)
 .set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.SqueezeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
 .add_type_rel("Squeeze", SqueezeRel);
diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc
index 2e2eca1f2739..5d153c606e63 100644
--- a/src/relay/pass/dead_code.cc
+++ b/src/relay/pass/dead_code.cc
@@ -74,7 +74,10 @@ class CalcDep : private ExprMutator {
   }
 
   Expr VisitExpr_(const FunctionNode* f) final {
-    return FunctionNode::make(f->params, f->ret_type, Eliminate(f->body), f->type_params);
+    return FunctionNode::make(f->params,
+                              Eliminate(f->body),
+                              f->ret_type,
+                              f->type_params);
   }
 
   // generate the let list from dependency graph
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index 70a2d9347eab..81f93cacaa80 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -20,6 +20,7 @@ class TypeFunctor;
 #define TYPE_FUNCTOR_DEFAULT \
   { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
 
+
 #define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                                   \
   vtable.template set_dispatch<OP>(                                       \
       [](const NodeRef& n, TSelf* self, Args... args) {                   \
diff --git a/tests/python/relay/test_ir_debug_printer.py b/tests/python/relay/test_ir_debug_printer.py
deleted file mode 100644
index b8aa86a87638..000000000000
--- a/tests/python/relay/test_ir_debug_printer.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import tvm
-from tvm import relay
-from tvm.relay.expr import debug_print
-from tvm.relay.ir_builder import IRBuilder
-
-ib = IRBuilder()
-
-def show(e):
-    r = debug_print(ib.env, e)
-    assert r is not None
-
-
-def test_constant():
-    arr = tvm.nd.array(10)
-    const = relay.Constant(arr)
-    show(const)
-    # should print the array inside?
-
-
-def test_tuple():
-    fields = tvm.convert([])
-    tup = relay.Tuple(fields)
-    show(tup)
-
-
-def test_local_var():
-    name_hint = 's'
-    lv = relay.Var(name_hint)
-    show(lv)
-
-
-def test_dup_var():
-    lv = relay.Var('s')
-    rv = relay.Var('s')
-    show(relay.Tuple([lv, rv]))
-
-
-def test_large_dup_var():
-    av = relay.Var('s')
-    bv = relay.Var('s')
-    cv = relay.Var('s')
-    show(relay.Tuple([av, bv, cv]))
-
-
-def test_global_var():
-    name_hint = 'g'
-    gv = relay.GlobalVar(name_hint)
-    gv.name_hint == name_hint
-    show(gv)
-
-
-def test_function():
-    param_names = ['a', 'b', 'c', 'd']
-    params = tvm.convert([relay.Var(n) for n in param_names])
-    ret_type = None
-    body = params[0]
-    type_params = tvm.convert([])
-    fn = relay.Function(params, ret_type, body, type_params)
-    show(fn)
-
-
-
-def test_call():
-    op = relay.Var('f')
-    arg_names = ['a', 'b', 'c', 'd']
-    args = tvm.convert([relay.Var(n) for n in arg_names])
-    call = relay.Call(op, args, None, None)
-    show(call)
-
-
-def test_let():
-    ty = relay.ty.TensorType((10, 20), 'float32')
-    lv = relay.Var('x', ty)
-    arr = tvm.nd.array(10)
-    value = relay.Constant(arr)
-    let = relay.Let(lv, value, lv)
-    show(let)
-
-
-def test_if():
-    cond = relay.Var('cond')
-    left = relay.Var('left')
-    right = relay.Var('right')
-    ife = relay.If(cond, left, right)
-    show(ife)
-
-def test_tuple_get_item():
-    t = relay.Var('t')
-    g = relay.TupleGetItem(t, 0)
-    show(g)
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
new file mode 100644
index 000000000000..79a4fdd010c5
--- /dev/null
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -0,0 +1,95 @@
+import tvm
+import numpy as np
+from tvm import relay
+
+do_print = [False]
+
+def show(text):
+    if do_print[0]:
+        print("---------------------------")
+        print(text)
+
+def test_func():
+    x = relay.var("x", shape=(3, 2))
+    y = relay.var("y")
+    one = relay.const(10e10, dtype="float32")
+    z = relay.add(x, one)
+    z = relay.add(z, z)
+    f = relay.Function([x, y], z)
+    show(z.astext())
+    show(f.astext())
+
+
+def test_env():
+    x = relay.var("x", "float32")
+    y = relay.var("y", "float32")
+    z = relay.add(x, y)
+    z = relay.add(z, z)
+    f = relay.Function([x, y], z)
+    env = relay.Environment()
+    env.add("myf", f)
+    text = env.astext()
+    assert "def @myf" in text
+    assert "%1 = add(%0, %0) # ty=float32" in text
+    show(text)
+
+
+def test_meta_data():
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = relay.var("x", shape=(n, c, h, w))
+    w = relay.var("w")
+    z = relay.nn.conv2d(x, w,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=2)
+    f = relay.Function([x, w], z)
+    text = f.astext()
+    assert "channels=2" in text
+    assert "meta.Variable(id=0)" in text
+    show(text)
+
+    text = relay.const([1,2,3]).astext()
+    assert "meta.relay.Constant(id=0)" in text
+    show(text)
+
+
+def test_call_attrs():
+    x = relay.var("x")
+    # non default args
+    z = relay.nn.softmax(x, axis=2)
+    assert "axis=2" in z.astext()
+    # default args
+    z = relay.nn.softmax(x)
+    assert "softmax(%x)" in z.astext()
+    # non default args
+    z = relay.expand_dims(x, axis=2, num_newaxis=2)
+    assert "num_newaxis=2" in z.astext()
+
+
+def test_let_if_scope():
+    x = relay.var("x", "float32")
+    y = relay.var("y", "float32")
+    cond = relay.var("cond", "bool")
+    v1 = relay.var("v")
+    v2 = relay.var("v", "float32")
+    then_branch = relay.Let(
+        v1, relay.const(1, "float32"),
+        relay.Let(v2, x, relay.subtract(v1, v2)))
+    v3 = relay.var("v")
+    let2 = relay.Let(v3, y, v3)
+    else_branch = relay.add(let2, let2)
+    result = relay.If(cond, then_branch, else_branch)
+    f = relay.Function([x, y, cond], result)
+    text = f.astext()
+    assert text.count("{") == 4
+    assert "%cond: bool" in text
+    show(f.astext())
+
+
+if __name__ == "__main__":
+    do_print[0] = True
+    test_let_if_scope()
+    test_func()
+    test_env()
+    test_meta_data()
+    test_call_attrs()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
index d555c2beb627..7ccc96d271ac 100644
--- a/tests/python/relay/test_ir_well_formed.py
+++ b/tests/python/relay/test_ir_well_formed.py
@@ -10,7 +10,7 @@ def test_well_formed():
     let = relay.Let(x, v, x)
     assert well_formed(let)
     assert not well_formed(relay.Let(x, v, let))
-    f = relay.Function([x], ty, x)
+    f = relay.Function([x], x, ty)
     assert well_formed(f)
     # this test should pass in case of weak uniqueness (only test for shadowing)
     # but we want all binder to be distinct from each other.
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 04ef3cf3da8f..51c1d4a2715a 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -262,49 +262,49 @@ def test_function_alpha_equal():
     basic_args = [relay.Var("v3", tt1), relay.Var("v4", tt2)]
     basic_tps = [tp1, tp2]
 
-    func = relay.Function([v1, v2],
-                          tt2, v1, basic_tps)
-    mapped = relay.Function(basic_args, tt2, basic_args[0], basic_tps)
+    func = relay.Function([v1, v2], v1,
+                          tt2, basic_tps)
+    mapped = relay.Function(basic_args, basic_args[0], tt2, basic_tps)
     assert alpha_equal(func, mapped)
 
-    fewer_params = relay.Function([relay.Var("v4", tt2)], tt2, v4, basic_tps)
+    fewer_params = relay.Function([relay.Var("v4", tt2)], v4, tt2, basic_tps)
     assert not alpha_equal(func, fewer_params)
 
     more_params = relay.Function([relay.Var("v3", tt1),
                                   relay.Var("v4", tt2),
-                                  relay.Var("v2", tt2)], tt2, v4, basic_tps)
+                                  relay.Var("v2", tt2)], v4, tt2, basic_tps)
     assert not alpha_equal(func, more_params)
 
-    params_unordered = relay.Function([v2, v1],
-                                      tt2, v1, basic_tps)
+    params_unordered = relay.Function([v2, v1], v1,
+                                      tt2, basic_tps)
     assert not alpha_equal(func, params_unordered)
 
-    params_mismatch = relay.Function([v1, v3],
-                                     tt2, v1, basic_tps)
+    params_mismatch = relay.Function([v1, v3], v1,
+                                     tt2, basic_tps)
     assert not alpha_equal(func, params_mismatch)
 
     # also would not typecheck
-    ret_type_mismatch = relay.Function(basic_args, tt1, v4, basic_tps)
+    ret_type_mismatch = relay.Function(basic_args, v4, tt1, basic_tps)
     assert not alpha_equal(func, ret_type_mismatch)
 
     # also mis-typed
-    different_body = relay.Function(basic_args, tt2, v3, basic_tps)
+    different_body = relay.Function(basic_args, v3, tt2, basic_tps)
     assert not alpha_equal(func, different_body)
 
-    fewer_type_params = relay.Function(basic_args, tt2, v4, [tp1])
+    fewer_type_params = relay.Function(basic_args, v4, tt2, [tp1])
     assert not alpha_equal(func, fewer_type_params)
 
-    more_type_params = relay.Function(basic_args, tt2, v4, [tp1, tp2, tp3])
+    more_type_params = relay.Function(basic_args, v4, tt2, [tp1, tp2, tp3])
     assert not alpha_equal(func, more_type_params)
 
-    type_params_unordered = relay.Function(basic_args, tt2, v4, [tp2, tp1])
+    type_params_unordered = relay.Function(basic_args, v4, tt2, [tp2, tp1])
     assert not alpha_equal(func, type_params_unordered)
 
-    different_type_params = relay.Function(basic_args, tt2, v4, [tp3, tp4])
+    different_type_params = relay.Function(basic_args, v4, tt2, [tp3, tp4])
     assert not alpha_equal(func, different_type_params)
 
     # a well-typed example that also differs in body, ret type, and type params
-    tupled_example = relay.Function(basic_args, tt3, relay.Tuple([v3, v4]))
+    tupled_example = relay.Function(basic_args, relay.Tuple([v3, v4]), tt3)
     assert not alpha_equal(func, tupled_example)
 
 
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index 121cea0081bd..c4bacce3ddfc 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -59,7 +59,7 @@ def test_recursion():
     n = relay.Var("n", e.int32)
     data = relay.Var("data", e.float32)
     funcbody = relay.If(equal(n, convert(0)), data, f(subtract(n, convert(1.0)), log(data)))
-    value = relay.Function([n, data], e.float32, funcbody, [])
+    value = relay.Function([n, data], funcbody, e.float32, [])
     orig = relay.Let(f, funcbody, f(convert(2.0), convert(10000.0)))
     assert alpha_equal(dead_code_elimination(orig), orig)
     assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three)), e.three)
diff --git a/tests/python/relay/test_pass_free_vars.py b/tests/python/relay/test_pass_free_vars.py
index a4c745de10e0..524196661753 100644
--- a/tests/python/relay/test_pass_free_vars.py
+++ b/tests/python/relay/test_pass_free_vars.py
@@ -13,7 +13,7 @@ def test_free_vars():
     let = relay.Let(x, v, x)
     fvx = free_vars(let)
     assert len(free_vars(let)) == 0
-    f = relay.Function([x], ty, x)
+    f = relay.Function([x], x, ty)
     assert len(free_vars(f)) == 0
 
 

From 1742f731413ae43bd989308b3373b7ae0842a108 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <moreau@cs.washington.edu>
Date: Thu, 18 Oct 2018 13:25:07 -0700
Subject: [PATCH 244/529] adding Liangfu Chen as reviewer (#1926)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index e06ba5055672..9865a1ade6cf 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -21,6 +21,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
+- [Liangfu Chen](https://github.com/liangfu)
 - [Masahiro Masuda](https://github.com/masahi)
 - [Kazutaka Morita](https://github.com/kazum)
 - [Tatsuya Nishiyama](https://github.com/nishi-t)

From a5c3d730d775ae7ae25bb9369a9412a222e48f6e Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Fri, 19 Oct 2018 04:47:55 +0530
Subject: [PATCH 245/529] [YOLO]Add the probability to the image (#1910)

---
 nnvm/python/nnvm/testing/yolo_detection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/testing/yolo_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py
index 86f19297cabf..7c600d38db62 100644
--- a/nnvm/python/nnvm/testing/yolo_detection.py
+++ b/nnvm/python/nnvm/testing/yolo_detection.py
@@ -160,7 +160,7 @@ def draw_detections(im, dets, thresh, names, classes):
             if det['prob'][j] > thresh:
                 if category == -1:
                     category = j
-                labelstr.append(names[j])
+                labelstr.append(names[j] + " " + str(round(det['prob'][j], 4)))
         if category > -1:
             imc, imh, imw = im.shape
             width = int(imh * 0.006)

From c96ab9d8c5d84e40d2f40c248f5aa4f8b2271da3 Mon Sep 17 00:00:00 2001
From: Zhen Zhang <7168454+izgzhen@users.noreply.github.com>
Date: Thu, 18 Oct 2018 22:18:00 -0700
Subject: [PATCH 246/529] Check iter_type in vectorize (#1921)

---
 src/schedule/schedule_lang.cc               | 7 +++++++
 tests/python/unittest/test_lang_schedule.py | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index d503e978887e..29265f2e94b8 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -352,6 +352,13 @@ inline void SetAttrIterType(StageNode* self, IterVar var, IterVarType iter_type)
 }
 
 Stage& Stage::vectorize(IterVar var) {   // NOLINT(*)
+  CHECK(var->iter_type == kDataPar ||
+        var->iter_type == kOpaque ||
+        var->iter_type == kUnrolled ||
+        var->iter_type == kVectorized ||
+        var->iter_type == kTensorized ||
+        var->iter_type == kParallelized)
+      << "Cannot vectorize on " << IterVarType2String(var->iter_type);
   SetAttrIterType(operator->(), var, kVectorized);
   return *this;
 }
diff --git a/tests/python/unittest/test_lang_schedule.py b/tests/python/unittest/test_lang_schedule.py
index 1eb42f3f0bca..a00785dea7af 100644
--- a/tests/python/unittest/test_lang_schedule.py
+++ b/tests/python/unittest/test_lang_schedule.py
@@ -1,3 +1,4 @@
+from nose.tools import raises
 import tvm
 import pickle as pkl
 
@@ -112,6 +113,13 @@ def test_vectorize():
     assert s[T].iter_var_attrs[xi].iter_type == UNROLL
     assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE
 
+@raises(Exception)
+def test_vectorize_commreduce():
+    V = tvm.placeholder((128,), name='V')
+    ax = tvm.reduce_axis((0, 128), name='ax')
+    O = tvm.compute((1,), lambda _: tvm.sum(V[ax], axis=[ax]))
+    s = tvm.create_schedule(O.op)
+    s[O].vectorize(ax) # should throw here
 
 def test_pragma():
     m = 100
@@ -197,3 +205,4 @@ def intrin_func(ins, outs):
     test_split()
     test_fuse()
     test_vectorize()
+    test_vectorize_commreduce()

From 6cf92725415f725a1b6916bebc256366a82f3eab Mon Sep 17 00:00:00 2001
From: Nick Hynes <nhynes@berkeley.edu>
Date: Fri, 19 Oct 2018 09:34:18 -0700
Subject: [PATCH 247/529] Update SGX example (#1933)

---
 .gitignore                              |  3 ++
 apps/sgx/enclave/enclave_config.xml.in  |  4 +-
 apps/sgx/enclave/src/lib.rs             | 48 +++++++++++++------
 apps/sgx/run_model.py                   |  6 ++-
 docker/install/ubuntu_install_rust.sh   |  2 +-
 rust/src/runtime/array.rs               | 42 ++++++++++++++++-
 rust/src/runtime/mod.rs                 |  3 ++
 rust/src/runtime/packed_func.rs         | 62 +++++++++++++++++++++++--
 rust/src/runtime/sgx.rs                 |  4 +-
 rust/src/runtime/threading.rs           |  2 +-
 src/runtime/sgx/tvm.edl                 |  1 -
 src/runtime/sgx/untrusted/sgx_module.cc | 12 +++--
 12 files changed, 156 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index d24fccb6f513..410a36aecdec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,3 +206,6 @@ tvm_t.*
 *.cer
 *.crt
 *.der
+
+# patch sentinel
+patched.txt
diff --git a/apps/sgx/enclave/enclave_config.xml.in b/apps/sgx/enclave/enclave_config.xml.in
index 2423f93086b8..630c84c2cc31 100644
--- a/apps/sgx/enclave/enclave_config.xml.in
+++ b/apps/sgx/enclave/enclave_config.xml.in
@@ -1,8 +1,8 @@
 <EnclaveConfiguration>
   <ProdID>0</ProdID>
   <ISVSVN>0</ISVSVN>
-  <StackMaxSize>0x20000</StackMaxSize>
-  <HeapMaxSize>0x5000000</HeapMaxSize>
+  <StackMaxSize>0xf0000</StackMaxSize>
+  <HeapMaxSize>0xf000000</HeapMaxSize>
   <TCSNum>NUM_THREADS</TCSNum>
   <TCSPolicy>0</TCSPolicy> <!-- must be "bound" to use thread_local -->
   <DisableDebug>0</DisableDebug>
diff --git a/apps/sgx/enclave/src/lib.rs b/apps/sgx/enclave/src/lib.rs
index d74015a92510..3310040d3657 100644
--- a/apps/sgx/enclave/src/lib.rs
+++ b/apps/sgx/enclave/src/lib.rs
@@ -2,20 +2,32 @@
 
 #[macro_use]
 extern crate lazy_static;
+#[macro_use]
 extern crate tvm;
 
-use std::{convert::TryFrom, sync::Mutex};
+use std::{
+  convert::{TryFrom, TryInto},
+  sync::Mutex,
+};
 
-use tvm::runtime::{sgx, Graph, GraphExecutor, SystemLibModule, TVMArgValue, TVMRetValue};
+use tvm::{
+  ffi::runtime::DLTensor,
+  runtime::{
+    load_param_dict, sgx, Graph, GraphExecutor, SystemLibModule, TVMArgValue, TVMRetValue, Tensor,
+  },
+};
 
 lazy_static! {
   static ref SYSLIB: SystemLibModule = { SystemLibModule::default() };
   static ref MODEL: Mutex<GraphExecutor<'static, 'static>> = {
-    let _params = include_bytes!(concat!("../", env!("BUILD_DIR"), "/params.bin"));
     let graph_json = include_str!(concat!("../", env!("BUILD_DIR"), "/graph.json"));
+    let params_bytes = include_bytes!(concat!("../", env!("BUILD_DIR"), "/params.bin"));
+    let params = load_param_dict(params_bytes).unwrap();
 
     let graph = Graph::try_from(graph_json).unwrap();
-    Mutex::new(GraphExecutor::new(graph, &*SYSLIB).unwrap())
+    let mut exec = GraphExecutor::new(graph, &*SYSLIB).unwrap();
+    exec.load_params(params);
+    Mutex::new(exec)
   };
 }
 
@@ -24,13 +36,15 @@ fn ecall_init(_args: &[TVMArgValue]) -> TVMRetValue {
   TVMRetValue::from(0)
 }
 
-fn ecall_main(_args: &[TVMArgValue]) -> TVMRetValue {
-  let model = MODEL.lock().unwrap();
-  // model.set_input("data", args[0]);
+fn ecall_main(args: &[TVMArgValue<'static>]) -> TVMRetValue {
+  let mut model = MODEL.lock().unwrap();
+  let inp = args[0].try_into().unwrap();
+  let mut out: Tensor = args[1].try_into().unwrap();
+  model.set_input("data", inp);
   model.run();
   sgx::shutdown();
-  // model.get_output(0).into()
-  TVMRetValue::from(42)
+  out.copy(model.get_output(0).unwrap());
+  TVMRetValue::from(1)
 }
 
 pub mod ecalls {
@@ -40,15 +54,16 @@ pub mod ecalls {
 
   use std::{
     ffi::CString,
-    os::raw::{c_char, c_int},
+    mem,
+    os::raw::{c_char, c_int, c_void},
     slice,
   };
 
   use tvm::{
     ffi::runtime::{TVMRetValueHandle, TVMValue},
     runtime::{
-      sgx::{run_worker, SgxStatus},
-      PackedFunc,
+      sgx::{ocall_packed_func, run_worker, SgxStatus},
+      DataType, PackedFunc,
     },
   };
 
@@ -63,8 +78,10 @@ pub mod ecalls {
 
   const ECALLS: &'static [&'static str] = &["__tvm_run_worker__", "__tvm_main__", "init"];
 
+  pub type EcallPackedFunc = Box<Fn(&[TVMArgValue<'static>]) -> TVMRetValue + Send + Sync>;
+
   lazy_static! {
-    static ref ECALL_FUNCS: Vec<PackedFunc> = {
+    static ref ECALL_FUNCS: Vec<EcallPackedFunc> = {
       vec![
         Box::new(run_worker),
         Box::new(ecall_main),
@@ -87,7 +104,8 @@ pub mod ecalls {
         tvm_ocall!(tvm_ocall_register_export(
           CString::new(*ecall).unwrap().as_ptr(),
           i as i32
-        )).expect(&format!("Error registering `{}`", ecall));
+        ))
+        .expect(&format!("Error registering `{}`", ecall));
       });
     }
   }
@@ -108,7 +126,7 @@ pub mod ecalls {
         .into_iter()
         .zip(type_codes.into_iter())
         .map(|(v, t)| TVMArgValue::new(*v, *t as i64))
-        .collect::<Vec<TVMArgValue>>()
+        .collect::<Vec<TVMArgValue<'static>>>()
     };
     let (rv, tc) = ECALL_FUNCS[func_id as usize](&args).into_tvm_value();
     unsafe {
diff --git a/apps/sgx/run_model.py b/apps/sgx/run_model.py
index 491a5ccbda3c..232a03524801 100644
--- a/apps/sgx/run_model.py
+++ b/apps/sgx/run_model.py
@@ -8,8 +8,10 @@
 def main():
     ctx = tvm.context('cpu', 0)
     model = tvm.module.load(osp.join(CWD, 'build', 'enclave.signed.so'))
-    out = model()
-    if out == 42:
+    inp = tvm.nd.array(np.ones((1, 3, 224, 224), dtype='float32'), ctx)
+    out = tvm.nd.array(np.empty((1, 1000), dtype='float32'), ctx)
+    model(inp, out)
+    if abs(out.asnumpy().sum() - 1) < 0.001:
         print('It works!')
     else:
         print('It doesn\'t work!')
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 9a51afeea79b..a8a9bddacf2c 100644
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -3,7 +3,7 @@ apt-get update && apt-get install -y --no-install-recommends --force-yes curl
 export RUSTUP_HOME=/opt/rust
 export CARGO_HOME=/opt/rust
 # this rustc is one supported by the installed version of rust-sgx-sdk
-curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-09-25
+curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-10-01
 . $CARGO_HOME/env
 rustup toolchain add nightly
 rustup component add rust-src
diff --git a/rust/src/runtime/array.rs b/rust/src/runtime/array.rs
index 9d0941811758..100258d9a157 100644
--- a/rust/src/runtime/array.rs
+++ b/rust/src/runtime/array.rs
@@ -126,6 +126,7 @@ pub struct Tensor<'a> {
   /// The `Tensor` strides. Can be `None` if the `Tensor` is contiguous.
   pub(super) strides: Option<Vec<usize>>,
   pub(super) byte_offset: isize,
+  /// The number of elements in the `Tensor`.
   pub(super) size: usize,
 }
 
@@ -316,12 +317,12 @@ pub struct DataType {
 
 impl DataType {
   /// Returns the number of bytes occupied by an element of this `DataType`.
-  fn itemsize(&self) -> usize {
+  pub fn itemsize(&self) -> usize {
     (self.bits * self.lanes) >> 3
   }
 
   /// Returns whether this `DataType` represents primitive type `T`.
-  fn is_type<T: 'static>(&self) -> bool {
+  pub fn is_type<T: 'static>(&self) -> bool {
     if self.lanes != 1 {
       return false;
     }
@@ -345,6 +346,16 @@ impl<'a> From<&'a DataType> for DLDataType {
   }
 }
 
+impl From<DLDataType> for DataType {
+  fn from(dtype: DLDataType) -> Self {
+    Self {
+      code: dtype.code as usize,
+      bits: dtype.bits as usize,
+      lanes: dtype.lanes as usize,
+    }
+  }
+}
+
 macro_rules! make_dtype_const {
   ($name: ident, $code: ident, $bits: expr, $lanes: expr) => {
     const $name: DataType = DataType {
@@ -394,6 +405,33 @@ impl Default for TVMContext {
   }
 }
 
+impl<'a> From<DLTensor> for Tensor<'a> {
+  fn from(dlt: DLTensor) -> Self {
+    unsafe {
+      let dtype = DataType::from(dlt.dtype);
+      let shape = slice::from_raw_parts(dlt.shape, dlt.ndim as usize).to_vec();
+      let size = shape.iter().map(|v| *v as usize).product::<usize>() as usize;
+      let storage = Storage::from(slice::from_raw_parts(
+        dlt.data as *const u8,
+        dtype.itemsize() * size,
+      ));
+      Self {
+        data: storage,
+        ctx: TVMContext::default(),
+        dtype: dtype,
+        size: size,
+        shape: shape,
+        strides: if dlt.strides == ptr::null_mut() {
+          None
+        } else {
+          Some(slice::from_raw_parts_mut(dlt.strides as *mut usize, size).to_vec())
+        },
+        byte_offset: dlt.byte_offset as isize,
+      }
+    }
+  }
+}
+
 /// `From` conversions to `Tensor` for owned or borrowed `ndarray::Array`.
 ///
 /// # Panics
diff --git a/rust/src/runtime/mod.rs b/rust/src/runtime/mod.rs
index bdf7094113d8..1a9c5ba7c7bd 100644
--- a/rust/src/runtime/mod.rs
+++ b/rust/src/runtime/mod.rs
@@ -14,6 +14,9 @@ use std::os::raw::c_char;
 
 pub use self::{array::*, graph::*, module::*, packed_func::*, threading::*, workspace::*};
 
+#[cfg(target_env = "sgx")]
+use self::sgx::ocall_packed_func;
+
 #[no_mangle]
 pub extern "C" fn TVMAPISetLastError(cmsg: *const c_char) {
   #[cfg(not(target_env = "sgx"))]
diff --git a/rust/src/runtime/packed_func.rs b/rust/src/runtime/packed_func.rs
index 030d677329c0..a6ad7fc35821 100644
--- a/rust/src/runtime/packed_func.rs
+++ b/rust/src/runtime/packed_func.rs
@@ -1,8 +1,9 @@
 use std::{any::Any, convert::TryFrom, marker::PhantomData, os::raw::c_void};
 
+use super::Tensor;
 use ffi::runtime::{
   BackendPackedCFunc, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLTensor,
-  TVMTypeCode_kArrayHandle, TVMTypeCode_kHandle, TVMValue,
+  TVMTypeCode_kArrayHandle, TVMTypeCode_kHandle, TVMTypeCode_kNDArrayContainer, TVMValue,
 };
 
 use errors::*;
@@ -55,6 +56,18 @@ macro_rules! impl_prim_tvm_arg {
         }
       }
     }
+    impl<'a> TryFrom<TVMArgValue<'a>> for $type {
+      type Error = Error;
+      fn try_from(val: TVMArgValue<'a>) -> Result<Self> {
+        ensure!(
+          val.type_code == $code as i64,
+          "Could not downcast arg. Expected `{}`, got `{}`",
+          $code,
+          val.type_code
+        );
+        Ok(unsafe { val.value.$field as $type })
+      }
+    }
   };
   ($type:ty, $field:ident, $code:expr) => {
     impl_prim_tvm_arg!($type, $field, $code, $type);
@@ -75,7 +88,6 @@ impl_prim_tvm_arg!(i32, v_int64);
 impl_prim_tvm_arg!(u32, v_int64);
 impl_prim_tvm_arg!(i64, v_int64);
 impl_prim_tvm_arg!(u64, v_int64);
-impl_prim_tvm_arg!(bool, v_int64);
 
 /// Creates a conversion to a `TVMArgValue` for an object handle.
 impl<'a, T> From<*const T> for TVMArgValue<'a> {
@@ -127,6 +139,23 @@ impl<'a> From<&'a DLTensor> for TVMArgValue<'a> {
   }
 }
 
+impl<'a> TryFrom<TVMArgValue<'a>> for Tensor<'a> {
+  type Error = Error;
+  fn try_from(val: TVMArgValue<'a>) -> Result<Self> {
+    ensure!(
+      val.type_code == TVMTypeCode_kArrayHandle as i64
+        || val.type_code == TVMTypeCode_kNDArrayContainer as i64,
+      "Could not downcast arg. Expected `{}` or `{}`, but got `{}`",
+      TVMTypeCode_kArrayHandle,
+      TVMTypeCode_kNDArrayContainer,
+      val.type_code,
+    );
+
+    let dlt = unsafe { *(val.value.v_handle as *mut DLTensor as *const DLTensor) };
+    Ok(dlt.into())
+  }
+}
+
 /// An owned TVMPODValue. Can be converted from a variety of primitive and object types.
 /// Can be downcasted using `try_from` if it contains the desired type.
 ///
@@ -175,7 +204,7 @@ impl TVMRetValue {
       2 => TVMValue {
         v_float64: self.prim_value.clone() as f64,
       },
-      3 | 7 | 8 | 9 | 10 => TVMValue {
+      3 | 7 | 8 | 9 | 10 | 13 => TVMValue {
         v_handle: Box::into_raw(self.box_value) as *mut c_void,
       },
       11 | 12 => TVMValue {
@@ -265,6 +294,33 @@ impl_prim_ret_value!(isize, 0);
 impl_prim_ret_value!(usize, 1);
 impl_boxed_ret_value!(String, 11);
 
+impl<'a, 't> From<&'t Tensor<'a>> for TVMRetValue {
+  fn from(val: &'t Tensor<'a>) -> Self {
+    TVMRetValue {
+      prim_value: 0,
+      box_value: box DLTensor::from(val),
+      type_code: TVMTypeCode_kNDArrayContainer as i64,
+    }
+  }
+}
+
+impl<'a> TryFrom<TVMRetValue> for Tensor<'a> {
+  type Error = Error;
+  fn try_from(ret: TVMRetValue) -> Result<Self> {
+    ensure!(
+      ret.type_code == TVMTypeCode_kArrayHandle as i64
+        || ret.type_code == TVMTypeCode_kNDArrayContainer as i64,
+      "Could not downcast arg. Expected `{}` or `{}`, but got `{}`",
+      TVMTypeCode_kArrayHandle,
+      TVMTypeCode_kNDArrayContainer,
+      ret.type_code,
+    );
+
+    let dlt = unsafe { *(ret.prim_value as *mut DLTensor as *const DLTensor) };
+    Ok(dlt.into())
+  }
+}
+
 // @see `WrapPackedFunc` in `llvm_module.cc`.
 pub(super) fn wrap_backend_packed_func(func: BackendPackedCFunc) -> PackedFunc {
   box move |args: &[TVMArgValue]| {
diff --git a/rust/src/runtime/sgx.rs b/rust/src/runtime/sgx.rs
index bf9d54a4af65..00be3ee3b608 100644
--- a/rust/src/runtime/sgx.rs
+++ b/rust/src/runtime/sgx.rs
@@ -60,11 +60,11 @@ pub fn ocall_packed_func<S: AsRef<str>>(fn_name: S, args: &[TVMArgValue]) -> Res
 #[macro_export]
 macro_rules! ocall_packed {
   ($fn_name:expr, $($args:expr),+) => {
-    ::runtime::sgx::ocall_packed_func($fn_name, &[$($args.into(),)+])
+    ocall_packed_func($fn_name, &[$($args.into(),)+])
       .expect(concat!("Error calling `", $fn_name, "`"))
   };
   ($fn_name:expr) => {
-    ::runtime::sgx::ocall_packed_func($fn_name, &Vec::new())
+    ocall_packed_func($fn_name, &Vec::new())
       .expect(concat!("Error calling `", $fn_name, "`"))
   }
 }
diff --git a/rust/src/runtime/threading.rs b/rust/src/runtime/threading.rs
index 693ebf7c4a33..1d6d7fc78834 100644
--- a/rust/src/runtime/threading.rs
+++ b/rust/src/runtime/threading.rs
@@ -23,7 +23,7 @@ use super::super::errors::*;
 use ffi::runtime::TVMParallelGroupEnv;
 
 #[cfg(target_env = "sgx")]
-use super::{TVMArgValue, TVMRetValue};
+use super::{sgx::ocall_packed_func, TVMArgValue, TVMRetValue};
 
 type FTVMParallelLambda =
   extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32;
diff --git a/src/runtime/sgx/tvm.edl b/src/runtime/sgx/tvm.edl
index d46940ecefef..8466d78af72f 100644
--- a/src/runtime/sgx/tvm.edl
+++ b/src/runtime/sgx/tvm.edl
@@ -21,7 +21,6 @@ enclave {
                                    [out] TVMValue* ret_val,
                                    [out] int* ret_type_code);
         void tvm_ocall_register_export([in, string] const char* name, int func_id);
-        void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment);
     };
 };
 
diff --git a/src/runtime/sgx/untrusted/sgx_module.cc b/src/runtime/sgx/untrusted/sgx_module.cc
index 2fef99df889f..fc0710ae3a53 100644
--- a/src/runtime/sgx/untrusted/sgx_module.cc
+++ b/src/runtime/sgx/untrusted/sgx_module.cc
@@ -202,21 +202,25 @@ void tvm_ocall_packed_func(const char* name,
 
 // Allocates space for return values. The returned pointer is only valid between
 // successive calls to `tvm_ocall_reserve_space`.
-void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment) {
+TVM_REGISTER_GLOBAL("__sgx_reserve_space__")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  size_t num_bytes = args[0];
+  size_t alignment = args[1];
+
   static TVMContext ctx = { kDLCPU, 0 };
   static thread_local void* buf = nullptr;
   static thread_local size_t buf_size = 0;
   static thread_local size_t buf_align = 0;
 
-  if (buf_size >= num_bytes && buf_align >= alignment) return buf;
+  if (buf_size >= num_bytes && buf_align >= alignment) *rv = nullptr;
 
   DeviceAPI::Get(ctx)->FreeDataSpace(ctx, buf);
   buf = DeviceAPI::Get(ctx)->AllocDataSpace(ctx, num_bytes, alignment, {});
   buf_size = num_bytes;
   buf_align = alignment;
 
-  return buf;
-}
+  *rv = buf;
+});
 
 }  // extern "C"
 }  // namespace sgx

From 0d12fc85943f21b78bccb461d34407d3af331424 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Fri, 19 Oct 2018 14:11:27 -0700
Subject: [PATCH 248/529] [Relay] Nullable Type Alpha Equality (#1906)

---
 src/relay/pass/alpha_eq.cc                  | 16 ++++++------
 tests/python/relay/test_pass_alpha_equal.py | 27 +++++++++++++++++++++
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 29d2f87cf04a..059504efc883 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -193,6 +193,12 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
 };
 
 bool AlphaEqual(const Type& t1, const Type& t2) {
+  if (t1.defined() != t2.defined())
+    return false;
+
+  if (!t1.defined())
+    return true;
+
   TypeAlphaEq aeq;
   aeq.VisitType(t1, t2);
   return aeq.equal;
@@ -373,15 +379,11 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
 
  private:
   void MergeVarDecl(const Var& var1, const Var& var2) {
-    if (var1->type_annotation.defined() != var2->type_annotation.defined()) {
-      equal = false;
-      return;
-    }
-    if (var1->type_annotation.defined() &&
-        !AlphaEqual(var1->type_annotation, var2->type_annotation)) {
-      equal = false;
+    equal = equal && AlphaEqual(var1->type_annotation, var2->type_annotation);
+    if (!equal) {
       return;
     }
+
     eq_map.Set(var1, var2);
   }
 };
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 51c1d4a2715a..2bfbc7f10a40 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -187,6 +187,25 @@ def test_var_alpha_equal():
     assert alpha_equal(l1, l2)
     assert not alpha_equal(l1, l3)
 
+    # type annotations
+    tt1 = relay.TensorType([], "int32")
+    tt2 = relay.TensorType([], "int32")
+    tt3 = relay.TensorType([], "int64")
+    v3 = relay.Var("v3", tt1)
+    v4 = relay.Var("v4", tt2)
+    v5 = relay.Var("v5", tt3)
+
+    l4 = relay.Let(v3, convert(1), v3)
+    l5 = relay.Let(v4, convert(1), v4)
+    l6 = relay.Let(v5, convert(1), v5)
+
+    # same annotations
+    assert alpha_equal(l4, l5)
+    # different annotations
+    assert not alpha_equal(l4, l6)
+    # one null annotation
+    assert not alpha_equal(l1, l4)
+
 
 def test_global_var_alpha_equal():
     v1 = relay.GlobalVar("v1")
@@ -307,6 +326,14 @@ def test_function_alpha_equal():
     tupled_example = relay.Function(basic_args, relay.Tuple([v3, v4]), tt3)
     assert not alpha_equal(func, tupled_example)
 
+    # nullable
+    no_ret_type = relay.Function(basic_args, v4, None, [tp1, tp2])
+    # both null
+    assert alpha_equal(no_ret_type, no_ret_type)
+    # one null
+    assert not alpha_equal(func, no_ret_type)
+    assert not alpha_equal(no_ret_type, func)
+
 
 def test_call_alpha_equal():
     v1 = relay.Var("v1")

From c92092b1ab221fe963836cb1d8a0d3f822bf69a3 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Fri, 19 Oct 2018 18:19:11 -0700
Subject: [PATCH 249/529] Fix x86 Conv tuning tutorial (#1932)

---
 tutorials/autotvm/tune_nnvm_x86.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py
index ddd91f584c08..efd1ee4e1a12 100644
--- a/tutorials/autotvm/tune_nnvm_x86.py
+++ b/tutorials/autotvm/tune_nnvm_x86.py
@@ -118,11 +118,12 @@ def tune_kernels(tasks,
         prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # converting conv2d tasks to conv2d_NCHWc tasks
+        # data, kernel are tuples of ("TENSOR", shape, dtype)
         data, kernel, strides, padding, layout, dtype = tsk.args
         kernel_size = (kernel[1][2], kernel[1][3])
         data_plc = tvm.placeholder(data[1], name="data")
         kernel_plc = tvm.placeholder(kernel[1], name="kernel")
-        args = [data_plc, kernel_plc, data[1][1], kernel_size, strides,
+        args = [data_plc, kernel_plc, kernel[1][0], kernel_size, strides,
                 padding, layout, layout, dtype]
         args = autotvm.task.nnvm_integration.serialize_args(args)
         task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=args, target=target)

From e15e2e3ccce217992d3c6f57f9641b4d33dd11eb Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 19 Oct 2018 21:56:09 -0700
Subject: [PATCH 250/529] [RELAY] IR builder stablize refactor, clean pass
 (#1934)

---
 include/tvm/relay/attrs/nn.h                  |   2 +-
 include/tvm/relay/environment.h               |  28 +-
 include/tvm/relay/expr.h                      |   9 +-
 include/tvm/relay/op.h                        |   6 +-
 include/tvm/relay/pass.h                      |  27 +-
 include/tvm/relay/type.h                      |  32 +-
 python/tvm/relay/__init__.py                  |   6 +-
 python/tvm/relay/env.py                       | 106 ++---
 python/tvm/relay/expr.py                      |  63 ++-
 python/tvm/relay/ir_builder.py                | 387 ------------------
 python/tvm/relay/ir_pass.py                   |  69 ++--
 python/tvm/relay/scope_builder.py             | 185 +++++++++
 python/tvm/relay/ty.py                        |  62 ++-
 src/relay/ir/environment.cc                   | 102 ++---
 src/relay/ir/expr.cc                          |   2 +-
 src/relay/ir/expr_functor.cc                  |   4 +-
 src/relay/ir/text_printer.cc                  |  12 +-
 src/relay/ir/type.cc                          |  22 +-
 src/relay/op/image/resize.cc                  |   1 +
 src/relay/op/nn/nn.cc                         |   5 +
 src/relay/op/nn/pad.cc                        |   1 +
 src/relay/op/nn/pooling.cc                    |   5 +
 src/relay/op/nn/upsampling.cc                 |   1 +
 src/relay/op/tensor/reduce.cc                 |   4 +-
 src/relay/op/tensor/transform.cc              |  13 +
 src/relay/op/type_relations.cc                | 177 ++------
 src/relay/op/type_relations.h                 |  27 --
 src/relay/op/vision/multibox_op.cc            |   1 +
 src/relay/pass/alpha_eq.cc                    |  10 +-
 src/relay/pass/dead_code.cc                   |   4 +-
 src/relay/pass/kind_check.cc                  |   4 +-
 src/relay/pass/let_list.h                     |   2 +-
 src/relay/pass/type_functor.h                 |   4 +-
 src/relay/pass/type_infer.cc                  |  80 +++-
 src/relay/pass/type_subst.cc                  |  12 +-
 src/relay/pass/type_subst.h                   |   4 +-
 src/relay/pass/type_visitor.h                 |  12 +-
 src/relay/pass/util.cc                        |  24 +-
 tests/python/relay/test_ir_builder.py         |  19 -
 tests/python/relay/test_ir_nodes.py           |   8 +-
 tests/python/relay/test_ir_text_printer.py    |  32 +-
 tests/python/relay/test_op_level1.py          | 339 +++++----------
 tests/python/relay/test_op_level2.py          | 315 ++++++--------
 tests/python/relay/test_op_level3.py          | 239 ++++-------
 tests/python/relay/test_op_level4.py          | 223 +++-------
 tests/python/relay/test_op_level5.py          |  54 +--
 tests/python/relay/test_pass_alpha_equal.py   | 103 +++--
 tests/python/relay/test_pass_check_kind.py    |  44 +-
 .../relay/test_pass_dead_code_elimination.py  |  22 +-
 tests/python/relay/test_pass_free_vars.py     |   2 +-
 tests/python/relay/test_type_infer.py         | 131 +++---
 tests/python/relay/test_type_solver.py        |   2 -
 52 files changed, 1212 insertions(+), 1836 deletions(-)
 delete mode 100644 python/tvm/relay/ir_builder.py
 create mode 100644 python/tvm/relay/scope_builder.py
 delete mode 100644 tests/python/relay/test_ir_builder.py

diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 5dbaecdc3e78..6b522ef3bfd0 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -254,7 +254,7 @@ struct PadAttrs : public tvm::AttrsNode<PadAttrs> {
 struct LeakyReluAttrs : public tvm::AttrsNode<LeakyReluAttrs> {
   double alpha;
 
-  TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.LeakyReluAttrs") {
+  TVM_DECLARE_ATTRS(LeakyReluAttrs, "relay.attrs.LeakyReluAttrs") {
     TVM_ATTR_FIELD(alpha).set_lower_bound(0.0).set_default(0.25)
         .describe("Slope coefficient for the negative half axis.");
   }
diff --git a/include/tvm/relay/environment.h b/include/tvm/relay/environment.h
index 46cedf12b816..2ed389571ad6 100644
--- a/include/tvm/relay/environment.h
+++ b/include/tvm/relay/environment.h
@@ -47,12 +47,13 @@ class EnvironmentNode : public RelayNode {
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("functions", &functions);
-    v->Visit("global_map_", &global_map_);
+    v->Visit("global_var_map_", &global_var_map_);
   }
 
   TVM_DLL static Environment make(tvm::Map<GlobalVar, Function> global_funcs);
 
-  /*! \brief Add a function to the global environment.
+  /*!
+   * \brief Add a function to the global environment.
    * \param var The name of the global function.
    * \param func The function.
    * \param update Controls whether you can replace a definition in the
@@ -60,39 +61,46 @@ class EnvironmentNode : public RelayNode {
    */
   void Add(const GlobalVar& var, const Function& func, bool update = false);
 
-  /*! \brief Update a function in the global environment.
+  /*!
+   * \brief Update a function in the global environment.
    * \param var The name of the global function to update.
    * \param func The new function.
    */
   void Update(const GlobalVar& var, const Function& func);
 
-  /*! \brief Remove a function from the global environment.
+  /*!
+   * \brief Remove a function from the global environment.
    * \param var The name of the global function to update.
    */
   void Remove(const GlobalVar& var);
 
-  /*! \brief Lookup a global function by its variable.
+  /*!
+   * \brief Lookup a global function by its variable.
    * \param str The unique string specifying the global variable.
    * \returns The global variable.
    */
   GlobalVar GetGlobalVar(const std::string& str);
 
-  /*! \brief Lookup a global function by its variable.
+  /*!
+   * \brief Lookup a global function by its variable.
    * \param var The global var to lookup.
    * \returns The function named by the variable argument.
    */
   Function Lookup(const GlobalVar& var);
 
-  /*! \brief Lookup a global function by its string name
+  /*!
+   * \brief Lookup a global function by its string name
    * \param name The name of the function.
    * \returns The function named by the argument.
    */
   Function Lookup(const std::string& name);
 
-  /*! \brief Combine with another Environment.
+  /*!
+   * \brief Update the functions inside this environment by
+   *        functions in another environment.
    * \param other The other environment.
    */
-  void Merge(const Environment& other);
+  void Update(const Environment& other);
 
   static constexpr const char* _type_key = "relay.Environment";
   TVM_DECLARE_NODE_TYPE_INFO(EnvironmentNode, Node);
@@ -101,7 +109,7 @@ class EnvironmentNode : public RelayNode {
   /*! \brief A map from string names to global variables that
    * ensures global uniqueness.
    */
-  tvm::Map<std::string, GlobalVar> global_map_;
+  tvm::Map<std::string, GlobalVar> global_var_map_;
 };
 
 struct Environment : public NodeRef {
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 743dc085d035..142982d48907 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -197,7 +197,7 @@ class FunctionNode : public ExprNode {
    *
    * \note This can be usually empty for non-polymorphic functions.
    */
-  tvm::Array<TypeParam> type_params;
+  tvm::Array<TypeVar> type_params;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("params", &params);
@@ -219,7 +219,7 @@ class FunctionNode : public ExprNode {
   TVM_DLL static Function make(tvm::Array<Var> params,
                                Expr body,
                                Type ret_type,
-                               tvm::Array<TypeParam> ty_params);
+                               tvm::Array<TypeVar> ty_params);
 
   static constexpr const char* _type_key = "relay.Function";
   TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode);
@@ -375,13 +375,14 @@ class TupleGetItemNode : public ExprNode {
   int index;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
-    v->Visit("tuple", &tuple);
+    v->Visit("tuple_value", &tuple);
     v->Visit("index", &index);
+    v->Visit("_checked_type_", &checked_type_);
   }
 
   TVM_DLL static TupleGetItem make(Expr tuple, int index);
 
-  static constexpr const char * _type_key = "relay.GetItem";
+  static constexpr const char * _type_key = "relay.TupleGetItem";
   TVM_DECLARE_NODE_TYPE_INFO(TupleGetItemNode, ExprNode);
 };
 
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index 4dcff22b84e8..fe6d957e79ed 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -371,14 +371,14 @@ inline OpRegistry& OpRegistry::add_type_rel(
     env_type_rel_func = env_func;
   }
 
-  Array<TypeParam> type_params;
+  Array<TypeVar> type_params;
   Array<Type> arg_types;
 
   // Add inputs.
   std::string input_name_prefix = "in";
   for (int i = 0; i < get()->num_inputs; i++) {
     auto name = input_name_prefix + std::to_string(i);
-    auto param = TypeParamNode::make(name, TypeParamNode::Kind::kType);
+    auto param = TypeVarNode::make(name, TypeVarNode::Kind::kType);
     type_params.push_back(param);
     arg_types.push_back(param);
   }
@@ -386,7 +386,7 @@ inline OpRegistry& OpRegistry::add_type_rel(
   Array<Type> ty_call_args = arg_types;
 
   // Add output type.
-  auto out_param = TypeParamNode::make("out", TypeParamNode::Kind::kType);
+  auto out_param = TypeVarNode::make("out", TypeVarNode::Kind::kType);
   type_params.push_back(out_param);
   // this will trigger copy on write.
   ty_call_args.push_back(out_param);
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 1043e4aaaa4c..9a3b75364167 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -12,21 +12,30 @@
 namespace tvm {
 namespace relay {
 
-/*! \brief Infer the type of an expression with the provided environment.
+/*!
+ * \brief Infer the type of an expression.
  *
  * The result of type checking is a new expression with unambigous
  * type information filled in, as well as it's checked type field
  * populated with the result type.
  *
- * \param env The environment used for global settings and referencing
- * global functions.
- *
- * \param e The expression to type check.
+ * \param expr The expression to type check.
+ * \param env The environment used for referencing global functions, can be None.
  *
  * \return A type checked expression with its checked_type field populated.
  */
-Expr InferType(const Environment& env, const Expr& e);
-Expr InferType(const Environment& env, const GlobalVar& var, const Function& f);
+Expr InferType(const Expr& expr, const Environment& env);
+/*!
+ * \brief Infer the type of a function as if it is mapped to var in the env.
+ *
+ * \param f the function.
+ * \param env The environment used for referencing global functions.
+ * \param var The global variable corresponding to the function.
+ *
+ * \return A type checked Function with its checked_type field populated.
+ * \note this function mutates env and is not thread-safe.
+ */
+Function InferType(const Function& f, const Environment& env, const GlobalVar& var);
 
 /*!
  * \brief Check that types are well kinded by applying "kinding rules".
@@ -111,7 +120,7 @@ tvm::Array<Var> FreeVariables(const Expr& e);
  *
  * \return the set of free type variables.
  */
-tvm::Array<TypeParam> FreeTypeVariables(const Expr& e);
+tvm::Array<TypeVar> FreeTypeVariables(const Expr& e);
 
 /*! \brief Get free type parameters from type t.
  *
@@ -121,7 +130,7 @@ tvm::Array<TypeParam> FreeTypeVariables(const Expr& e);
  *
  * \return the set of free type variables.
  */
-tvm::Array<TypeParam> FreeTypeVariables(const Type& t);
+tvm::Array<TypeVar> FreeTypeVariables(const Type& t);
 
 /*! \brief Remove expressions which does not effect the program result.
  *
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 9a91bd09c70e..2bb9b3070270 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -98,7 +98,7 @@ RELAY_DEFINE_NODE_REF(TensorType, TensorTypeNode, Type);
  *  This can be viewed as template parameter in c++ template function.
  *
  * For example, in the following pesudo code,
- * the TypeParam of f is TypeParam(kind=kShapeVar, var=n).
+ * the TypeVar of f is TypeVar(kind=kShapeVar, var=n).
  * This function can take in a Tensor with shape=(3, 3) and
  * returns a Tensor with shape=(9,)
  *
@@ -108,13 +108,13 @@ RELAY_DEFINE_NODE_REF(TensorType, TensorTypeNode, Type);
  *  f(x : Tensor[i32, (n, n)]) -> Tensor[i32, (n * n)]
  *
  * \endcode
- * \sa TypeParamNode The actual container class of TypeParam
+ * \sa TypeVarNode The actual container class of TypeVar
  */
-class TypeParam;
-/*! \brief TypeParam container node */
-class TypeParamNode : public TypeNode {
+class TypeVar;
+/*! \brief TypeVar container node */
+class TypeVarNode : public TypeNode {
  public:
-  /*! \brief possible kinds of TypeParam */
+  /*! \brief possible kinds of TypeVar */
   enum Kind : int {
     /*! \brief template variable in shape expression */
     kType = 0,
@@ -136,13 +136,13 @@ class TypeParamNode : public TypeNode {
     v->Visit("span", &span);
   }
 
-  TVM_DLL static TypeParam make(std::string name, Kind kind);
+  TVM_DLL static TypeVar make(std::string name, Kind kind);
 
-  static constexpr const char* _type_key = "relay.TypeParam";
-  TVM_DECLARE_NODE_TYPE_INFO(TypeParamNode, TypeNode);
+  static constexpr const char* _type_key = "relay.TypeVar";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeVarNode, TypeNode);
 };
 
-RELAY_DEFINE_NODE_REF(TypeParam, TypeParamNode, Type);
+RELAY_DEFINE_NODE_REF(TypeVar, TypeVarNode, Type);
 
 /*!
  * \brief IncompleteType.
@@ -150,20 +150,20 @@ RELAY_DEFINE_NODE_REF(TypeParam, TypeParamNode, Type);
  *
  * If we view the type relations as "computational graph of types",
  * then IncompleteType represents intermediate values of the graph,
- * TypeParam represents the input to the graph.
+ * TypeVar represents the input to the graph.
  */
 class IncompleteType;
 
 /*! \brief IncompleteType container node */
 class IncompleteTypeNode : public TypeNode {
  public:
-  TypeParamNode::Kind kind;
+  TypeVarNode::Kind kind;
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("kind", &kind);
   }
 
-  TVM_DLL static IncompleteType make(TypeParamNode::Kind kind);
+  TVM_DLL static IncompleteType make(TypeVarNode::Kind kind);
 
   static constexpr const char* _type_key = "relay.IncompleteType";
   TVM_DECLARE_NODE_TYPE_INFO(IncompleteTypeNode, TypeNode);
@@ -192,7 +192,7 @@ class FuncType;
  * Relay support polymorphic function type.
  * This can be roughly viewed as template function in C++.
  *
- * \sa TypeParam, TypeConstraint
+ * \sa TypeVar, TypeConstraint
  */
 class FuncTypeNode : public TypeNode {
  public:
@@ -203,7 +203,7 @@ class FuncTypeNode : public TypeNode {
   // The following fields are used in polymorphic(template) functions
   // For normal functions, the following two fields will be empty.
   /*! \brief The type parameters of the function */
-  tvm::Array<TypeParam> type_params;
+  tvm::Array<TypeVar> type_params;
   /*!
    * \brief potential constraint the type need to obey
    * \note this field is reserved for futher purposes.
@@ -220,7 +220,7 @@ class FuncTypeNode : public TypeNode {
 
   TVM_DLL static FuncType make(tvm::Array<Type> arg_types,
                                Type ret_type,
-                               tvm::Array<TypeParam> type_params,
+                               tvm::Array<TypeVar> type_params,
                                tvm::Array<TypeConstraint> type_constraints);
 
   static constexpr const char* _type_key = "relay.FuncType";
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index d6ecdb7855d8..731a816460ee 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -5,7 +5,6 @@
 from . import expr
 from . import env
 from . import ir_pass
-from . import ir_builder
 
 # Root operators
 from .op import Op
@@ -16,6 +15,8 @@
 from . import vision
 from . import image
 
+from .scope_builder import ScopeBuilder
+
 # Span
 Span = base.Span
 
@@ -27,11 +28,12 @@
 TupleType = ty.TupleType
 TensorType = ty.TensorType
 Kind = ty.Kind
-TypeParam = ty.TypeParam
+TypeVar = ty.TypeVar
 TypeConstraint = ty.TypeConstraint
 FuncType = ty.FuncType
 TypeRelation = ty.TypeRelation
 IncompleteType = ty.IncompleteType
+scalar_type = ty.scalar_type
 
 # Expr
 Constant = expr.Constant
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/env.py
index 8c226e509a12..9c3241e18ef8 100644
--- a/python/tvm/relay/env.py
+++ b/python/tvm/relay/env.py
@@ -1,31 +1,40 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, wildcard-import
 """A global environment storing everything needed to interpret or compile a Relay program."""
 from .base import register_relay_node, RelayNode
+from .._ffi import base as _base
 from . import _make
 from . import _env
+from . import expr as _expr
+
 
 @register_relay_node
 class Environment(RelayNode):
-    """The global Relay environment containing functions,
-    options and more.
-    """
-
-    def __init__(self, funcs=None):
-        """Construct an environment.
-
-        Parameters
-        ------
-        funcs : optional, dict
-            Map of global var to Function
+    """The global Relay environment containing collection of functions.
 
-        Returns
-        ------
-        env: A new environment containing :py:class:`~relay.env.Environment`.
-        """
-        funcs = funcs if funcs else {}
-        self.__init_handle_by_constructor__(_make.Environment, funcs)
+    Each global function is identified by an unique tvm.relay.GlobalVar.
+    tvm.relay.GlobalVar and Environment is necessary in order to enable
+    recursions in function to avoid cyclic reference in the function.x
 
-    def add(self, var, func):
+    Parameters
+    ----------
+    functions : dict, optional.
+        Map of global var to Function
+    """
+    def __init__(self, functions=None):
+        if functions is None:
+            functions = {}
+        elif isinstance(functions, dict):
+            mapped_funcs = {}
+            for k, v in functions.items():
+                if isinstance(k, _base.string_types):
+                    k = _expr.GlobalVar(k)
+                if not isinstance(k, _expr.GlobalVar):
+                    raise TypeError("Expect functions to be Dict[GlobalVar, Function]")
+                mapped_funcs[k] = v
+            functions = mapped_funcs
+        self.__init_handle_by_constructor__(_make.Environment, functions)
+
+    def __setitem__(self, var, func):
         """Add a function to the environment.
 
         Parameters
@@ -36,50 +45,55 @@ def add(self, var, func):
         func: Function
             The function.
         """
-        if isinstance(var, str):
-            var = _env.Environment_GetGlobalVar(self, var)
-
+        if isinstance(var, _base.string_types):
+            var = _expr.GlobalVar(var)
         _env.Environment_Add(self, var, func)
 
-    def merge(self, other):
-        """Merge two environments.
+    def __getitem__(self, var):
+        """Lookup a global function by name or by variable.
 
         Parameters
         ----------
-        other: Environment
-            The environment to merge into the current Environment.
+        var: str or GlobalVar
+            The name or global variable.
+
+        Returns
+        -------
+            func: Function
+                The function referenced by :code:`var`.
         """
-        return _env.Environment_Merge(self, other)
+        if isinstance(var, _base.string_types):
+            return _env.Environment_Lookup_str(self, var)
+        else:
+            return _env.Environment_Lookup(self, var)
 
-    def global_var(self, name):
-        """Get a global variable by name.
+    def update(self, other):
+        """Insert functions in another Environment to current one.
 
         Parameters
         ----------
-        name: str
-            The name of the global variable.
-
-        Returns
-        -------
-            global_var: GlobalVar
-                The global variable mapped to :code:`name`.
+        other: Environment
+            The environment to merge into the current Environment.
         """
-        return _env.Environment_GetGlobalVar(self, name)
+        if isinstance(other, dict):
+            other = Environment(other)
+        return _env.Environment_Update(self, other)
 
-    def __getitem__(self, var):
-        """Lookup a global function by name or by variable.
+    def get_global_var(self, name):
+        """Get a global variable in the function by name.
 
         Parameters
         ----------
-        var: str or GlobalVar
-            The name or global variable.
+        name: str
+            The name of the global variable.
 
         Returns
         -------
-            func: Function
-                The function referenced by :code:`var`.
+        global_var: GlobalVar
+            The global variable mapped to :code:`name`.
+
+        Raises
+        ------
+        tvm.TVMError if we cannot find corresponding global var.
         """
-        if isinstance(var, str):
-            return _env.Environment_Lookup_str(self, var)
-        else:
-            return _env.Environment_Lookup(self, var)
+        return _env.Environment_GetGlobalVar(self, name)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 9807fab45089..36116d07d601 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -28,9 +28,6 @@ def checked_type(self):
                              " the checked_type for this node")
         return ret
 
-    def __call__(self, *args):
-        return Call(self, args, None, None)
-
 
 @register_relay_node
 class Constant(Expr):
@@ -57,6 +54,14 @@ class Tuple(Expr):
     def __init__(self, fields):
         self.__init_handle_by_constructor__(_make.Tuple, fields)
 
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError("Tuple index out of range")
+        return self.fields[index]
+
+    def __len__(self):
+        return len(self.fields)
+
 
 @register_relay_node
 class Var(Expr):
@@ -95,6 +100,16 @@ class GlobalVar(Expr):
     def __init__(self, name_hint):
         self.__init_handle_by_constructor__(_make.GlobalVar, name_hint)
 
+    def __call__(self, *args):
+        """Invoke the gobal function.
+
+        Parameters
+        ----------
+        args: List[relay.Expr]
+            Arguments.
+        """
+        return Call(self, args, None, None)
+
 
 @register_relay_node
 class Function(Expr):
@@ -126,6 +141,16 @@ def __init__(self,
         self.__init_handle_by_constructor__(
             _make.Function, params, body, ret_type, type_params)
 
+    def __call__(self, *args):
+        """Invoke the gobal function.
+
+        Parameters
+        ----------
+        args: List[relay.Expr]
+            Arguments.
+        """
+        return Call(self, args, None, None)
+
 
 @register_relay_node
 class Call(Expr):
@@ -238,11 +263,17 @@ def asnode(self):
 
         return self.tuple_value
 
-    def __getitem__(self, key):
-        return self.tuple_value.fields[key]
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError("Tuple index out of range")
+        return TupleGetItem(self.tuple_value, index)
 
     def __len__(self):
-        return len(self.tuple_value.fields)
+        return self.size
+
+    def __repr__(self):
+        return ("TupleWrapper(" + self.tuple_value.__repr__() +
+                ", " + self.size + ")")
 
 
 def var(name_hint,
@@ -304,13 +335,27 @@ def const(value, dtype=None):
 
     dtype: str, optional
         The data type of the value.
+
+    Note
+    ----
+    When dtype is None, we use the following rule:
+
+    - int maps to "int32"
+    - float maps to "float32"
+    - bool maps to "bool"
+    - other using the same default rule as numpy.
     """
-    if isinstance(value, _base.numeric_types):
-        value = _np.array(value, dtype=dtype)
-    elif isinstance(value, (bool, list)):
+    if isinstance(value, (_base.numeric_types, (bool, list))):
         value = _np.array(value, dtype=dtype)
+        # convert default to int32 and float32
+        if dtype is None:
+            if value.dtype == "float64":
+                value = value.astype("float32")
+            elif value.dtype == "int64":
+                value = value.astype("int32")
     if isinstance(value, (_np.ndarray, _np.generic)):
         value = _nd.array(value)
+
     if not isinstance(value, _nd.NDArray):
         raise ValueError("value has to be scalar or NDArray")
     return Constant(value)
diff --git a/python/tvm/relay/ir_builder.py b/python/tvm/relay/ir_builder.py
deleted file mode 100644
index d2771926e58f..000000000000
--- a/python/tvm/relay/ir_builder.py
+++ /dev/null
@@ -1,387 +0,0 @@
-# pylint: disable=no-else-return
-"""IR builder for the Relay IR.
-
-Enables users to construct Relay programs with a Python API.
-"""
-from collections import OrderedDict
-import numpy as np
-import tvm
-from .ty import Type, FuncType, TensorType
-from .expr import Expr, Constant, Let, Var, Function, If
-from .env import Environment
-
-
-def _convert_to_value(arg, ctxt=tvm.cpu(0)):
-    # type: (Any, tvm.Context) -> tvm.nd.NDArray
-    """Convert Python values into the appropriate types
-       for the Relay evaluator.
-    """
-    if isinstance(arg, bool): # bool is subclass of int
-        return tvm.nd.array(np.array(arg, dtype='uint8'), ctxt)
-    elif isinstance(arg, int):
-        return tvm.nd.array(np.array(arg, dtype='int32'), ctxt)
-    elif isinstance(arg, float):
-        return tvm.nd.array(arg, ctxt)
-    elif isinstance(arg, np.ndarray):
-        return tvm.nd.array(arg, ctxt)
-    elif isinstance(arg, tvm.ndarray.NDArray):
-        return arg
-    else:
-        # raise Exception(f"can't convert {type(arg)} to a Relay AST")
-        raise Exception("unsupported argument type {0}".format(type(arg)))
-
-
-def _convert_type(rtype):
-    if isinstance(rtype, str):
-        return scalar_type(rtype)
-    elif isinstance(rtype, Type):
-        return rtype
-    else:
-        raise Exception(
-            "unsupported conversion to Relay type {0}".format(type(rtype)))
-
-
-def convert(arg):
-    # type: (Any) -> Expr
-    """Convert some Python objects into a Relay AST fragment.
-
-    Parameters
-    ----------
-    arg: Any
-        The Python object
-
-    Returns
-    -------
-    expr: relay.Expr
-        The converted expression.
-    """
-    if isinstance(arg, Expr):
-        return arg
-    elif isinstance(arg, tuple):
-        return relay.Tuple([convert(el) for el in arg])
-    elif isinstance(arg, PartialFunc):
-        return arg.to_func()
-    elif isinstance(arg, tvm._ffi.node.NodeGeneric):
-        return arg.asnode()
-    else:
-        value = _convert_to_value(arg)
-        return Constant(value)
-
-
-class WithScope(object):
-    """A wrapper for builder methods which introduce scoping."""
-
-    def __init__(self, enter_value, exit_cb):
-        self._enter_value = enter_value
-        self._exit_cb = exit_cb
-
-    def __enter__(self):
-        return self._enter_value
-
-    def __exit__(self, ptype, value, trace):
-        if value:
-            raise value
-        else:
-            self._exit_cb()
-
-
-class PartialFunc(object):
-    """A wrapper around functions while they are being built.
-
-      Used by the builder as a user is building up a function,
-      allows Function nodes which contain partially initialized
-      state.
-    """
-
-    def __init__(self, params, ret_type, body, type_params):
-        self.params = params
-        self.ret_type = ret_type
-        self.body = body
-        self.type_params = type_params
-
-    def param_ids(self):
-        return [p for p in self.params]
-
-    def to_func(self):
-        """Converts a PartialFunc into a :py:class:`~relay.Function`."""
-        return Function(
-            self.params,
-            self.body,
-            self.ret_type,
-            self.type_params)
-
-#pylint: disable=invalid-name
-
-
-def _mk_let(bindings, ret_value):
-    let_expr = ret_value
-    for var, value in reversed(list(bindings.items())):
-        let_expr = Let(var, value, let_expr)
-    return let_expr
-
-
-class IRBuilder(object):
-    """The IRBuilder class.
-
-    Enables users to build up a Relay environment and program.
-
-    Examples
-    --------
-
-    Program:
-       fn (x : Tensor[f32, (10, 10)]) {
-         let t1 = log(x);
-         let t2 = add(t1, x);
-         return t1;
-       }
-
-    ..code-block: python
-        b = IRBuilder()
-        with b.function(('x', tensor_type(10, 10))) as func:
-            x, = func.param_ids()
-            t1 = b.let('t1', log(x))
-            t2 = b.let('t2', add(t1, x))
-            b.ret(t2)
-    """
-
-    def __init__(self):
-        self.bindings = [OrderedDict({})]
-        self.scopes = [OrderedDict({})]
-        self.params = []
-        self.ret_values = [None]
-        self.env = Environment({})
-
-    def enter_scope(self, params=None):
-        if not params:
-            params = []
-
-        self.bindings.append(OrderedDict({}))
-        self.scopes.append(OrderedDict({}))
-        self.params.append(params)
-        self.ret_values.append(None)
-
-    def exit_scope(self):
-        bindings = self.bindings.pop()
-        scopes = self.scopes.pop()
-        params = self.params.pop()
-        ret_value = self.ret_values.pop()
-        return bindings, scopes, params, ret_value
-
-    #pylint: disable=invalid-name
-    def bind(self, name, value, ty):
-        lv = Var(name, ty)
-        self.scopes[-1][name] = lv
-        self.bindings[-1][lv] = value
-        return lv
-
-    def let(self, name, value, value_type=None):
-        if not isinstance(value, Expr):
-            value = convert(value)
-
-        return self.bind(name, value, value_type)
-
-    def _convert_params(self, raw_params):
-        relay_params = []
-        for raw_param in raw_params:
-            if isinstance(raw_param, Var):
-                param = raw_param
-            elif isinstance(raw_param, tuple):
-                var, ty = raw_param
-                ty = _convert_type(ty)
-                param = Var(var, ty)
-            elif isinstance(raw_param, str):
-                param = Var(raw_param, None)
-            else:
-                raise Exception("unknown parameter type")
-
-            self.scopes[-1][param.name_hint] = param
-            relay_params.append(param)
-
-        return relay_params
-
-    def function(self, *params):
-        """Construct a Relay function."""
-
-        relay_params = self._convert_params(params)
-
-        self.enter_scope()
-
-        pfunc = PartialFunc(relay_params, None, None, [])
-
-        def _on_exit():
-            bindings, _, _, ret_value = self.exit_scope()
-            body = _mk_let(bindings, ret_value)
-            pfunc.body = body
-
-        return WithScope(pfunc, _on_exit)
-
-    def ret(self, x):
-        """Set `x` to be the return value of the current function."""
-        if not self.ret_values[-1]:
-            self.ret_values[-1] = convert(x)
-        else:
-            raise Exception(
-                "return value already set, a function can only have one return value")
-
-    def if_scope(self, cond):
-        """Construct the if branch an if expression with scoping."""
-        self.enter_scope()
-
-        def _on_exit():
-            bindings, _, _, ret_value = self.exit_scope()
-            assert self.ret_values[-1] is None
-            true_branch = _mk_let(bindings, ret_value)
-            self.ret_values[-1] = If(cond, true_branch, None)
-
-        return WithScope(10, _on_exit)
-
-    def else_scope(self):
-        """Construct the else branch of an if expression with scoping."""
-        self.enter_scope()
-
-        def _on_exit():
-            bindings, _, _, ret_value = self.exit_scope()
-            partial_if = self.ret_values[-1]
-            assert isinstance(
-                partial_if, If) and partial_if.false_branch is None
-            false_branch = _mk_let(bindings, ret_value)
-            self.ret_values[-1] = If(
-                partial_if.cond,
-                partial_if.true_branch,
-                false_branch)
-
-        return WithScope(10, _on_exit)
-
-    def param(self, name, ty=None):
-        if not ty:
-            ty = scalar_type('float32')
-        else:
-            ty = _convert_type(ty)
-
-        return Var(name, ty)
-
-    def global_var(self, name):
-        # type: (str) -> GlobalVar
-        """Construct a global var with `name` as its name hint.
-
-        Parameters
-        ----------
-        name: str
-            The name of the global variable.
-
-        Returns
-        -------
-        global_var: relay.GlobalVar
-            The global variable with `name`.
-
-        """
-        return self.env.global_var(name)
-
-    def decl(self, name, *params, **kwargs):
-        """Create a global function.
-
-        Parameters
-        ----------
-        name: str or GlobalVar
-            The name of the function.
-        params: params
-            The parameters of the function.
-
-        Returns
-        -------
-        with_scope: Scope for the function.
-        """
-
-        ret_type = kwargs.get('ret_type', None)
-
-        self.enter_scope()
-
-        def _on_exit():
-            bindings, _, _, ret_value = self.exit_scope()
-            exp = _mk_let(bindings, ret_value)
-            self.env.add(name, Function(params, exp, ret_type))
-
-        return WithScope(10, _on_exit)
-
-    def get(self):
-        """Get the full program.
-
-        Returns
-        ----------
-        (prog, env) : (relay.Expr, relay.Environment)
-            A pair of the partial program, and the modified environment.
-        """
-        bindings = self.bindings.pop()
-        scope = self.scopes.pop()
-
-        if self.bindings:
-            raise Exception("IRBuilder: binding error")
-
-        if self.scopes:
-            raise Exception("IRBuilder: scoping error")
-
-        if bindings and scope and not self.ret_values:
-            raise Exception("IRBuilder: no return value set")
-
-        return _mk_let(bindings, self.ret_values[-1]), self.env
-
-
-def scalar_type(dtype):
-    """Construct a Relay scalar type.
-
-    Parameters
-    ----------
-    dtype: dtype
-        The dtype of the scalar type.
-
-    Returns:
-    scalar_type: relay.Type
-        The scalar type.
-    """
-    return TensorType(tvm.convert([]), dtype)
-
-
-def tensor_type(*shape, **kwargs):
-    """Construct a Relay Tensor type.
-
-    Parameters
-    ----------
-    shape: list of tvm.Expr
-        The shape of the Tensor type.
-    dtype: dtype
-        The dtype of the Tensor type.
-
-    Returns
-    -------
-    tensor_type: relay.Type
-        The resulting tensor types.
-    """
-    dtype = kwargs.get('dtype', 'float32')
-
-    return TensorType(tvm.convert(shape), dtype)
-
-
-def func_type(args, ret_type, type_params=None):
-    """Construct a Relay function type.
-
-    Parameters
-    ----------
-    args: list of relay.Type
-        The argument types.
-
-    ret_type: relay.Type
-        The return type.
-
-    type_params: list of relay.TypeParam
-        The type parameters.
-
-    Returns
-    -------
-    func_type: The function type.
-    """
-    if not type_params:
-        type_params = []
-
-    args = [_convert_type(arg) for arg in args]
-    ret_type = _convert_type(ret_type)
-    return FuncType(args, ret_type, type_params, [])
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index cbb7095e2f17..549203d12c9f 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -2,37 +2,39 @@
 # pylint: disable=unidiomatic-typecheck
 """The set of passes for Relay.
 
-Exposes an interface for configuring the passes and scripting
-them in Python.
+Exposes an interface for configuring the passes and
+scripting them in Python.
 """
 from . import _ir_pass
 from . import _make
 # pylint: disable=invalid-name
 
-def infer_type(env, expr):
+def infer_type(expr, env=None):
     """Infer the type of expr under the context of env.
 
     Parameters
     ----------
-    env : relay.Environment
+    expr: tvm.relay.Expr
+      The input expression.
+
+    env: Optional[tvm.relay.Environment]
       The global environment.
 
-    expr : relay.Expr
-      The input expression.
 
     Returns
     -------
-    checked_expr : relay.Expr
+    checked_expr : tvm.relay.Expr
       The checked expression.
     """
-    return _ir_pass.infer_type(env, expr)
+    return _ir_pass.infer_type(expr, env)
 
-def well_formed(e):
+
+def well_formed(expr):
     """Check that each Var is only bound once (well formed).
 
     Parameters
     ----------
-    e: relay.Expr
+    expr: tvm.relay.Expr
       The input expression
 
     Returns
@@ -40,7 +42,8 @@ def well_formed(e):
     well_form : bool
       whether the input expression is well formed
     """
-    return _ir_pass.well_formed(e)
+    return _ir_pass.well_formed(expr)
+
 
 def check_kind(t, env=None):
     """Check that the type is well kinded.
@@ -48,10 +51,10 @@ def check_kind(t, env=None):
 
     Parameters
     ----------
-    t: relay.Type
+    t: tvm.relay.Type
       The type to check
 
-    env: relay.Environment, optional
+    env: tvm.relay.Environment, optional
       The global environment
 
     Returns
@@ -71,61 +74,65 @@ def check_kind(t, env=None):
     else:
         return _ir_pass.check_kind(t)
 
+
 def free_vars(e):
     """Get free variables from expression e.
 
     Parameters
     ----------
-    e: relay.Expr
+    e: tvm.relay.Expr
       The input expression
 
     Returns
     -------
-    free : List[relay.Var]
-      the list of free variables
+    free : List[tvm.relay.Var]
+        The list of free variables
     """
     return _ir_pass.free_vars(e)
 
-def free_type_vars(e):
+
+def free_type_vars(expr):
     """Get free type variables from expression/type e
 
     Parameters
     ----------
-    e: relay.Expr/relay.Type
-      The input expression/type
+    expr: Union[tvm.relay.Expr,tvm.relay.Type]
+        The input expression/type
 
     Returns
     -------
-    free : List[relay.TypeParam]
-      the list of free type variables
+    free : List[tvm.relay.TypeParam]
+        The list of free type variables
     """
-    return _ir_pass.free_type_vars(e)
+    return _ir_pass.free_type_vars(expr)
 
-def dead_code_elimination(e):
+
+def dead_code_elimination(expr):
     """ Remove expressions which does not effect the program result (dead code).
 
     Parameters
     ----------
-    e: relay.Expr
-      The input Expression
+    e: tvm.relay.Expr
+        The input Expression
 
     Returns
     -------
-    result: relay.Expr
-      An expression which is semantically equal to the input expression,
-      but with dead code removed.
+    result: tvm.relay.Expr
+        An expression which is semantically equal to the input expression,
+        but with dead code removed.
     """
-    return _ir_pass.dead_code_elimination(e)
+    return _ir_pass.dead_code_elimination(expr)
+
 
 def alpha_equal(lhs, rhs):
     """Compare two Relay expr for structural equivalence (alpha equivalence).
 
     Parameters
     ----------
-    lhs: relay.Expr
+    lhs: tvm.relay.Expr
       One of the input Expression.
 
-    rhs: relay.Expr
+    rhs: tvm.relay.Expr
       One of the input Expression.
 
     Returns
diff --git a/python/tvm/relay/scope_builder.py b/python/tvm/relay/scope_builder.py
new file mode 100644
index 000000000000..641566946f58
--- /dev/null
+++ b/python/tvm/relay/scope_builder.py
@@ -0,0 +1,185 @@
+"""The scope builder interface """
+from __future__ import absolute_import
+
+from . import expr as _expr
+from .._ffi import base as _base
+
+class WithScope(object):
+    """A wrapper for builder methods which introduce scoping.
+
+    Parameters
+    ----------
+    enter_value: object
+        The value returned by enter.
+    """
+
+    def __init__(self, enter_value, exit_cb):
+        self._enter_value = enter_value
+        self._exit_cb = exit_cb
+
+    def __enter__(self):
+        return self._enter_value
+
+    def __exit__(self, ptype, value, trace):
+        if value:
+            raise value
+        else:
+            self._exit_cb()
+
+
+def _make_lets(bindings, ret_value):
+    """Make a nested let expressions.
+
+    Parameters
+    ----------
+    bindings: List[Tuple[tvm.relay.Var,tvm.relay.Expr]]
+        The sequence of let bindings
+
+    ret_value: tvm.relay.Expr
+        The final value of the expression.
+
+    Returns
+    -------
+    lets: tvm.relay.Expr
+        A nested let expression.
+    """
+    if ret_value is None:
+        raise RuntimeError("ret is not called in this scope")
+    if isinstance(ret_value, _expr.If) and ret_value.false_branch is None:
+        raise RuntimeError("Creating an If expression without else.")
+    let_expr = ret_value
+    for var, value in reversed(bindings):
+        let_expr = _expr.Let(var, value, let_expr)
+    return let_expr
+
+
+class ScopeBuilder(object):
+    """Scope builder class.
+
+    Enables users to build up a nested
+    scope(let, if) expression easily.
+
+    Examples
+    --------
+    ..code-block: python
+
+        sb = relay.ScopeBuilder()
+        cond = relay.var("cond", 'bool')
+        x = relay.var("x")
+        y = relay.var("y")
+
+        with sb.if_scope(cond):
+            one = relay.const(1, "float32")
+            t1 = sb.let(t1, relay.add(x, one))
+            sb.ret(t1)
+        with sb.else_scope():
+            sb.ret(y)
+
+        print(sb.get().astext())
+    """
+    def __init__(self):
+        self._bindings = [[]]
+        self._ret_values = [None]
+
+    def _enter_scope(self):
+        self._bindings.append([])
+        self._ret_values.append(None)
+
+    def _exit_scope(self):
+        bindings = self._bindings.pop()
+        ret_value = self._ret_values.pop()
+        return bindings, ret_value
+
+    def let(self, var, value):
+        """Create a new let binding.
+
+        Parameters
+        ----------
+        var: Union[Tuple[str, relay.Type], tvm.relay.Var]
+            The variable or name of variable.
+
+        value: tvm.relay.Expr
+            The value to be binded
+        """
+        if isinstance(var, (tuple, list)):
+            if len(var) > 2:
+                raise ValueError("Expect var to be Tuple[str, relay.Type]")
+            var = _expr.var(*var)
+        elif isinstance(var, _base.string_types):
+            var = _expr.var(var)
+        self._bindings[-1].append((var, value))
+        return var
+
+    def if_scope(self, cond):
+        """Create a new if scope.
+
+        Parameters
+        ----------
+        cond: tvm.relay.Expr
+            The condition
+
+        Returns
+        -------
+        scope: WithScope
+            The if scope.
+
+        Note
+        ----
+        The user must follows with an else scope.
+        """
+        self._enter_scope()
+        def _on_exit():
+            bindings, ret_value = self._exit_scope()
+            if self._ret_values[-1] is not None:
+                raise RuntimeError("result already returned before if scope")
+            true_branch = _make_lets(bindings, ret_value)
+            self._ret_values[-1] = _expr.If(cond, true_branch, None)
+        return WithScope(None, _on_exit)
+
+    def else_scope(self):
+        """Create a new else scope.
+
+        Returns
+        -------
+        scope: WithScope
+            The if scope.
+        """
+        self._enter_scope()
+
+        def _on_exit():
+            bindings, ret_value = self._exit_scope()
+            partial_if = self._ret_values[-1]
+            no_else = (not isinstance(partial_if, _expr.If) or
+                       partial_if.false_branch is not None)
+            if no_else:
+                raise RuntimeError("else scope must follows")
+            false_branch = _make_lets(bindings, ret_value)
+            self._ret_values[-1] = _expr.If(
+                partial_if.cond,
+                partial_if.true_branch,
+                false_branch)
+        return WithScope(None, _on_exit)
+
+    def ret(self, value):
+        """Set the return value of this scope.
+
+        Parameters
+        ----------
+        value: tvm.relay.Expr
+            The return value.
+        """
+        if self._ret_values[-1] is not None:
+            raise RuntimeError("ret value is already set in this scope.")
+        self._ret_values[-1] = value
+
+    def get(self):
+        """Get the generated result.
+
+        Returns
+        -------
+        value: tvm.relay.Expr
+            The final result of the expression.
+        """
+        if len(self._bindings) != 1:
+            raise RuntimeError("can only call get at the outmost scope")
+        return _make_lets(self._bindings[-1], self._ret_values[-1])
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 34bd60ea08bb..824b0f20e281 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -56,7 +56,7 @@ class Kind(IntEnum):
     Shape = 3
 
 @register_relay_node
-class TypeParam(Type):
+class TypeVar(Type):
     """A type parameter used for generic types in Relay,
     see tvm/relay/type.h for more details.
 
@@ -66,7 +66,7 @@ class TypeParam(Type):
     """
 
     def __init__(self, var, kind=Kind.Type):
-        """Construct a TypeParam.
+        """Construct a TypeVar.
 
         Parameters
         ----------
@@ -78,10 +78,10 @@ def __init__(self, var, kind=Kind.Type):
 
         Returns
         -------
-        type_param: TypeParam
+        type_param: TypeVar
             The type parameter.
         """
-        self.__init_handle_by_constructor__(_make.TypeParam, var, kind)
+        self.__init_handle_by_constructor__(_make.TypeVar, var, kind)
 
 
 @register_relay_node
@@ -122,26 +122,30 @@ class FuncType(Type):
 
     We informally write them as:
     `forall (type_params), (arg_types) -> ret_type where type_constraints`
+
+    Parameters
+    ----------
+    arg_types: List[tvm.relay.Type]
+        The argument types
+
+    ret_type: tvm.relay.Type
+        The return type.
+
+    type_params: List[tvm.relay.TypeVar]
+        The type parameters
+
+    type_constraints: List[tvm.relay.TypeConstraint]
+        The type constraints.
     """
     def __init__(self,
                  arg_types,
                  ret_type,
-                 type_params,
-                 type_constraints):
-        """Construct a function type.
-
-        Parameters
-        ----------
-        arg_types:  list of Type
-        ret_type: Type
-        type_params: list of TypeParam
-        type_constraints: list of TypeConstraint
-
-        Returns
-        -------
-        func_type: FuncType
-            The function type.
-        """
+                 type_params=None,
+                 type_constraints=None):
+        if type_params is None:
+            type_params = []
+        if type_constraints is None:
+            type_constraints = []
         self.__init_handle_by_constructor__(
             _make.FuncType, arg_types, ret_type, type_params, type_constraints)
 
@@ -175,3 +179,21 @@ class TypeRelation(TypeConstraint):
     def __init__(self, func, args, num_inputs, attrs):
         self.__init_handle_by_constructor__(_make.TypeRelation,
                                             func, args, num_inputs, attrs)
+
+
+def scalar_type(dtype):
+    """Creates a scalar type.
+
+    This function returns TensorType((), dtype)
+
+    Parameters
+    ----------
+    dtype : str
+        The content data type.
+
+    Returns
+    -------
+    s_type: tvm.relay.TensorType
+        The result type.
+    """
+    return TensorType((), dtype)
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index 8bda7587f217..6dfaa0b24a53 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -16,87 +16,71 @@ using namespace runtime;
 Environment EnvironmentNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
   auto n = make_node<EnvironmentNode>();
   n->functions = std::move(global_funcs);
+
+  for (const auto& kv : n->functions) {
+    // set gloval var map
+    CHECK(!n->global_var_map_.count(kv.first->name_hint))
+        << "Duplicate global function name " << kv.first->name_hint;
+    n->global_var_map_.Set(kv.first->name_hint, kv.first);
+  }
   return Environment(n);
 }
 
-GlobalVar EnvironmentNode::GetGlobalVar(const std::string &str) {
-  auto global_id = global_map_.find(str);
-  if (global_id != global_map_.end()) {
-    return (*global_id).second;
-  } else {
-    auto id = GlobalVarNode::make(str);
-    this->global_map_.Set(str, id);
-    return id;
-  }
+GlobalVar EnvironmentNode::GetGlobalVar(const std::string& name) {
+  auto it = global_var_map_.find(name);
+  CHECK(it != global_var_map_.end())
+      << "Cannot find global var " << name << " in the Environment";
+  return (*it).second;
 }
 
-/*!
- * \brief Add a new item to the global environment
- * \note if the update flag is not set adding a duplicate
- * definition will trigger an exception, otherwise we will
- * update the definition if and only if it is type compatible.
- */
-void EnvironmentNode::Add(const GlobalVar &var,
-                          const Function &func,
+void EnvironmentNode::Add(const GlobalVar& var,
+                          const Function& func,
                           bool update) {
   // Type check the item before we add it to the environment.
   auto env = GetRef<Environment>(this);
-
-  Expr checked_expr = InferType(env, var, func);
-
-  if (const FunctionNode *func_node = checked_expr.as<FunctionNode>()) {
-    auto checked_func = GetRef<Function>(func_node);
-    auto type = checked_func->checked_type();
-
-    CHECK(type.as<IncompleteTypeNode>() == nullptr);
-
-    if (functions.find(var) != functions.end()) {
-      if (!update) {
-        throw dmlc::Error("already have definition for XXXX.");
-      }
-
-      auto old_type = functions[var].as<FunctionNode>()->checked_type();
-
-      if (!AlphaEqual(type, old_type)) {
-        throw dmlc::Error(
-            "Environment#update changes type, not possible in this mode.");
-      }
-
-      this->functions.Set(var, checked_func);
-    } else {
-      this->functions.Set(var, checked_func);
-    }
-  } else {
-    LOG(FATAL) << "internal error: unknown item type, unreachable code";
+  Function checked_func = InferType(func, env, var);
+  auto type = checked_func->checked_type();
+  CHECK(type.as<IncompleteTypeNode>() == nullptr);
+  if (functions.find(var) != functions.end()) {
+    CHECK(update)
+        << "Already have definition for " << var->name_hint;
+    auto old_type = functions[var].as<FunctionNode>()->checked_type();
+    CHECK(AlphaEqual(type, old_type))
+        << "Environment#update changes type, not possible in this mode.";
   }
+  this->functions.Set(var, checked_func);
+  // set gloval var map
+  CHECK(!global_var_map_.count(var->name_hint))
+      << "Duplicate global function name " << var->name_hint;
+  global_var_map_.Set(var->name_hint, var);
 }
 
-void EnvironmentNode::Update(const GlobalVar &var, const Function &func) {
+void EnvironmentNode::Update(const GlobalVar& var, const Function& func) {
   this->Add(var, func, true);
 }
 
-void EnvironmentNode::Remove(const GlobalVar & var) {
+void EnvironmentNode::Remove(const GlobalVar& var) {
   auto functions_node = this->functions.CopyOnWrite();
   functions_node->data.erase(var.node_);
+  auto gvar_node = global_var_map_.CopyOnWrite();
+  gvar_node->data.erase(var->name_hint);
 }
 
-Function EnvironmentNode::Lookup(const GlobalVar &var) {
-  auto func = functions.find(var);
-  if (func != functions.end()) {
-    return (*func).second;
-  } else {
-    throw Error(std::string("there is no definition of ") + var->name_hint);
-  }
+Function EnvironmentNode::Lookup(const GlobalVar& var) {
+  auto it = functions.find(var);
+  CHECK(it != functions.end())
+      << "There is no definition of " << var->name_hint;
+  return (*it).second;
 }
 
-Function EnvironmentNode::Lookup(const std::string &str) {
-  GlobalVar id = this->GetGlobalVar(str);
+Function EnvironmentNode::Lookup(const std::string &name) {
+  GlobalVar id = this->GetGlobalVar(name);
   return this->Lookup(id);
 }
 
-void EnvironmentNode::Merge(const Environment &env) {
+void EnvironmentNode::Update(const Environment &env) {
   for (auto pair : env->functions) {
-    this->functions.Set(pair.first, pair.second);
+    this->Update(pair.first, pair.second);
   }
 }
 
@@ -134,10 +118,10 @@ TVM_REGISTER_API("relay._env.Environment_Lookup_str")
     *ret = env->Lookup(var);
   });
 
-TVM_REGISTER_API("relay._env.Environment_Merge")
+TVM_REGISTER_API("relay._env.Environment_Update")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     Environment env = args[0];
-    env->Merge(args[1]);
+    env->Update(args[1]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index a1d274e3a78e..2d373b769559 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -104,7 +104,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 Function FunctionNode::make(tvm::Array<Var> params,
                             Expr body,
                             Type ret_type,
-                            tvm::Array<TypeParam> type_params) {
+                            tvm::Array<TypeVar> type_params) {
   NodePtr<FunctionNode> n = make_node<FunctionNode>();
   n->params = std::move(params);
   n->body = std::move(body);
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 26d9939aae10..a7367c384cb3 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -66,11 +66,11 @@ Expr ExprMutator::VisitExpr_(const TupleNode* op) {
 }
 
 Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
-  tvm::Array<TypeParam> ty_params;
+  tvm::Array<TypeVar> ty_params;
   bool all_ty_params_changed = true;
 
   for (auto ty_param : op->type_params) {
-    TypeParam new_ty_param = Downcast<TypeParam>(VisitType(ty_param));
+    TypeVar new_ty_param = Downcast<TypeVar>(VisitType(ty_param));
     ty_params.push_back(new_ty_param);
     all_ty_params_changed &= new_ty_param.same_as(ty_param);
   }
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 6e3c3454e97b..5bbcb0608e6f 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -217,6 +217,8 @@ class TextPrinter :
         return ConstScalar(dtype, static_cast<const float*>(op->data->data));
       } else if (dtype == Float(64)) {
         return ConstScalar(dtype, static_cast<const double*>(op->data->data));
+      } else if (dtype == Bool()) {
+        return ConstScalar(dtype, static_cast<const uint8_t*>(op->data->data));
       }
     }
     // default fall-back, record it as meta node.
@@ -638,8 +640,14 @@ class TextPrinter :
    * \return The corresponding name.
    */
   TextValue AllocVarName(const Var& var) {
-    std::string name = GetUniqueName('%' + var->name_hint);
-    TextValue val(name);
+    std::string name = var->name_hint;
+    // always make sure first name is alpha
+    if (name.length() != 0 && !std::isalpha(name[0])) {
+      name = "%v" + name;
+    } else {
+      name = "%" + name;
+    }
+    TextValue val(GetUniqueName(name));
     CHECK(!memo_.count(var));
     memo_[var] = val;
     return val;
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index f45ab3b4c9a7..39347adced92 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -36,30 +36,30 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
   p->stream << "TensorType(" << node->shape << ", " << node->dtype << ")";
 });
 
-TypeParam TypeParamNode::make(std::string name, TypeParamNode::Kind kind) {
-  NodePtr<TypeParamNode> n = make_node<TypeParamNode>();
+TypeVar TypeVarNode::make(std::string name, TypeVarNode::Kind kind) {
+  NodePtr<TypeVarNode> n = make_node<TypeVarNode>();
   n->var = tvm::Var(name);
   n->kind = std::move(kind);
-  return TypeParam(n);
+  return TypeVar(n);
 }
 
-TVM_REGISTER_NODE_TYPE(TypeParamNode);
+TVM_REGISTER_NODE_TYPE(TypeVarNode);
 
-TVM_REGISTER_API("relay._make.TypeParam")
+TVM_REGISTER_API("relay._make.TypeVar")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
   int kind = args[1];
   *ret =
-    TypeParamNode::make(args[0], static_cast<TypeParamNode::Kind>(kind));
+    TypeVarNode::make(args[0], static_cast<TypeVarNode::Kind>(kind));
     });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TypeParamNode>([](const TypeParamNode *node,
+.set_dispatch<TypeVarNode>([](const TypeVarNode *node,
                                     tvm::IRPrinter *p) {
-  p->stream << "TypeParamNode(" << node->var->name_hint << ", "
+  p->stream << "TypeVarNode(" << node->var->name_hint << ", "
     << node->kind << ")";
 });
 
-IncompleteType IncompleteTypeNode::make(TypeParamNode::Kind kind) {
+IncompleteType IncompleteTypeNode::make(TypeVarNode::Kind kind) {
   auto n = make_node<IncompleteTypeNode>();
   n->kind = std::move(kind);
   return IncompleteType(n);
@@ -70,7 +70,7 @@ TVM_REGISTER_NODE_TYPE(IncompleteTypeNode);
 TVM_REGISTER_API("relay._make.IncompleteType")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     int kind = args[0];
-    *ret = IncompleteTypeNode::make(static_cast<TypeParamNode::Kind>(kind));
+    *ret = IncompleteTypeNode::make(static_cast<TypeVarNode::Kind>(kind));
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
@@ -82,7 +82,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 
 FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
                             Type ret_type,
-                            tvm::Array<TypeParam> type_params,
+                            tvm::Array<TypeVar> type_params,
                             tvm::Array<TypeConstraint> type_constraints) {
   NodePtr<FuncTypeNode> n = make_node<FuncTypeNode>();
   n->arg_types = std::move(arg_types);
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
index e6d60f9344a1..b4984becdf8b 100644
--- a/src/relay/op/image/resize.cc
+++ b/src/relay/op/image/resize.cc
@@ -78,6 +78,7 @@ RELAY_REGISTER_OP("image.resize")
            for layout NHWC
            (batch_size, size[0], size[1], channels)
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ResizeAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(5)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 4a8df2c80ec3..8a7cffd2cd27 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -247,6 +247,8 @@ RELAY_REGISTER_UNARY_OP("relay.op.nn._make.", "relu")
 
 
 // Positional relay function to create LRN operator used by frontend FFI.
+TVM_REGISTER_NODE_TYPE(LRNAttrs);
+
 Expr MakeLRN(Expr data,
              IndexExpr size,
              IndexExpr axis,
@@ -290,6 +292,8 @@ centered at that value (zero padding is added where necessary).
 
 
 // Positional relay function to create L2Normalize operator used by frontend FFI.
+TVM_REGISTER_NODE_TYPE(L2NormalizeAttrs);
+
 Expr MakeL2Normalize(Expr data,
                      double eps,
                      Array<IndexExpr> axis) {
@@ -315,6 +319,7 @@ Normalizes along dimension axis using an L2 norm
 
 - **data**: The input tensor.
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.L2NormalizeAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index b67bb96c64a9..da7db042178e 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -77,6 +77,7 @@ RELAY_REGISTER_OP("nn.pad")
 .describe(R"code(Pad for n-D tensor.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.PadAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 665eaf6de880..8c989ac91237 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -12,6 +12,7 @@ namespace tvm {
 namespace relay {
 
 TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs);
+TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs);
 
 template <typename AttrTtype>
 bool Pool2DRel(const Array<Type>& types,
@@ -115,6 +116,7 @@ RELAY_REGISTER_OP("nn.max_pool2d")
            equation.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MaxPool2DAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
@@ -169,6 +171,7 @@ Average pooling operation for one dimensional data.
            equation.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.AvgPool2DAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
@@ -232,6 +235,7 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d")
            (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.GlobalPool2DAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
@@ -261,6 +265,7 @@ RELAY_REGISTER_OP("nn.global_max_pool2d")
            (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.GlobalPool2DAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
index a429a7c40e82..45bedd73c4c0 100644
--- a/src/relay/op/nn/upsampling.cc
+++ b/src/relay/op/nn/upsampling.cc
@@ -78,6 +78,7 @@ RELAY_REGISTER_OP("nn.upsampling")
            (batch_size, in_height*scale, in_width*scale, channels)
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.UpSamplingAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index d2ec24688633..017ef1e5dfec 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -199,7 +199,7 @@ RELAY_REGISTER_REDUCE_OP("argmax")
 values over a given axis.
 
 )code" TVM_ADD_FILELINE)
-.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
 .add_type_rel("ArgReduce", ArgReduceRel);
 
@@ -209,7 +209,7 @@ RELAY_REGISTER_REDUCE_OP("argmin")
 values over a given axis.
 
 )code" TVM_ADD_FILELINE)
-.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
 .add_type_rel("ArgReduce", ArgReduceRel);
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index ea67199f4760..61ee2778d0a2 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -144,12 +144,14 @@ RELAY_REGISTER_OP("concatenate")
 - **axis** : The axis along which the tensors are concatenated.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ConcatenateAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input list of tensors.")
 .set_support_level(1)
 .add_type_rel("Concatenate", ConcatenateRel);
 
 /* relay.transpose */
+TVM_REGISTER_NODE_TYPE(TransposeAttrs);
 
 bool TransposeRel(const Array<Type>& types,
                   int num_inputs,
@@ -224,12 +226,15 @@ RELAY_REGISTER_OP("transpose")
 
 )code" TVM_ADD_FILELINE)
 .set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.TransposeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
 .add_type_rel("Transpose", TransposeRel);
 
 /* relay.reshape */
 
+TVM_REGISTER_NODE_TYPE(ReshapeAttrs);
+
 bool ReshapeRel(const Array<Type>& types,
                 int num_inputs,
                 const Attrs& attrs,
@@ -310,6 +315,7 @@ Example::
 
 )code" TVM_ADD_FILELINE)
 .set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ReshapeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
 .add_type_rel("Reshape", ReshapeRel);
@@ -397,12 +403,14 @@ Examples::
                               [ 4., 3.]]
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.TakeAttrs")
 .set_num_inputs(2)
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("indices", "Tensor", "The indices tensor.")
 .set_support_level(2)
 .add_type_rel("Take", TakeRel);
 
+// Init ops
 TVM_REGISTER_NODE_TYPE(InitOpAttrs);
 
 bool FullRel(const Array<Type>& types,
@@ -448,6 +456,7 @@ RELAY_REGISTER_OP("full")
 .describe(R"code(Fill array with scalar value.
 
 )code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
 .set_num_inputs(1)
 .add_argument("fill_value", "double", "The value to fill.")
 .set_support_level(3)
@@ -634,6 +643,10 @@ Examples::
 .set_support_level(4)
 .add_type_rel("Where", WhereRel);
 
+
+// Squeeze
+TVM_REGISTER_NODE_TYPE(SqueezeAttrs);
+
 Expr MakeSqueeze(Expr data,
                  Array<IndexExpr> axes) {
   auto attrs = make_node<SqueezeAttrs>();
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 169ef35474e2..467c0fcde860 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -7,6 +7,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/logging.h>
 #include <tvm/relay/op.h>
+#include <tvm/ir_pass.h>
 #include <numeric>
 #include "./type_relations.h"
 
@@ -21,14 +22,6 @@ TensorType ToTensorType(const Type& t) {
   }
 }
 
-// TODO(@jroesch) what size value do we extract, 64bit or 32bit?
-int ToInt(const tvm::Expr& e) {
-  CHECK(e.defined());
-  auto imm = e.as<tvm::ir::IntImm>();
-  CHECK(imm) << "TYPE: " << imm << imm->type << std::endl;
-  return imm->value;
-}
-
 bool IdentityRel(const Array<Type>& types,
                  int num_inputs,
                  const Attrs& attrs,
@@ -39,72 +32,54 @@ bool IdentityRel(const Array<Type>& types,
   return true;
 }
 
+bool EqualCheck(const IndexExpr& lhs,
+                const IndexExpr& rhs) {
+  IndexExpr diff = lhs - rhs;
+  if (const int64_t* pdiff = as_const_int(diff)) {
+    return pdiff[0] == 0;
+  }
+  // symbolic
+  diff = tvm::ir::CanonicalSimplify(diff);
+  if (const int64_t* pdiff = as_const_int(diff)) {
+    return pdiff[0] == 0;
+  }
+  return false;
+}
+
+bool EqualConstInt(const IndexExpr& lhs, int64_t value) {
+  if (const int64_t* pvalue = as_const_int(lhs)) {
+    return pvalue[0] == value;
+  }
+  return false;
+}
+
 Type ConcreteBroadcast(const TensorType& t1,
                        const TensorType& t2,
                        DataType output_dtype) {
-  RELAY_LOG(INFO) << "ConcreteBroadcast: t1=" << t1 << " t2=" << t2
-                  << std::endl;
-  auto sh1 = t1->shape;
-  auto sh2 = t2->shape;
-  RELAY_LOG(INFO) << "ConcreteBroadcast: sh1=" << sh1 << " sh2=" << sh2
-                  << std::endl;
-  if (sh1.size() == 0 && sh2.size() == 0) {
-    return TensorTypeNode::make({}, output_dtype);
-    // We have non-zero shapes so broadcast rules apply.
-  } else {
-    auto suffix_len = static_cast<int>(std::min(sh1.size(), sh2.size()));
-    auto full_len = static_cast<int>(std::max(sh1.size(), sh2.size()));
-
-    auto rev_sh1 = sh1.rbegin();
-    auto rev_sh2 = sh2.rbegin();
-
-    while (rev_sh1 != sh1.rend() && rev_sh2 != sh2.rend()) {
-      auto dim1 = ToInt(*rev_sh1);
-      auto dim2 = ToInt(*rev_sh2);
-      if ((dim1 != dim2) && ((dim1 != 1) && (dim2 != 1))) {
-        CHECK(false) << "Dimension mistmatch "
-                     << "dim1: " << dim1 << " dim2: " << dim2 << std::endl;
-      }
-      rev_sh1++;
-      rev_sh2++;
-    }
-
-    Array<IndexExpr> larger;
-    Array<IndexExpr> smaller;
-
-    for (int i = 0; i < (full_len - suffix_len); i++) {
-      smaller.push_back(make_const(tvm::Int(64), 1));
-    }
-
-    if (sh1.size() < sh2.size()) {
-      for (auto sh : sh1) {
-        smaller.push_back(sh);
-      }
-      larger = sh2;
-    } else if (sh1.size() > sh2.size()) {
-      for (auto sh : sh1) {
-        larger.push_back(sh);
-      }
-      smaller = sh2;
+  std::vector<IndexExpr> oshape;
+  size_t ndim1 = t1->shape.size();
+  size_t ndim2 = t2->shape.size();
+  size_t i = 1;
+  for (; i <= std::min(ndim1, ndim2); ++i) {
+    IndexExpr s1 = t1->shape[ndim1 - i];
+    IndexExpr s2 = t2->shape[ndim2 - i];
+    if (EqualCheck(s1, s2)) {
+      oshape.push_back(s1);
+    } else if (EqualConstInt(s1, 1)) {
+      oshape.push_back(s2);
+    } else if (EqualConstInt(s2, 1)) {
+      oshape.push_back(s1);
     } else {
-      larger = sh1;
-      smaller = sh2;
+      LOG(FATAL) << "Incompatible broadcast type " << t1 << " and " << t2;
     }
-
-    CHECK_EQ(larger.size(), smaller.size());
-
-    Array<IndexExpr> out_shape;
-    for (size_t i = 0; i < smaller.size(); i++) {
-      auto left = smaller[i].as<tvm::ir::IntImm>();
-      auto right = larger[i].as<tvm::ir::IntImm>();
-      CHECK(left);
-      CHECK(right);
-      int64_t dim = std::max(left->value, right->value);
-      out_shape.push_back(make_const(tvm::Int(64), dim));
-    }
-
-    return TensorTypeNode::make(out_shape, output_dtype);
   }
+  size_t max_ndim = std::max(ndim1, ndim2);
+  auto& rshape = (ndim1 > ndim2) ? t1->shape : t2->shape;
+  for (; i <= max_ndim; ++i) {
+    oshape.push_back(rshape[max_ndim - i]);
+  }
+  return TensorTypeNode::make(Array<IndexExpr>(
+      oshape.rbegin(), oshape.rend()), output_dtype);
 }
 
 bool BroadcastRel(const Array<Type>& types,
@@ -141,71 +116,5 @@ bool BroadcastCompRel(const Array<Type>& types,
   return false;
 }
 
-/*! \brief Handle concrete concat case from known input to output. */
-inline Type ConcreteConcatRel(const Type& input_type) {
-  if (auto tuple_node = input_type.as<TupleTypeNode>()) {
-    // NB: For now the axis argument is hardwired to be 0.
-    std::vector<int> dims;
-    DataType dtype;
-
-    CHECK_LT(1, tuple_node->fields.size());
-    bool skip_first = true;
-
-    // Collect the suffix dimensions since axis is zero.
-    // TODO(@jroesch): This is a demonstration of how
-    // to do varargs. It requires a little more work to
-    // fully type the behavior of concat.
-
-    auto first = Downcast<TensorType>(tuple_node->fields[0]);
-    dtype = first->dtype;
-
-    for (auto dim_expr : first->shape) {
-      if (!skip_first) {
-        dims.push_back(ToInt(dim_expr));
-      } else {
-        skip_first = false;
-      }
-    }
-
-    std::vector<int> axis_dims;
-    for (auto field_ty : tuple_node->fields) {
-      auto ttype = Downcast<TensorType>(field_ty);
-      for (size_t i = 0; i < ttype->shape.size(); i++) {
-        if (i != 0) {
-          CHECK_EQ(ToInt(dims[i - 1]), ToInt(ttype->shape[i]));
-        } else {
-          axis_dims.push_back(ToInt(ttype->shape[i]));
-        }
-      }
-    }
-
-    auto out_axis_dim = std::accumulate(axis_dims.begin(), axis_dims.end(), 0);
-
-    Array<tvm::Expr> out_shape = { make_const(Int(64), out_axis_dim) };
-
-    for (auto dim : dims) {
-      out_shape.push_back(make_const(Int(64), dim));
-    }
-
-    return TensorTypeNode::make(out_shape, dtype);
-
-  } else {
-    throw TypeRelationError("concat can only be used with a tuple as its argument");
-  }
-}
-
-bool ConcatRel(const Array<Type>& types,
-               int num_inputs,
-               const Attrs& attrs,
-               const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
-  if (types[0].as<TupleTypeNode>()) {
-    reporter->Assign(types[1], ConcreteConcatRel(types[0]));
-    return true;
-  }
-  return false;
-}
-
-
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h
index f6e94e24caa9..534e917a0b6c 100644
--- a/src/relay/op/type_relations.h
+++ b/src/relay/op/type_relations.h
@@ -13,17 +13,6 @@
 
 namespace tvm {
 namespace relay {
-
-/*! \brief The error raised by a type relation.
- *
- * This error is how a type relation signals that it has failed.
- *
- */
-struct TypeRelationError : Error {
-  explicit TypeRelationError(const std::string& msg)
-      : Error(msg) {}
-};
-
 /*!
  * \brief The identity type relation, all the types are equal.
  *
@@ -72,22 +61,6 @@ bool BroadcastCompRel(const Array<Type>& types,
                       const Attrs& attrs,
                       const TypeReporter& reporter);
 
-/*!
- * \brief The concat type relation, implements the concatenating
- *  rule over the list of input types producing one concatenated
- *  type.
- * 
- * \param types The input and output types to the relation.
- * \param num_inputs The number of input arguments.
- * \param attrs The attributes
- * \param reporter The reporter.
- * \return true whether relation has been resolved.
- */
-bool ConcatRel(const Array<Type>& types,
-               int num_inputs,
-               const Attrs& attrs,
-               const TypeReporter& reporter);
-
 }  // namespace relay
 }  // namespace tvm
 
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
index 63e75c0bb213..ce069a78186b 100644
--- a/src/relay/op/vision/multibox_op.cc
+++ b/src/relay/op/vision/multibox_op.cc
@@ -63,6 +63,7 @@ TVM_REGISTER_API("relay.op.vision._make.multibox_prior")
 RELAY_REGISTER_OP("vision.multibox_prior")
 .describe(R"doc("Generate prior(anchor) boxes from data, sizes and ratios."
 )doc" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MultiBoxPriorAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(4)
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 059504efc883..56aeefda78f1 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -34,7 +34,7 @@ bool SameNDArray(const NDArray& lhs, const NDArray& rhs) {
 }
 
 struct TypeAlphaEq : TypeVisitor<const Type&> {
-  tvm::Map<TypeParam, TypeParam> eq_map;
+  tvm::Map<TypeVar, TypeVar> eq_map;
   bool equal;
 
   TypeAlphaEq() : eq_map(), equal(true) {}
@@ -76,10 +76,10 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
     }
   }
 
-  void VisitType_(const TypeParamNode* ti1, const Type& t2) final {
-    if (const TypeParamNode* ti2 = t2.as<TypeParamNode>()) {
-      auto tid1 = GetRef<TypeParam>(ti1);
-      auto tid2 = GetRef<TypeParam>(ti2);
+  void VisitType_(const TypeVarNode* ti1, const Type& t2) final {
+    if (const TypeVarNode* ti2 = t2.as<TypeVarNode>()) {
+      auto tid1 = GetRef<TypeVar>(ti1);
+      auto tid2 = GetRef<TypeVar>(ti2);
 
       // We handle open terms with this rule assuming variables are identical.
       //
diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc
index 5d153c606e63..0d2677e11c67 100644
--- a/src/relay/pass/dead_code.cc
+++ b/src/relay/pass/dead_code.cc
@@ -20,7 +20,9 @@ bool IsBoolLit(const Expr& e, bool b) {
   if (const ConstantNode* c = e.as<ConstantNode>()) {
     if (c->is_scalar()) {
       auto dt = c->tensor_type()->dtype;
-      if (dt == UInt(8)) {
+      if (dt == Bool()) {
+        return *reinterpret_cast<const uint8_t*>(c->data->data) == b;
+      } else if (dt == UInt(8)) {
         return *reinterpret_cast<const uint8_t*>(c->data->data) == b;
       } else if (dt == UInt(16)) {
         return *reinterpret_cast<const uint16_t*>(c->data->data) == b;
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 72807985ced4..3f4d81b7e24f 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -20,7 +20,7 @@ namespace tvm {
 namespace relay {
 
 using namespace tvm::runtime;
-using Kind = TypeParamNode::Kind;
+using Kind = TypeVarNode::Kind;
 
 struct KindChecker : TypeVisitor<> {
   bool valid;
@@ -33,7 +33,7 @@ struct KindChecker : TypeVisitor<> {
       return tv->kind == k;
     }
 
-    if (const TypeParamNode *tp = t.as<TypeParamNode>()) {
+    if (const TypeVarNode *tp = t.as<TypeVarNode>()) {
       return tp->kind == k;
     }
 
diff --git a/src/relay/pass/let_list.h b/src/relay/pass/let_list.h
index 43b8bb8bba1d..904ceab36c3d 100644
--- a/src/relay/pass/let_list.h
+++ b/src/relay/pass/let_list.h
@@ -61,7 +61,7 @@ class LetList {
    *  \return a Var that hold the inserted expr.
    */
   Var Push(Expr expr) {
-    return Push(IncompleteTypeNode::make(TypeParamNode::kType), expr);
+    return Push(IncompleteTypeNode::make(TypeVarNode::kType), expr);
   }
 
   /*!
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
index 81f93cacaa80..b8eaa85a73d2 100644
--- a/src/relay/pass/type_functor.h
+++ b/src/relay/pass/type_functor.h
@@ -61,7 +61,7 @@ class TypeFunctor<R(const Type& n, Args...)> {
   // Functions that can be overriden by subclass
   virtual R VisitType_(const TensorTypeNode* op,
                        Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const TypeParamNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeVarNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const TypeConstraintNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const TypeRelationNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
@@ -79,7 +79,7 @@ class TypeFunctor<R(const Type& n, Args...)> {
     FType vtable;
     // Set dispatch
     RELAY_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(TypeParamNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeVarNode);
     RELAY_TYPE_FUNCTOR_DISPATCH(TypeConstraintNode);
     RELAY_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
     RELAY_TYPE_FUNCTOR_DISPATCH(TypeRelationNode);
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 1b30865eacb1..3e233274af2e 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -28,6 +28,39 @@
 
 namespace tvm {
 namespace relay {
+
+// Necessary deferred relation for TupleGetItem
+struct TupleGetItemAttrs : public tvm::AttrsNode<TupleGetItemAttrs> {
+  int index;
+
+  TVM_DECLARE_ATTRS(TupleGetItemAttrs, "relay.attrs.TupleGetItemAttrs") {
+    TVM_ATTR_FIELD(index);
+  }
+};
+
+bool TupleGetItemRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  if (types[0].as<IncompleteTypeNode>()) return false;
+  const auto* data = types[0].as<TupleTypeNode>();
+  CHECK(data != nullptr)
+      << "TupleGetItem expect input type to be TupleType "
+      << " get " << types[0] << " instead";
+  const auto* param = attrs.as<TupleGetItemAttrs>();
+  CHECK(param != nullptr);
+  CHECK_GE(param->index, 0);
+  CHECK_LT(param->index,  data->fields.size());
+  reporter->Assign(types[1], data->fields[param->index]);
+  return true;
+}
+
+TVM_REGISTER_NODE_TYPE(TupleGetItemAttrs);
+TVM_REGISTER_API("tvm.relay.type_relation.TupleGetItem")
+.set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
+    TupleGetItemRel);
+
 //
 // The inference algorithm can roughly be devided into three stages:
 // - Populate the constraints by visiting the expression (TypeInferencer.GetType)
@@ -38,8 +71,7 @@ namespace relay {
 class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
  public:
   // constructors
-  TypeInferencer()
-      : env_(EnvironmentNode::make({})) {
+  TypeInferencer() {
   }
   explicit TypeInferencer(Environment env)
       : env_(env) {
@@ -58,6 +90,8 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   std::unordered_map<Expr, Type, NodeHash, NodeEqual> type_map_;
   // The solver used by the inferencer.
   TypeSolver solver_;
+  // relation function
+  TypeRelationFn tuple_getitem_rel_;
   // Unify two types
   Type Unify(const Type& t1, const Type& t2, const Span& span) {
     // TODO(tqchen, jroesch): propagate span to solver
@@ -90,12 +124,14 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     if (op->type_annotation.defined()) {
       return op->type_annotation;
     } else {
-      return IncompleteTypeNode::make(TypeParamNode::kType);
+      return IncompleteTypeNode::make(TypeVarNode::kType);
     }
   }
 
   Type VisitExpr_(const GlobalVarNode* op) final {
     GlobalVar var = GetRef<GlobalVar>(op);
+    CHECK(env_.defined())
+        << "Cannot do type inference without a global variable";
     Expr e = env_->Lookup(var);
     return e->checked_type();
   }
@@ -116,17 +152,17 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   }
 
   Type VisitExpr_(const TupleGetItemNode* op) final {
-    // TODO(M.K.)
-    // handle case where field type is not known
-    Type tuple_type = GetType(op->tuple);
-    auto tuple_ty_node = tuple_type.as<TupleTypeNode>();
-    if (!tuple_ty_node) {
-      LOG(FATAL) << "only expressions with tuple types is accepted" << GetRef<TupleGetItem>(op);
-    }
-    if (static_cast<int>(tuple_ty_node->fields.size()) <= op->index) {
-      LOG(FATAL) << "tuple not big enough" << GetRef<TupleGetItem>(op);
+    if (!tuple_getitem_rel_.defined())  {
+      tuple_getitem_rel_ = TypeRelationFn(
+          EnvFunc::Get("tvm.relay.type_relation.TupleGetItem").node_);
     }
-    return tuple_ty_node->fields[op->index];
+    Type tuple_type = GetType(op->tuple);
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    auto attrs = make_node<TupleGetItemAttrs>();
+    attrs->index = op->index;
+    solver_.AddConstraint(TypeRelationNode::make(
+        tuple_getitem_rel_, {tuple_type, rtype}, 1, Attrs(attrs)));
+    return rtype;
   }
 
   Type VisitExpr_(const OpNode* op) final {
@@ -169,7 +205,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     for (size_t i = 0; i < op->type_params.size(); ++i) {
       if (!op->type_params[i].same_as(rel->args[i])) return Type();
     }
-    Type rtype = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
     arg_types.push_back(rtype);
     // we can do simple replacement here
     solver_.AddConstraint(TypeRelationNode::make(
@@ -179,7 +215,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
   // instantiate the function type with fresh
   FuncType Instantiate(const FuncTypeNode* fn_ty, Array<Type>* ty_args) {
-    tvm::Map<TypeParam, Type> subst_map;
+    tvm::Map<TypeVar, Type> subst_map;
 
     // Build a subsitituion map up from the function type and type arguments.
     // Eventually allow the type vars to be passed in.
@@ -196,7 +232,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     // This is a temporary work around to check recursive functions whose
     // return type is not yet known.
     if (!ret_type.defined()) {
-      ret_type = IncompleteTypeNode::make(TypeParamNode::Kind::kType);
+      ret_type = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
     }
     Type inst_ty = FuncTypeNode::make(fn_ty->arg_types,
                                       ret_type, {},
@@ -305,7 +341,6 @@ class TypeInferencer::Resolver : public ExprMutator {
     return AttachCheckedType(op);
   }
 
-
   Expr VisitExpr_(const FunctionNode* op) final {
     return AttachCheckedType(op);
   }
@@ -363,20 +398,21 @@ Expr TypeInferencer::Infer(Expr expr) {
   return Resolver(type_map_, &solver_).VisitExpr(expr);
 }
 
-Expr InferType(const Environment& env, const Expr& expr) {
+
+Expr InferType(const Expr& expr, const Environment& env) {
   return TypeInferencer(env).Infer(expr);
 }
 
-Expr InferType(const Environment& env,
-               const GlobalVar& var,
-               const Function& func) {
+Function InferType(const Function& func,
+                   const Environment& env,
+                   const GlobalVar& var) {
   Function func_copy = Function(make_node<FunctionNode>(*func.operator->()));
   func_copy->checked_type_ = func_copy->func_type_annotation();
   env->functions.Set(var, func_copy);
   Expr func_ret = TypeInferencer(env).Infer(func_copy);
   auto map_node = env->functions.CopyOnWrite();
   map_node->data.erase(var.node_);
-  return func_ret;
+  return Downcast<Function>(func_ret);
 }
 
 TVM_REGISTER_API("relay._ir_pass.infer_type")
diff --git a/src/relay/pass/type_subst.cc b/src/relay/pass/type_subst.cc
index 0b17fa0bc4f8..bffd779d1af2 100644
--- a/src/relay/pass/type_subst.cc
+++ b/src/relay/pass/type_subst.cc
@@ -10,13 +10,13 @@ namespace tvm {
 namespace relay {
 
 struct TypeSubstV : TypeMutator {
-  tvm::Map<TypeParam, Type> subst_map;
+  tvm::Map<TypeVar, Type> subst_map;
 
-  explicit TypeSubstV(tvm::Map<TypeParam, Type> subst_map)
+  explicit TypeSubstV(tvm::Map<TypeVar, Type> subst_map)
     : subst_map(subst_map) {}
 
-  Type VisitType_(const TypeParamNode* op) override {
-    auto id = GetRef<TypeParam>(op);
+  Type VisitType_(const TypeVarNode* op) override {
+    auto id = GetRef<TypeVar>(op);
     if (subst_map.find(id) != subst_map.end()) {
       return this->subst_map[id];
     } else {
@@ -25,12 +25,12 @@ struct TypeSubstV : TypeMutator {
   }
 };
 
-Type TypeSubst(const Type& type, const TypeParam& target, const Type& subst) {
+Type TypeSubst(const Type& type, const TypeVar& target, const Type& subst) {
   TypeSubstV ty_sub({ {target, subst} });
   return ty_sub.VisitType(type);
 }
 
-Type TypeSubst(const Type& type, tvm::Map<TypeParam, Type> subst_map) {
+Type TypeSubst(const Type& type, tvm::Map<TypeVar, Type> subst_map) {
   TypeSubstV ty_sub(subst_map);
   return ty_sub.VisitType(type);
 }
diff --git a/src/relay/pass/type_subst.h b/src/relay/pass/type_subst.h
index aee3209afb7a..808e3536ae30 100644
--- a/src/relay/pass/type_subst.h
+++ b/src/relay/pass/type_subst.h
@@ -11,8 +11,8 @@
 namespace tvm {
 namespace relay {
 
-Type TypeSubst(const Type& type, const TypeParam& target, const Type& subst);
-Type TypeSubst(const Type& type, tvm::Map<TypeParam, Type> subst_map);
+Type TypeSubst(const Type& type, const TypeVar& target, const Type& subst);
+Type TypeSubst(const Type& type, tvm::Map<TypeVar, Type> subst_map);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/type_visitor.h b/src/relay/pass/type_visitor.h
index 6468269686e8..c1b2c3e1a3ad 100644
--- a/src/relay/pass/type_visitor.h
+++ b/src/relay/pass/type_visitor.h
@@ -19,7 +19,7 @@ namespace relay {
  */
 template <typename... Args>
 struct TypeVisitor : ::tvm::relay::TypeFunctor<void(const Type& n, Args...)> {
-  void VisitType_(const TypeParamNode* op, Args... args) override {}
+  void VisitType_(const TypeVarNode* op, Args... args) override {}
 
   void VisitType_(const FuncTypeNode* op, Args... args) override {
     for (auto type_param : op->type_params) {
@@ -60,16 +60,16 @@ struct TypeMutator : TypeFunctor<Type(const Type& n)> {
     return TensorTypeNode::make(op->shape, op->dtype);
   }
 
-  Type VisitType_(const TypeParamNode* op) override {
-    return GetRef<TypeParam>(op);
+  Type VisitType_(const TypeVarNode* op) override {
+    return GetRef<TypeVar>(op);
   }
 
   Type VisitType_(const FuncTypeNode* op) override {
-    Array<TypeParam> type_params;
+    Array<TypeVar> type_params;
     for (auto type_param : op->type_params) {
       auto new_type_param = VisitType(type_param);
-      if (const TypeParamNode* tin = new_type_param.as<TypeParamNode>()) {
-        type_params.push_back(GetRef<TypeParam>(tin));
+      if (const TypeVarNode* tin = new_type_param.as<TypeVarNode>()) {
+        type_params.push_back(GetRef<TypeVar>(tin));
       } else {
         CHECK(false) << new_type_param << std::endl;
       }
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index c845995b2003..8ebac921203f 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -14,14 +14,14 @@ namespace relay {
 
 class FreeVar;
 class FreeTypeVar : private TypeVisitor<> {
-  std::unordered_set<TypeParam, NodeHash, NodeEqual> * free_vars;
-  std::unordered_set<TypeParam, NodeHash, NodeEqual> * bound_vars;
-  FreeTypeVar(std::unordered_set<TypeParam, NodeHash, NodeEqual> * free_vars,
-              std::unordered_set<TypeParam, NodeHash, NodeEqual> * bound_vars) :
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> * free_vars;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> * bound_vars;
+  FreeTypeVar(std::unordered_set<TypeVar, NodeHash, NodeEqual> * free_vars,
+              std::unordered_set<TypeVar, NodeHash, NodeEqual> * bound_vars) :
     free_vars(free_vars), bound_vars(bound_vars) { }
 
-  void VisitType_(const TypeParamNode* tp) final {
-    auto var = GetRef<TypeParam>(tp);
+  void VisitType_(const TypeVarNode* tp) final {
+    auto var = GetRef<TypeVar>(tp);
     if (bound_vars->count(var) == 0) {
       free_vars->insert(var);
     }
@@ -75,8 +75,8 @@ class FreeVar : public ExprVisitor {
  public:
   std::unordered_set<Var, NodeHash, NodeEqual> free_vars;
   std::unordered_set<Var, NodeHash, NodeEqual> bound_vars;
-  std::unordered_set<TypeParam, NodeHash, NodeEqual> free_types;
-  std::unordered_set<TypeParam, NodeHash, NodeEqual> bound_types;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> free_types;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> bound_types;
 
   void VisitType(const Type& t) final {
     FreeTypeVar(&free_types, &bound_types)(t);
@@ -89,16 +89,16 @@ tvm::Array<Var> FreeVariables(const Expr& e) {
   return tvm::Array<Var>(fv.free_vars.begin(), fv.free_vars.end());
 }
 
-tvm::Array<TypeParam> FreeTypeVariables(const Expr& e) {
+tvm::Array<TypeVar> FreeTypeVariables(const Expr& e) {
   FreeVar fv;
   fv.VisitExpr(e);
-  return tvm::Array<TypeParam>(fv.free_types.begin(), fv.free_types.end());
+  return tvm::Array<TypeVar>(fv.free_types.begin(), fv.free_types.end());
 }
 
-tvm::Array<TypeParam> FreeTypeVariables(const Type& t) {
+tvm::Array<TypeVar> FreeTypeVariables(const Type& t) {
   FreeVar fv;
   fv.VisitType(t);
-  return tvm::Array<TypeParam>(fv.free_types.begin(), fv.free_types.end());
+  return tvm::Array<TypeVar>(fv.free_types.begin(), fv.free_types.end());
 }
 
 TVM_REGISTER_API("relay._ir_pass.free_vars")
diff --git a/tests/python/relay/test_ir_builder.py b/tests/python/relay/test_ir_builder.py
deleted file mode 100644
index 165c66f17ac3..000000000000
--- a/tests/python/relay/test_ir_builder.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import numpy as np
-from tvm.relay.expr import Let, Constant
-from tvm.relay.ir_builder import IRBuilder
-
-def test_let():
-    b = IRBuilder()
-    x = b.let('x', 1)
-    b.ret(x)
-    prog, _ = b.get()
-    assert isinstance(prog, Let)
-    var = prog.var
-    value = prog.value
-    assert var.name_hint == 'x'
-    assert var == prog.body
-    assert isinstance(value, Constant)
-    assert value.data.asnumpy() == np.array(1)
-
-if __name__ == "__main__":
-    test_let()
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index e571f2a9c99a..fc9f30c9a61d 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -34,7 +34,7 @@ def test_tensor_type():
 
 
 def test_type_param():
-    tp = relay.TypeParam('name', relay.Kind.Type)
+    tp = relay.TypeVar('name', relay.Kind.Type)
     assert tp.kind == relay.Kind.Type
     # assert tp.span  # TODO allow us to set span
     str(tp)
@@ -56,7 +56,7 @@ def test_func_type():
 
 
 def test_tuple_type():
-    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tp = relay.TypeVar('tp', relay.Kind.Type)
     tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
     fields = tvm.convert([tp, tf, tt])
@@ -66,7 +66,7 @@ def test_tuple_type():
 
 
 def test_type_relation():
-    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tp = relay.TypeVar('tp', relay.Kind.Type)
     tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
     args = tvm.convert([tf, tt, tp])
@@ -173,7 +173,7 @@ def test_if():
 def test_tuple_get_item():
     tup = relay.Var("tuple")
     get = relay.TupleGetItem(tup, 1)
-    assert get.tuple == tup
+    assert get.tuple_value == tup
     assert get.index == 1
     str(get)
 
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 79a4fdd010c5..29814ecc5eb7 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -27,7 +27,7 @@ def test_env():
     z = relay.add(z, z)
     f = relay.Function([x, y], z)
     env = relay.Environment()
-    env.add("myf", f)
+    env["myf"] = f
     text = env.astext()
     assert "def @myf" in text
     assert "%1 = add(%0, %0) # ty=float32" in text
@@ -70,15 +70,18 @@ def test_let_if_scope():
     x = relay.var("x", "float32")
     y = relay.var("y", "float32")
     cond = relay.var("cond", "bool")
-    v1 = relay.var("v")
-    v2 = relay.var("v", "float32")
-    then_branch = relay.Let(
-        v1, relay.const(1, "float32"),
-        relay.Let(v2, x, relay.subtract(v1, v2)))
-    v3 = relay.var("v")
-    let2 = relay.Let(v3, y, v3)
-    else_branch = relay.add(let2, let2)
-    result = relay.If(cond, then_branch, else_branch)
+
+    sb = relay.ScopeBuilder()
+    with sb.if_scope(cond):
+        v1 = sb.let("v", relay.const(1, "float32"))
+        v2 = sb.let("v", x)
+        sb.ret(relay.subtract(v1, v2))
+    with sb.else_scope():
+        v3 = relay.var("v")
+        let2 = relay.Let(v3, y, v3)
+        sb.ret(relay.add(let2, let2))
+    result = sb.get()
+
     f = relay.Function([x, y, cond], result)
     text = f.astext()
     assert text.count("{") == 4
@@ -86,10 +89,17 @@ def test_let_if_scope():
     show(f.astext())
 
 
+def test_variable_name():
+    # avoid pure number even if the namehint is pure number
+    v1 = relay.var("1")
+    assert "%v1" in v1.astext()
+
+
 if __name__ == "__main__":
     do_print[0] = True
-    test_let_if_scope()
     test_func()
     test_env()
     test_meta_data()
     test_call_attrs()
+    test_let_if_scope()
+    test_variable_name()
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 914eafeb57a9..5afae6e872d1 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -1,282 +1,143 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay.ir_pass import infer_type
-from tvm.relay.ir_builder import IRBuilder, func_type
-from tvm.relay.ir_builder import scalar_type, convert, tensor_type
-from tvm.relay.env import Environment
 
-def assert_has_type(expr, typ, env=Environment({})):
-    checked_expr = infer_type(env, expr)
-    checked_type = checked_expr.checked_type
-    if checked_type != typ:
-        raise RuntimeError("Type mismatch %s vs %s" % (
-            checked_type, typ))
 
-def test_single_op():
+def test_unary_op():
     def check_single_op(opfunc):
-        "Program: fn (x : float32) { let t1 = f(x); t1 }"
-        b = IRBuilder()
-        with b.function(('x', 'float32')) as func:
-            x, = func.param_ids()
-            t1 = b.let('t1', opfunc(x))
-            b.ret(t1)
-        assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
-
-    for opfunc in [tvm.relay.log, tvm.relay.exp, tvm.relay.sqrt,
-                   tvm.relay.sigmoid, tvm.relay.tanh]:
+        tp = relay.TensorType((10, 4), "float32")
+        x = relay.var("x", tp)
+        y = opfunc(x)
+        # test printer
+        assert ("%0 = {}(%x)".format(y.op.name)) in y.astext()
+        # test type inference
+        assert relay.ir_pass.infer_type(y).checked_type == tp
+
+    for opfunc in [tvm.relay.log,
+                   tvm.relay.exp,
+                   tvm.relay.sqrt,
+                   tvm.relay.sigmoid,
+                   tvm.relay.tanh,
+                   relay.nn.relu]:
         check_single_op(opfunc)
 
 
+def test_binary_op():
+    def check_binary_op(opfunc):
+        n = tvm.var("n")
+        t1 = relay.TensorType((5, n, 5))
+        t2 = relay.TensorType((n, 1))
+        x = relay.var("x", t1)
+        y = relay.var("y", t2)
+        z = opfunc(x, y)
+        # test printer
+        assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
+        assert relay.ir_pass.infer_type(z).checked_type == t1
+
+    for opfunc in [relay.add,
+                   relay.subtract,
+                   relay.mod,
+                   relay.multiply,
+                   relay.divide]:
+        check_binary_op(opfunc)
+
 
 def test_expand_dims_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
-    # let's mimic a batch of sequences
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.expand_dims(x, axis=2))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, t, 1, 100), "float32")
+    x = relay.var("x", shape=(n, t, d))
+    y = relay.expand_dims(x, axis=2)
+    assert "axis=2" in y.astext()
+    checked = relay.ir_pass.infer_type(y)
+    assert checked.checked_type == relay.TensorType((n, t, 1, 100))
 
 
 def test_softmax():
-    ib = relay.ir_builder.IRBuilder()
     n, d = tvm.var("n"), tvm.var("d")
-    x = ib.param("x", relay.ty.TensorType((n, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.softmax(x, axis=1))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
+    x = relay.var("x", shape=(n, d))
+    y = relay.nn.softmax(x, axis=1)
+    assert "nn.softmax" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, d))
 
 
 def test_log_softmax():
-    ib = relay.ir_builder.IRBuilder()
     n, d = tvm.var("n"), tvm.var("d")
-    x = ib.param("x", relay.ty.TensorType((n, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.log_softmax(x, axis=1))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, d), "float32")
-
-def test_unary_op():
-    for op in [relay.exp,
-               relay.log,
-               relay.sqrt,
-               relay.sigmoid,
-               relay.nn.relu]:
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((10, 4), "int32"))
-        with ib.function(x) as func:
-            ib.ret(op(x))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((10, 4), "int32")
-
-
-def test_binary_op():
-    def check_binary_op(opfunc):
-        """
-        Program:
-            fn (x, y) {
-                return x <op> y;
-            }
-        """
-        b = IRBuilder()
-
-        x = b.param('x', tensor_type(5, 5, 5))
-        y = b.param('y', tensor_type(5, 5, 5))
-        with b.function(x, y) as func:
-            b.ret(opfunc(x, y))
-        b.ret(func)
-        prog, env = b.get()
-        ttype = tensor_type(5, 5, 5)
-        expected_ty = func_type([ttype, ttype], ttype)
-        assert_has_type(func.to_func(), expected_ty)
-
-    for opfunc in [relay.add, relay.subtract, relay.mod,
-                   relay.multiply, relay.divide]:
-        check_binary_op(opfunc)
-
-
-def test_binary_broadcast_op():
-    def check_binary_broadcast_op(opfunc):
-        """
-        Program:
-            fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
-                return x <op> y;
-            }
-        """
-        b = IRBuilder()
-        x = b.param('x', tensor_type(10, 4))
-        y = b.param('y', tensor_type(5, 10, 1))
-        with b.function(x, y) as func:
-            b.ret(opfunc(x, y))
-        b.ret(func)
-        prog, env = b.get()
-
-        expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
-                                tensor_type(5, 10, 4))
-        assert_has_type(func.to_func(), expected_ty)
-
-    for opfunc in [relay.add, relay.subtract, relay.mod,
-                   relay.multiply, relay.divide]:
-        check_binary_broadcast_op(opfunc)
+    x = relay.var("x", shape=(n, d))
+    y = relay.nn.log_softmax(x, axis=0)
+    assert "nn.log_softmax" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, d))
 
 
 def test_concatenate_infer_type():
-    ib = relay.ir_builder.IRBuilder()
-    n, t, d = tvm.var("n"), tvm.var("t"), 100
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x, y) as func:
-        ib.ret(relay.concatenate((x, y), axis=-1))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, t, 200), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x, y) as func:
-        ib.ret(relay.concatenate((x, y), axis=2))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, t, 200), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, t, d = tvm.var("n"), tvm.var("t"), 100
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    y = ib.param("y", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x, y) as func:
-        ib.ret(relay.concatenate((x, y), axis=1))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, t + t, 100), "float32")
-
-def test_lrn():
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c , h, w), "float32")
+    x = relay.var("x", shape=(n, t, d))
+    y = relay.var("y", shape=(n, t, d))
+    z = relay.concatenate((x, y), axis=-1)
+    assert "axis=" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t, 200))
 
+    z = relay.concatenate((x, y), axis=2)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t, 200))
 
-def test_l2_normalize():
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.l2_normalize(x, eps=0.001, axis=[1]))
-    ib.ret(func)
+    z = relay.concatenate((x, y), axis=1)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t + t, 100))
 
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c , h, w), "float32")
 
 def test_dropout():
-    ib = relay.ir_builder.IRBuilder()
-    input_ty = relay.ty.TensorType((3, 4, 5), "int8")
-    x = ib.param("x", input_ty)
-    with ib.function(x) as func:
-        ib.ret(relay.nn.dropout(x))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TupleType([input_ty, input_ty])
-
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
-    input_ty = relay.ty.TensorType((n, t, d), "float32")
-    x = ib.param("x", input_ty)
-    with ib.function(x) as func:
-        ib.ret(relay.nn.dropout(x, rate=0.75))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TupleType([input_ty, input_ty])
+    input_ty = relay.TensorType((n, t, d), "float32")
+    x = relay.var("x", input_ty)
+    y, _ = relay.nn.dropout(x, rate=0.75)
+    assert "rate=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == input_ty
 
 
 def test_batch_norm():
     # beta and gamma ignored
-    ib = relay.ir_builder.IRBuilder()
-    data = ib.param("data", relay.ty.TensorType((3, 2, 1), "float32"))
-    gamma = ib.param("gamma", relay.ty.TensorType((5,), "int8"))
-    beta = ib.param("beta", relay.ty.TensorType((12, 16), "int64"))
-    moving_mean = ib.param("moving_mean", relay.ty.TensorType((2,), "float32"))
-    moving_var = ib.param("moving_var", relay.ty.TensorType((2,), "float32"))
-    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
-        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
-                                   center=False, scale=False))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
-        relay.ty.TensorType((3, 2, 1), "float32"),
-        relay.ty.TensorType((2,), "float32"),
-        relay.ty.TensorType((2,), "float32")
+    data = relay.var("data", relay.TensorType((3, 2, 1)))
+    beta = relay.var("beta", relay.TensorType((2,)))
+    gamma = relay.var("gamma", relay.TensorType((2,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((2,)))
+    moving_var = relay.var("moving_var", relay.TensorType((2,)))
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y)
+    assert "center=" in yy.astext()
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
+        relay.TensorType((3, 2, 1), "float32"),
+        relay.TensorType((2,), "float32"),
+        relay.TensorType((2,), "float32")
     ]))
 
-    # with beta and gamma, different axis
-    ib = relay.ir_builder.IRBuilder()
-    data = ib.param("data", relay.ty.TensorType((3, 2, 1), "float32"))
-    gamma = ib.param("gamma", relay.ty.TensorType((3,), "float32"))
-    beta = ib.param("beta", relay.ty.TensorType((3,), "float32"))
-    moving_mean = ib.param("moving_mean", relay.ty.TensorType((3,), "float32"))
-    moving_var = ib.param("moving_var", relay.ty.TensorType((3,), "float32"))
-    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
-        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
-                                   axis=0, center=False, scale=False))
-    ib.ret(func)
+    beta = relay.var("beta", relay.TensorType((3,)))
+    gamma = relay.var("gamma", relay.TensorType((3,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((3,)))
+    moving_var = relay.var("moving_var", relay.TensorType((3,)))
 
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            axis=0, center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
         relay.ty.TensorType((3, 2, 1), "float32"),
         relay.ty.TensorType((3,), "float32"),
         relay.ty.TensorType((3,), "float32")
     ]))
 
     # axis=-1
-    ib = relay.ir_builder.IRBuilder()
-    data = ib.param("data", relay.ty.TensorType((1, 2, 3), "float32"))
-    gamma = ib.param("gamma", relay.ty.TensorType((3,), "float32"))
-    beta = ib.param("beta", relay.ty.TensorType((3,), "float32"))
-    moving_mean = ib.param("moving_mean", relay.ty.TensorType((3,), "float32"))
-    moving_var = ib.param("moving_var", relay.ty.TensorType((3,), "float32"))
-    with ib.function(data, gamma, beta, moving_mean, moving_var) as func:
-        ib.ret(relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
-                                   axis=-1, center=False, scale=False))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TupleType(tvm.convert([
+    data = relay.var("data", relay.TensorType((1, 2, 3)))
+    beta = relay.var("beta", relay.TensorType((3,)))
+    gamma = relay.var("gamma", relay.TensorType((3,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((3,)))
+    moving_var = relay.var("moving_var", relay.TensorType((3,)))
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            axis=-1, center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
         relay.ty.TensorType((1, 2, 3), "float32"),
         relay.ty.TensorType((3,), "float32"),
         relay.ty.TensorType((3,), "float32")
@@ -285,14 +146,10 @@ def test_batch_norm():
 
 if __name__ == "__main__":
     test_unary_op()
-    test_single_op()
+    test_binary_op()
     test_expand_dims_infer_type()
     test_concatenate_infer_type()
     test_softmax()
     test_log_softmax()
-    test_binary_op()
-    test_binary_broadcast_op()
-    test_lrn()
-    test_l2_normalize()
     test_dropout()
     test_batch_norm()
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 4f37d4893b66..2f32b316924a 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -3,162 +3,111 @@
 import tvm
 from tvm import relay
 
+
 def test_conv2d_infer_type():
     # symbolic in batch dimension
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 10, 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    w = ib.param("w", relay.ty.IncompleteType())
-
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x, w,
-                               kernel_size=(3, 3),
-                               padding=(1, 1),
-                               channels=2))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w")
+    y = relay.nn.conv2d(x, w,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=2)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
         (n, 2, 224, 224), "float32")
-    assert ftype.arg_types[1] == relay.ty.TensorType(
+    assert yy.args[1].checked_type == relay.TensorType(
         (2, 10, 3, 3), "float32")
 
     # infer by shape of w, mixed precision
-    ib = relay.ir_builder.IRBuilder()
+
     n, c, h, w = tvm.var("n"), 10, 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
-    w = ib.param("w", relay.ty.TensorType((2, 10, 3, 3), "int8"))
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x, w, out_dtype="int32"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8"))
+    y = relay.nn.conv2d(x, w, out_dtype="int32")
+    assert "out_dtype=\"int32\"" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
         (n, 2, 222, 222), "int32")
 
     # Infer with a different layout
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = 4, 32, 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
-    w = ib.param("w", relay.ty.IncompleteType())
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d(x, w,
-                               kernel_size=(3, 3),
-                               padding=(1, 1),
-                               channels=16,
-                               data_layout="NCHW4n4c",
-                               weight_layout="OIHW4o4i",
-                               out_dtype="int32"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    w = relay.var("w")
+    y = relay.nn.conv2d(x, w,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=16,
+                        data_layout="NCHW4n4c",
+                        weight_layout="OIHW4o4i",
+                        out_dtype="int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
         (1, 4, 224, 224, 4, 4), "int32")
-    assert ftype.arg_types[1] == relay.ty.TensorType(
+    assert yy.args[1].checked_type == relay.TensorType(
         (4, 8, 3, 3, 4, 4), "int8")
 
 def test_conv2d_transpose_infer_type():
     # symbolic in batch dimension
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 10, 10, 12
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    w = ib.param("w", relay.ty.IncompleteType())
-
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d_transpose(x, w,
-                                         kernel_size=(3, 3),
-                                         padding=(1, 1),
-                                         channels=15))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.IncompleteType())
+    y = relay.nn.conv2d_transpose(x, w,
+                                  kernel_size=(3, 3),
+                                  padding=(1, 1),
+                                  channels=15)
+    assert "channels=15" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
         (n, 15, 10, 12), "float32")
-    assert ftype.arg_types[1] == relay.ty.TensorType(
+    assert yy.args[1].checked_type == relay.TensorType(
         (10, 15, 3, 3), "float32")
 
     # infer by shape of w, mixed precision
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 10, 10, 12
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    w = ib.param("w", relay.ty.TensorType((12, 11, 5, 5), "float32"))
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.conv2d_transpose(x, w,
-                                         output_padding=(1, 1),
-                                         channels=11,
-                                         data_layout="NHWC"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32"))
+    y = relay.nn.conv2d_transpose(x, w,
+                                  output_padding=(1, 1),
+                                  channels=11,
+                                  data_layout="NHWC")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
         (n, 15, 15, 11), "float32")
 
 def test_upsampling_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, h*2, w*2), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR")
+    "method=\"BINLINEAR\"" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h*2, w*2), "float32")
     n, c = tvm.var("n"), tvm.var("c")
-    x = ib.param("x", relay.ty.TensorType((n, c, 100, 200), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, 200, 400), "float32")
+    x = relay.var("x", relay.TensorType((n, c, 100, 200), "float32"))
+    y = relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32")
 
 def _test_pool2d_infer_type(opfunc):
-    ib = relay.ir_builder.IRBuilder()
-    n, c, h, w = tvm.var("n"), 10, 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(opfunc(x, pool_size=(1, 1)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, 10, 224, 224), "float32")
-
-    ph, pw = tvm.var("ph"), tvm.var("pw")
-    sh, sw = tvm.var("sh"), tvm.var("sw")
-
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 10, 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(opfunc(x, pool_size=(ph, pw), strides=(sh, sw)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, 10, (((224 - ph)/sh) + 1), (((224 - pw)/sw) + 1)), "float32")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = opfunc(x, pool_size=(1, 1))
+    assert "pool_size=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, 10, 224, 224), "float32")
 
 def _test_global_pool2d_infer_type(opfunc):
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), tvm.var("c"), 224, 224
-    x = ib.param("x", relay.ty.TensorType((n, h, w, c), "float32"))
-    with ib.function(x) as func:
-        ib.ret(opfunc(x, layout="NHWC"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, 1, 1, c), "float32")
+    x = relay.var("x", relay.TensorType((n, h, w, c), "float32"))
+    y = opfunc(x, layout="NHWC")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, 1, 1, c), "float32")
 
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(opfunc(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, 1, 1), "float32")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = opfunc(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, 1, 1), "float32")
 
 def test_pool2d_infer_type():
     _test_pool2d_infer_type(relay.nn.max_pool2d)
@@ -167,101 +116,83 @@ def test_pool2d_infer_type():
     _test_global_pool2d_infer_type(relay.nn.global_avg_pool2d)
 
 def test_flatten_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
-    x = ib.param("x", relay.ty.TensorType((d1, d2, d3, d4), "float32"))
+    x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((d1, ((d2*d3)*d4)), "float32")
 
-    with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((d1, ((d2*d3)*d4)), "float32")
+    x = relay.var("x", relay.TensorType((3, 2, 4, 3), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((3, 24), "float32")
 
-    ib = relay.ir_builder.IRBuilder()
-    x = ib.param("x", relay.ty.TensorType((3, 2, 4, 3), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((3, 24), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
-    x = ib.param("x", relay.ty.TensorType((d1, 2, d3, 3), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.nn.batch_flatten(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((d1, ((2*d3)*3)), "float32")
+    x = relay.var("x", relay.TensorType((d1, 2, d3, 3), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32")
 
 def test_pad_infer_type():
     # entirely concrete case
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = 1, 2, 3, 4
-    t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
-    with ib.function(t) as func:
-        ib.ret(relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((3, 6, 9, 12), "float32")
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
+    "pad_width=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32")
 
     # some symbolic values
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
-    t = ib.param("t", relay.TensorType((n, c, h, w), "float32"))
-    with ib.function(t) as func:
-        ib.ret(relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
 
 def test_dense_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
-    w = ib.param("w", relay.ty.TensorType((w, 2), "float32"))
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.TensorType((w, 2), "float32"))
+    y = relay.nn.dense(x, w, units=2)
+    "units=2" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
 
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.dense(x, w, units=2))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, h, 2), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
     wh, ww = tvm.var("wh"), tvm.var("ww")
-    w = ib.param("w", relay.ty.TensorType((wh, ww), "float32"))
+    w = relay.var("w", relay.TensorType((wh, ww), "float32"))
+    y = relay.nn.dense(x, w)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32")
 
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.dense(x, w))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, h, ww), "float32")
-
-    ib = relay.ir_builder.IRBuilder()
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.IncompleteType())
+    y = relay.nn.dense(x, w, units=2)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
+
 
-    w = ib.param("w", relay.ty.IncompleteType())
+def test_lrn():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", shape=(n, c , h, w))
+    y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75)
+    "alpha=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c , h, w))
 
-    with ib.function(x, w) as func:
-        ib.ret(relay.nn.dense(x, w, units=2))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, h, 2), "float32")
+def test_l2_normalize():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", shape=(n, c , h, w))
+    y = relay.nn.l2_normalize(x, eps=0.001, axis=[1])
+    "axis=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c , h, w))
 
 
 if __name__ == "__main__":
+    test_lrn()
+    test_l2_normalize()
     test_conv2d_infer_type()
     test_pool2d_infer_type()
     test_upsampling_infer_type()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 0605ac02339b..d1bff2940457 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -3,154 +3,92 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay.ir_pass import infer_type
-from tvm.relay.ir_builder import IRBuilder, func_type
-from tvm.relay.env import Environment
 from nose.tools import raises
 
 def test_zeros_ones():
     for op in [relay.zeros, relay.ones]:
-        ib = relay.ir_builder.IRBuilder()
-        with ib.function() as func:
-            ib.ret(op((124, 50), "float64"))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((124, 50), "float64")
-
+        y = op(shape=(124, 50), dtype="float64")
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType((124, 50), "float64")
 
 def test_unary_identity():
-    for op in [relay.zeros_like, relay.ones_like]:
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((8, 9, 4), "int32"))
-        with ib.function(x) as func:
-            ib.ret(op(x))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((8, 9, 4), "int32")
+    for op in [relay.zeros_like,
+               relay.ones_like,
+               relay.ceil,
+               relay.floor,
+               relay.trunc,
+               relay.round,
+               relay.abs,
+               relay.copy,
+               relay.negative]:
+        x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+        y = op(x)
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType((8, 9, 4), "float32")
 
 
 def test_clip_type():
-    ib = relay.ir_builder.IRBuilder()
-    a = ib.param("a", relay.TensorType((10, 4), "float32"))
-    with ib.function(a) as func:
-        ib.ret(relay.clip(a, 1., 4.))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((10, 4), "float32")
-
-
-def test_copy_infer_type():
-    ib = relay.ir_builder.IRBuilder()
-    n, t, d = tvm.var("n"), tvm.var("t"), 100
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.copy(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (n, t, 100), "float32")
+    a = relay.var("a", relay.TensorType((10, 4), "float32"))
+    y = relay.clip(a, 1., 4.)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((10, 4), "float32")
 
 
 def test_transpose_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = tvm.var("n"), tvm.var("t"), 100
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.transpose(x, axes=(1, 0, 2)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.transpose(x, axes=(1, 0, 2))
+    "axes=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
         (t, n, 100), "float32")
 
 
-def test_squeeze_default_axes_infer_type():
-    ib = relay.ir_builder.IRBuilder()
+def test_squeeze_infer_type():
     n, t, d = 1, 4, 1
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.squeeze(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (4,), "float32")
-
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x, axes=(2,))
+    assert "axes=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (1, 4), "float32")
 
-def test_squeeze_axes_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = 1, 4, 1
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.squeeze(x, axes=(2,)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
-        (1, 4), "float32")
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x)
+    assert "axes=" not in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (4,), "float32")
 
 
 @raises(tvm._ffi.base.TVMError)
 def test_squeeze_bad_axes_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, t, d = 1, 4, 1
-    x = ib.param("x", relay.ty.TensorType((n, t, d), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.squeeze(x, axes=(1,)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x, axes=(1,))
+    yy = relay.ir_pass.infer_type(y)
 
 
 def test_reshape_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, t, d1, d2 = tvm.var("n"), tvm.var("t"), 100, 20
-    x = ib.param("x", relay.ty.TensorType((n, t, d1, d2), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.reshape(x, newshape=(n, t, 2000)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, t, d1, d2), "float32"))
+    y = relay.reshape(x, newshape=(n, t, 2000))
+    assert "newshape=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
         (n, t, 2000), "float32")
 
 
-def assert_has_type(expr, typ, env=Environment({})):
-    checked_expr = infer_type(env, expr)
-    checked_type = checked_expr.checked_type
-    if checked_type != typ:
-        raise RuntimeError("Type mismatch %s vs %s" % (
-            checked_type, typ))
-
-def test_single_op():
-    def check_single_op(opfunc):
-        "Program: fn (x : float32) { let t1 = f(x); t1 }"
-        b = IRBuilder()
-        with b.function(('x', 'float32')) as func:
-            x, = func.param_ids()
-            t1 = b.let('t1', opfunc(x))
-            b.ret(t1)
-        assert_has_type(func.to_func(), func_type(['float32'], 'float32'))
-
-    for opfunc in [tvm.relay.ceil, tvm.relay.floor, tvm.relay.trunc,
-                   tvm.relay.round, tvm.relay.abs, tvm.relay.negative]:
-        check_single_op(opfunc)
 
 def test_take_infer_type():
     def verify_take(dshape, indices_shape, oshape, axis=None):
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.ty.TensorType(dshape, "float32"))
-        indices = ib.param("indices", relay.ty.TensorType(indices_shape, "int32"))
-        with ib.function(x, indices) as func:
-            ib.ret(relay.take(x, indices, axis=axis))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.ty.TensorType(oshape, "float32")
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
+        y = relay.take(x, indices, axis=axis)
+        y.astext()
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType(oshape, "float32")
 
     d1, d2, d3 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3")
     d4, d5, d6 = tvm.var("d4"), tvm.var("d5"), tvm.var("d6")
@@ -164,73 +102,52 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
 
 def test_full():
     # default settings: match input dtype
-    ib = relay.ir_builder.IRBuilder()
-    x = ib.param("x", relay.TensorType((), "int8"))
-    with ib.function(x) as func:
-        ib.ret(relay.full(x, ()))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((), "int8")
+    x = relay.var("x", relay.TensorType((), "int8"))
+    y = relay.full(x, ())
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((), "int8")
 
     # change the shape and dtype
-    ib = relay.ir_builder.IRBuilder()
-    x = ib.param("x", relay.TensorType((), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.full(x, (1, 2), "int8"))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((1, 2), "int8")
+    x = relay.var("x", relay.TensorType((), "float32"))
+    y = relay.full(x, (1, 2), "int8")
+    "shape=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((1, 2), "int8")
 
 
 def test_full_like():
     # concrete shape
-    ib = relay.ir_builder.IRBuilder()
-    base = ib.param("base", relay.TensorType((1, 2, 3), "float32"))
-    fill = ib.param("fill", relay.TensorType((), "float32"))
-    with ib.function(base, fill) as func:
-        ib.ret(relay.full_like(base, fill))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((1, 2, 3), "float32")
+    base = relay.var("base", relay.TensorType((1, 2, 3), "float32"))
+    fill = relay.var("fill", relay.TensorType((), "float32"))
+    y = relay.full_like(base, fill)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((1, 2, 3), "float32")
 
     # symbolic shape
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
-    base = ib.param("base", relay.TensorType((n, c, h, w), "float32"))
-    fill = ib.param("fill", relay.TensorType((), "float32"))
-    with ib.function(base, fill) as func:
-        ib.ret(relay.full_like(base, fill))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((n, c, h, w), "float32")
+    base = relay.var("base", relay.TensorType((n, c, h, w), "float32"))
+    fill = relay.var("fill", relay.TensorType((), "float32"))
+    y = relay.full_like(base, fill)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
 def test_infer_type_leaky_relu():
-   ib = relay.ir_builder.IRBuilder()
    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-   x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
-   with ib.function(x) as func:
-       ib.ret(relay.nn.leaky_relu(x, alpha=0.1))
-   ib.ret(func)
-   func = relay.ir_pass.infer_type(ib.env, func.to_func())
-   ftype = func.checked_type
-   assert ftype.ret_type == relay.ty.TensorType((n, c, h, w), "float32")
+   x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+   y = relay.nn.leaky_relu(x, alpha=0.1)
+   "alpha=0.1" in y.astext()
+   yy = relay.ir_pass.infer_type(y)
+   assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
 if __name__ == "__main__":
-    test_single_op()
     test_zeros_ones()
     test_unary_identity()
     test_clip_type()
-    test_copy_infer_type()
     test_transpose_infer_type()
     test_reshape_infer_type()
     test_take_infer_type()
     test_full()
     test_full_like()
     test_infer_type_leaky_relu()
-    test_squeeze_axes_infer_type()
-    test_squeeze_default_axes_infer_type()
+    test_squeeze_infer_type()
+    test_squeeze_bad_axes_infer_type()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index dea300422e45..c2b685affab4 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -1,66 +1,24 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay.ir_pass import infer_type
-from tvm.relay.ir_builder import IRBuilder, func_type
-from tvm.relay.ir_builder import scalar_type, convert, tensor_type
-from tvm.relay.env import Environment
-
-def assert_has_type(expr, typ, env=Environment({})):
-    checked_expr = infer_type(env, expr)
-    checked_type = checked_expr.checked_type
-    if checked_type != typ:
-        raise RuntimeError("Type mismatch %s vs %s" % (
-            checked_type, typ))
 
 
 def test_binary_op():
     def check_binary_op(opfunc):
-        """
-        Program:
-            fn (x, y) {
-                return x <op> y;
-            }
-        """
-        b = IRBuilder()
-
-        x = b.param('x', tensor_type(5, 5, 5))
-        y = b.param('y', tensor_type(5, 5, 5))
-        with b.function(x, y) as func:
-            b.ret(opfunc(x, y))
-        b.ret(func)
-        prog, env = b.get()
-        ttype = tensor_type(5, 5, 5)
-        expected_ty = func_type([ttype, ttype], ttype)
-        assert_has_type(func.to_func(), expected_ty)
+        n = tvm.var("n")
+        t1 = relay.TensorType((5, n, 5))
+        t2 = relay.TensorType((n, 1))
+        x = relay.var("x", t1)
+        y = relay.var("y", t2)
+        z = opfunc(x, y)
+        # test printer
+        assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
+        assert relay.ir_pass.infer_type(z).checked_type == t1
 
     for opfunc in [relay.pow]:
         check_binary_op(opfunc)
 
 
-def test_binary_broadcast_op():
-    def check_binary_broadcast_op(opfunc):
-        """
-        Program:
-            fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
-                return x <op> y;
-            }
-        """
-        b = IRBuilder()
-        x = b.param('x', tensor_type(10, 4))
-        y = b.param('y', tensor_type(5, 10, 1))
-        with b.function(x, y) as func:
-            b.ret(opfunc(x, y))
-        b.ret(func)
-        prog, env = b.get()
-
-        expected_ty = func_type([tensor_type(10, 4), tensor_type(5, 10, 1)],
-                                tensor_type(5, 10, 4))
-        assert_has_type(func.to_func(), expected_ty)
-
-    for opfunc in [relay.pow]:
-        check_binary_broadcast_op(opfunc)
-
 def test_cmp_type():
     for op in (relay.greater,
                relay.greater_equal,
@@ -68,138 +26,59 @@ def test_cmp_type():
                relay.less_equal,
                relay.equal,
                relay.not_equal):
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((10, 4), "float32"))
-        y = ib.param("y", relay.TensorType((5, 10, 1), "float32"))
-        with ib.function(x, y) as func:
-            ib.ret(op(x, y))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((5, 10, 4), "uint1")
+        x = relay.var("x", relay.TensorType((10, 4), "float32"))
+        y = relay.var("y", relay.TensorType((5, 10, 1), "float32"))
+        z = op(x, y)
+        z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType((5, 10, 4), "bool")
+
 
-def test_binary_broadcast():
+def test_binary_int_broadcast():
     for op in [relay.right_shift,
                relay.left_shift,
                relay.maximum,
                relay.minimum]:
-        ib = relay.ir_builder.IRBuilder()
-        x = ib.param("x", relay.TensorType((10, 4), "int32"))
-        y = ib.param("y", relay.TensorType((5, 10, 1), "int32"))
-        with ib.function(x, y) as func:
-            ib.ret(op(x, y))
-        ib.ret(func)
-        func = relay.ir_pass.infer_type(ib.env, func.to_func())
-        ftype = func.checked_type
-        assert ftype.ret_type == relay.TensorType((5, 10, 4), "int32")
-
-def test_argmax():
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmax(x, axis=(1,)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, h, w), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmax(x, axis=(2,), keepdims=True))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c , 1, w), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmax(x, axis=(2,), keepdims=True, exclude=True))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
-
-def test_argmin():
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmax(x, axis=(1,)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, h, w), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmin(x, axis=(2,), keepdims=True))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c , 1, w), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmin(x, axis=(2,), keepdims=True, exclude=True))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmin(x, axis=(2,1), keepdims=True, exclude=True))
-    ib.ret(func)
-
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((1, c , h, 1), "int32")
-
-    ib = relay.ir_builder.IRBuilder()
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c , h, w), "float32"))
-    with ib.function(x) as func:
-        ib.ret(relay.argmin(x, axis=None, keepdims=True, exclude=True))
-    ib.ret(func)
+        x = relay.var("x", relay.TensorType((10, 4), "int32"))
+        y = relay.var("y", relay.TensorType((5, 10, 1), "int32"))
+        z = op(x, y)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
+
+
+def test_arg_reduce():
+    for op in [relay.argmax, relay.argmin]:
+        n, c , h, w = 10, 20, 3, 4
+        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
+        z = relay.argmax(x, axis=(1,))
+        "axis="  in z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType((n, h, w), "int32")
+        n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
+        z = relay.argmax(x, axis=(2,), keepdims=True)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType((n, c , 1, w), "int32")
+
+        n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
+        z = relay.argmax(x, axis=(2,), keepdims=True, exclude=True)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
 
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((1, 1 , 1, 1), "int32")
 
 def test_where():
-    ib = relay.ir_builder.IRBuilder()
-    cond = ib.param("cond", relay.TensorType((3, 4), "float32"))
-    x = ib.param("x", relay.TensorType((3, 4), "float32"))
-    y = ib.param("y", relay.TensorType((3, 4), "float32"))
-    with ib.function(cond, x, y) as func:
-        ib.ret(relay.where(cond, x, y))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.TensorType((3, 4), "float32")
+    cond = relay.var("cond", relay.TensorType((3, 4), "float32"))
+    x = relay.var("x", relay.TensorType((3, 4), "float32"))
+    y = relay.var("y", relay.TensorType((3, 4), "float32"))
+    z = relay.where(cond, x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((3, 4), "float32")
 
 
 if __name__ == "__main__":
     test_binary_op()
-    test_binary_broadcast_op()
     test_cmp_type()
-    test_binary_broadcast()
+    test_binary_int_broadcast()
     test_where()
-    test_multibox_prior()
-    test_argmax()
-    test_argmin()
+    test_arg_reduce()
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index e04bd9bab91a..4e554cd0cf81 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -4,26 +4,18 @@
 from tvm import relay
 
 def test_resize_infer_type():
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
     th, tw = tvm.var("th"), tvm.var("tw")
+    z = relay.image.resize(x, (th, tw))
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, c, th, tw), "int8")
 
-    with ib.function(x) as func:
-        ib.ret(relay.image.resize(x, (th, tw)))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, th, tw), "int8")
-
-    ib = relay.ir_builder.IRBuilder()
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "int8"))
-    with ib.function(x) as func:
-        ib.ret(relay.image.resize(x, (100, 200), "NCHW", "BILINEAR", False))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType((n, c, 100, 200), "int8")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    z= relay.image.resize(x, (100, 200), "NCHW", "BILINEAR", False)
+    assert "size=" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
 
 
 
@@ -34,29 +26,21 @@ def test_multibox_prior():
     offsets = (0.2, 0.3)
     clip = True
 
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 3, 56, 56
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
 
-    with ib.function(x) as func:
-        ib.ret(relay.vision.multibox_prior(x, sizes, ratios,
-                                           steps, offsets, clip))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    z = relay.vision.multibox_prior(x, sizes, ratios,
+                                    steps, offsets, clip)
+    assert "sizes=" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType(
         (1, h * w * (len(sizes) + len(ratios) - 1), 4), "float32")
 
-    ib = relay.ir_builder.IRBuilder()
     n, c, h, w = tvm.var("n"), 24, 32, 32
-    x = ib.param("x", relay.ty.TensorType((n, c, h, w), "float32"))
-
-    with ib.function(x) as func:
-        ib.ret(relay.vision.multibox_prior(x))
-    ib.ret(func)
-    func = relay.ir_pass.infer_type(ib.env, func.to_func())
-    ftype = func.checked_type
-    assert ftype.ret_type == relay.ty.TensorType(
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    z = relay.vision.multibox_prior(x)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType(
         (1, h * w, 4), "float32")
 
 
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 2bfbc7f10a40..7b8a0bbdfa49 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -2,7 +2,6 @@
 import numpy as np
 from tvm import relay
 from tvm.relay.ir_pass import alpha_equal
-from tvm.relay.ir_builder import convert
 
 def test_tensor_type_alpha_equal():
     t1 = relay.TensorType((3, 4), "float32")
@@ -29,9 +28,9 @@ def test_incomplete_type_alpha_equal():
 
 
 def test_type_param_alpha_equal():
-    t1 = relay.TypeParam("v1", relay.Kind.Type)
-    t2 = relay.TypeParam("v2", relay.Kind.Shape)
-    t3 = relay.TypeParam("v3", relay.Kind.Type)
+    t1 = relay.TypeVar("v1", relay.Kind.Type)
+    t2 = relay.TypeVar("v2", relay.Kind.Shape)
+    t3 = relay.TypeVar("v3", relay.Kind.Type)
 
     # only pointer equality and eq_map allow equal params
     assert t1 == t1
@@ -54,10 +53,10 @@ def test_func_type_alpha_equal():
     t1 = relay.TensorType((1, 2), "float32")
     t2 = relay.TensorType((1, 2, 3), "float32")
 
-    tp1 = relay.TypeParam("v1", relay.Kind.Type)
-    tp2 = relay.TypeParam("v2", relay.Kind.Type)
-    tp3 = relay.TypeParam("v3", relay.Kind.Shape)
-    tp4 = relay.TypeParam("v3", relay.Kind.Shape)
+    tp1 = relay.TypeVar("v1", relay.Kind.Type)
+    tp2 = relay.TypeVar("v2", relay.Kind.Type)
+    tp3 = relay.TypeVar("v3", relay.Kind.Shape)
+    tp4 = relay.TypeVar("v3", relay.Kind.Shape)
 
     broadcast = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
     identity = tvm.get_env_func("tvm.relay.type_relation.Identity")
@@ -113,8 +112,8 @@ def test_func_type_alpha_equal():
 def test_tuple_type_alpha_equal():
     t1 = relay.TensorType((1, 2, 3), "float32")
     t2 = relay.TensorType((1, 2, 3, 4), "float32")
-    tp1 = relay.TypeParam("v1", relay.Kind.Type)
-    tp2 = relay.TypeParam("v2", relay.Kind.Type)
+    tp1 = relay.TypeVar("v1", relay.Kind.Type)
+    tp2 = relay.TypeVar("v2", relay.Kind.Type)
 
     tup1 = relay.TupleType(tvm.convert([t1, t2, tp1]))
     tup2 = relay.TupleType(tvm.convert([t1, t2, tp1]))
@@ -164,11 +163,11 @@ def test_type_relation_alpha_equal():
 
 
 def test_constant_alpha_equal():
-    x = convert(1)
-    y = convert(2)
+    x = relay.const(1)
+    y = relay.const(2)
     assert alpha_equal(x, x)
     assert not alpha_equal(x, y)
-    assert alpha_equal(x, convert(1))
+    assert alpha_equal(x, relay.const(1))
 
 
 def test_var_alpha_equal():
@@ -180,9 +179,9 @@ def test_var_alpha_equal():
     assert not alpha_equal(v1, v2)
 
     # let node allows for setting the eq_map
-    l1 = relay.Let(v1, convert(1), v1)
-    l2 = relay.Let(v2, convert(1), v2)
-    l3 = relay.Let(v1, convert(1), v2)
+    l1 = relay.Let(v1, relay.const(1), v1)
+    l2 = relay.Let(v2, relay.const(1), v2)
+    l3 = relay.Let(v1, relay.const(1), v2)
 
     assert alpha_equal(l1, l2)
     assert not alpha_equal(l1, l3)
@@ -223,34 +222,34 @@ def test_tuple_alpha_equal():
     # unit value is a valid tuple
     assert alpha_equal(relay.Tuple([]), relay.Tuple([]))
 
-    tup = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)])])
-    same = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)])])
+    tup = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
+    same = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
 
     assert alpha_equal(tup, same)
 
     # use the eq_map
     let_tup = relay.Let(v1, tup, v1)
-    let_mapped = relay.Let(v2, relay.Tuple([v2, convert(2), convert(3),
-                                            relay.Tuple([convert(4)])]),
+    let_mapped = relay.Let(v2, relay.Tuple([v2, relay.const(2), relay.const(3),
+                                            relay.Tuple([relay.const(4)])]),
                            v2)
     assert alpha_equal(let_tup, let_mapped)
 
-    more_fields = relay.Tuple([v1, convert(2), convert(3), relay.Tuple([convert(4)]), v2])
+    more_fields = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)]), v2])
     assert not alpha_equal(tup, more_fields)
 
-    fewer_fields = relay.Tuple([v1, convert(2), convert(3)])
+    fewer_fields = relay.Tuple([v1, relay.const(2), relay.const(3)])
     assert not alpha_equal(tup, fewer_fields)
 
-    different_end = relay.Tuple([v1, convert(2), convert(3),
-                           relay.Tuple([convert(5)])])
+    different_end = relay.Tuple([v1, relay.const(2), relay.const(3),
+                           relay.Tuple([relay.const(5)])])
     assert not alpha_equal(tup, different_end)
 
-    different_start = relay.Tuple([v2, convert(2), convert(3),
-                                 relay.Tuple([convert(4)])])
+    different_start = relay.Tuple([v2, relay.const(2), relay.const(3),
+                                 relay.Tuple([relay.const(4)])])
     assert not alpha_equal(tup, different_start)
 
-    longer_at_end = relay.Tuple([v1, convert(2), convert(3),
-                                 relay.Tuple([convert(4), convert(5)])])
+    longer_at_end = relay.Tuple([v1, relay.const(2), relay.const(3),
+                                 relay.Tuple([relay.const(4), relay.const(5)])])
     assert not alpha_equal(tup, longer_at_end)
 
 
@@ -273,10 +272,10 @@ def test_function_alpha_equal():
     v4 = relay.Var("v4", tt2)
     vret = relay.Constant(tvm.nd.array(np.ones(1)))
 
-    tp1 = relay.TypeParam("tp1", relay.Kind.Type)
-    tp2 = relay.TypeParam("tp2", relay.Kind.Type)
-    tp3 = relay.TypeParam("tp3", relay.Kind.Shape)
-    tp4 = relay.TypeParam("tp4", relay.Kind.Shape)
+    tp1 = relay.TypeVar("tp1", relay.Kind.Type)
+    tp2 = relay.TypeVar("tp2", relay.Kind.Type)
+    tp3 = relay.TypeVar("tp3", relay.Kind.Shape)
+    tp4 = relay.TypeVar("tp4", relay.Kind.Shape)
 
     basic_args = [relay.Var("v3", tt1), relay.Var("v4", tt2)]
     basic_tps = [tp1, tp2]
@@ -346,11 +345,11 @@ def test_call_alpha_equal():
     tt1 = relay.TensorType((1, 2, 3), "float32")
     tt2 = relay.TensorType((), "int8")
 
-    basic_args = [convert(1), convert(2), v2, relay.Tuple([])]
+    basic_args = [relay.const(1), relay.const(2), v2, relay.Tuple([])]
 
     # manually writing out args to ensure that args does not rely on
     # pointer equality
-    call = relay.Call(v1, [convert(1), convert(2), v2, relay.Tuple([])],
+    call = relay.Call(v1, [relay.const(1), relay.const(2), v2, relay.Tuple([])],
                       attr1, [tt1])
     same = relay.Call(v1, basic_args, attr1, [tt1])
     assert alpha_equal(call, same)
@@ -358,19 +357,19 @@ def test_call_alpha_equal():
     different_fn = relay.Call(v2, basic_args, attr1, [tt1])
     assert not alpha_equal(call, different_fn)
 
-    fewer_args = relay.Call(v1, [convert(1), convert(2), v2], attr1, [tt1])
+    fewer_args = relay.Call(v1, [relay.const(1), relay.const(2), v2], attr1, [tt1])
     assert not alpha_equal(call, fewer_args)
 
-    reordered_args = relay.Call(v1, [convert(2), convert(1),
+    reordered_args = relay.Call(v1, [relay.const(2), relay.const(1),
                                      relay.Tuple([]), v2], attr1, [tt1])
     assert not alpha_equal(call, reordered_args)
 
-    different_args = relay.Call(v1, [convert(1), convert(2), convert(3)],
+    different_args = relay.Call(v1, [relay.const(1), relay.const(2), relay.const(3)],
                                 attr1, [tt1])
     assert not alpha_equal(call, different_args)
 
-    more_args = relay.Call(v1, [convert(1), convert(2), v2, relay.Tuple([]),
-                                convert(3), convert(4)], attr1, [tt1])
+    more_args = relay.Call(v1, [relay.const(1), relay.const(2), v2, relay.Tuple([]),
+                                relay.const(3), relay.const(4)], attr1, [tt1])
     assert not alpha_equal(call, more_args)
 
     different_attrs = relay.Call(v1, basic_args, attr2, [tt1])
@@ -394,27 +393,27 @@ def test_let_alpha_equal():
     v2 = relay.Var("v2")
     v3 = relay.Var("v3")
 
-    let = relay.Let(v1, convert(2), v1)
-    mapped = relay.Let(v2, convert(2), v2)
+    let = relay.Let(v1, relay.const(2), v1)
+    mapped = relay.Let(v2, relay.const(2), v2)
     assert alpha_equal(let, mapped)
 
-    mismatched_var = relay.Let(v2, convert(2), v3)
+    mismatched_var = relay.Let(v2, relay.const(2), v3)
     assert not alpha_equal(let, mismatched_var)
 
-    different_value = relay.Let(v2, convert(3), v2)
+    different_value = relay.Let(v2, relay.const(3), v2)
     assert not alpha_equal(let, different_value)
 
-    different_body = relay.Let(v2, convert(3), convert(12))
+    different_body = relay.Let(v2, relay.const(3), relay.const(12))
     assert not alpha_equal(let, different_body)
 
     # specified types must match
 
-    let_with_type = relay.Let(v1_wtype, convert(2), v1_wtype)
-    same_type = relay.Let(v1_wtype, convert(2), v1_wtype)
+    let_with_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
+    same_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
     assert alpha_equal(let_with_type, same_type)
     assert not alpha_equal(let, let_with_type)
     v2 = relay.Var("v1", tt2)
-    different_type = relay.Let(v2, convert(2), v2)
+    different_type = relay.Let(v2, relay.const(2), v2)
     assert not alpha_equal(let_with_type, different_type)
 
 
@@ -422,17 +421,17 @@ def test_if_alpha_equal():
     v1 = relay.Var("v1")
     v2 = relay.Var("v2")
 
-    if_sample = relay.If(v1, convert(1), relay.Tuple([convert(2), convert(3)]))
-    same = relay.If(v1, convert(1), relay.Tuple([convert(2), convert(3)]))
+    if_sample = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
+    same = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
     assert alpha_equal(if_sample, same)
 
-    different_cond = relay.If(v2, convert(1), relay.Tuple([convert(2), convert(3)]))
+    different_cond = relay.If(v2, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
     assert not alpha_equal(if_sample, different_cond)
 
-    different_true = relay.If(v1, convert(2), relay.Tuple([convert(2), convert(3)]))
+    different_true = relay.If(v1, relay.const(2), relay.Tuple([relay.const(2), relay.const(3)]))
     assert not alpha_equal(if_sample, different_true)
 
-    different_false = relay.If(v1, convert(1), relay.Tuple([]))
+    different_false = relay.If(v1, relay.const(1), relay.Tuple([]))
     assert not alpha_equal(if_sample, different_false)
 
 
diff --git a/tests/python/relay/test_pass_check_kind.py b/tests/python/relay/test_pass_check_kind.py
index 314c8c8b7992..5ead501157c5 100644
--- a/tests/python/relay/test_pass_check_kind.py
+++ b/tests/python/relay/test_pass_check_kind.py
@@ -4,7 +4,7 @@
 
 def test_tuple_kind():
     # only contain type kinds
-    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tp = relay.TypeVar('tp', relay.Kind.Type)
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
     tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
     fields = tvm.convert([tp, tf, tt])
@@ -15,8 +15,8 @@ def test_tuple_kind():
 
 def test_func_kind():
     # only contain type kinds
-    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
-    tp2 = relay.TypeParam('tp2', relay.Kind.Type)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Type)
 
     shape = tvm.convert([1, 2, 3])
     dtype = 'float32'
@@ -35,7 +35,7 @@ def test_func_kind():
 
 def test_relation_kind():
     # only have type kinds for arguments
-    tp = relay.TypeParam('tp', relay.Kind.Type)
+    tp = relay.TypeVar('tp', relay.Kind.Type)
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
     tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
     args = tvm.convert([tf, tt, tp])
@@ -45,9 +45,9 @@ def test_relation_kind():
 
 
 def test_invalid_tuple_kind():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
-    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
-    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
     fields = tvm.convert([tp1, tp2, tp3])
 
     tup_ty = relay.TupleType(fields)
@@ -55,9 +55,9 @@ def test_invalid_tuple_kind():
 
 
 def test_invalid_func_kind():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
-    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
-    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
 
     type_params = tvm.convert([tp1, tp2, tp3])
     type_constraints = tvm.convert([])
@@ -69,9 +69,9 @@ def test_invalid_func_kind():
 
 
 def test_invalid_relation_kind():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
-    tp2 = relay.TypeParam('tp2', relay.Kind.BaseType)
-    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
     args = tvm.convert([tp1, tp2, tp3])
 
     tr = relay.TypeRelation(None, args, 2, None)
@@ -79,19 +79,19 @@ def test_invalid_relation_kind():
 
 
 def test_func_with_invalid_ret_type():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
-    tp2 = relay.TypeParam('tp2', relay.Kind.Shape)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Shape)
     tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
 
 
 def test_func_with_invalid_arg_types():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
-    tp2 = relay.TypeParam('tp2', relay.Kind.Type)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Type)
     tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
 
 
 def test_func_with_invalid_tuple():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
 
     ret_type = relay.TupleType(tvm.convert([tp1, tp1, tp1]))
 
@@ -100,9 +100,9 @@ def test_func_with_invalid_tuple():
 
 
 def test_func_with_invalid_relation():
-    tp1 = relay.TypeParam('tp1', relay.Kind.Type)
-    tp2 = relay.TypeParam('tp2', relay.Kind.Shape)
-    tp3 = relay.TypeParam('tp3', relay.Kind.ShapeVar)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Shape)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
 
     tr = relay.TypeRelation(None, tvm.convert([tp2, tp3]), 1, None)
 
@@ -113,7 +113,7 @@ def test_func_with_invalid_relation():
 def test_tuple_with_invalid_func():
     tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
 
-    tp1 = relay.TypeParam('tp1', relay.Kind.Shape)
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
     tf = relay.FuncType(tvm.convert([]), tp1, tvm.convert([tp1]), tvm.convert([]))
 
     tup_ty = relay.TupleType(tvm.convert([tensor_type, tf]))
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index c4bacce3ddfc..f74aaf74e474 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -1,7 +1,6 @@
 import tvm
 from tvm import relay
 from tvm.relay.ir_pass import dead_code_elimination, alpha_equal
-from tvm.relay.ir_builder import convert, IRBuilder
 from tvm.relay.op import log, add, equal, subtract
 
 
@@ -19,9 +18,9 @@ def __init__(self):
         self.tt = relay.TensorType(self.shape, "float32")
         self.int32 = relay.TensorType([], "int32")
         self.float32 = relay.TensorType([], "float32")
-        self.one = convert(1.0)
-        self.two = convert(2.0)
-        self.three = convert(3.0)
+        self.one = relay.const(1.0)
+        self.two = relay.const(2.0)
+        self.three = relay.const(3.0)
 
 
 e = env()
@@ -58,9 +57,12 @@ def test_recursion():
     f = relay.Var("f")
     n = relay.Var("n", e.int32)
     data = relay.Var("data", e.float32)
-    funcbody = relay.If(equal(n, convert(0)), data, f(subtract(n, convert(1.0)), log(data)))
+    funcbody = relay.If(equal(n, relay.const(0)),
+                        data,
+                        relay.Call(f, [subtract(n, relay.const(1.0)),
+                                       log(data)]))
     value = relay.Function([n, data], funcbody, e.float32, [])
-    orig = relay.Let(f, funcbody, f(convert(2.0), convert(10000.0)))
+    orig = relay.Let(f, funcbody, relay.Call(f, [relay.const(2.0), relay.const(10000.0)]))
     assert alpha_equal(dead_code_elimination(orig), orig)
     assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three)), e.three)
 
@@ -70,8 +72,10 @@ def test_op_let():
 
 
 def test_if():
-    orig = relay.If(convert(True), e.a, e.b)
-    assert alpha_equal(dead_code_elimination(orig), e.a)
+    cond = relay.const(True)
+    orig = relay.If(cond, e.a, e.b)
+    y = dead_code_elimination(orig)
+    assert alpha_equal(y, e.a)
 
 
 def test_tuple_get_item():
@@ -82,10 +86,10 @@ def test_tuple_get_item():
 
 
 if __name__ == "__main__":
+    test_if()
     test_let()
     test_used_let()
     test_chain_unused_let()
     test_recursion()
     test_op_let()
-    test_if()
     test_tuple_get_item()
diff --git a/tests/python/relay/test_pass_free_vars.py b/tests/python/relay/test_pass_free_vars.py
index 524196661753..151dbe1412bc 100644
--- a/tests/python/relay/test_pass_free_vars.py
+++ b/tests/python/relay/test_pass_free_vars.py
@@ -28,7 +28,7 @@ def test_tuple():
 
 
 def test_free_type_vars():
-    tp = relay.TypeParam("")
+    tp = relay.TypeVar("")
     ty = relay.TupleType([tp, relay.TensorType([], "int32")])
     x = relay.Var("x", ty)
     y = relay.Var("y")
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 77b04590df59..2d8f98974639 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -4,34 +4,17 @@
 import tvm
 import numpy as np
 from tvm.relay.ir_pass import infer_type
-from tvm.relay.ir_builder import IRBuilder, func_type
-from tvm.relay.ir_builder import scalar_type, convert, tensor_type
-from tvm.relay.env import Environment
-from tvm.relay.op import log, add, equal, subtract, concatenate
-from tvm.relay.expr import Function
 from tvm import relay
 
-def assert_has_type(expr, typ, env=Environment({})):
-    checked_expr = infer_type(env, expr)
-    checked_type = checked_expr.checked_type
-    if checked_type != typ:
-        raise RuntimeError("Type mismatch %s vs %s" % (
-            checked_type, typ))
-
-
-def assert_decl_has_type(env, name, typ):
-    func = env[name]
-    assert func.checked_type == typ
-
 
 def test_monomorphic_let():
     "Program: let x = 1; return x"
-    b = IRBuilder()
-    x = b.let('x', 1.0, value_type=scalar_type('float64'))
-    b.ret(x)
+    sb = relay.ScopeBuilder()
+    x = sb.let('x', relay.const(1.0, "float64"))
+    sb.ret(x)
+    xchecked = relay.ir_pass.infer_type(sb.get())
+    assert xchecked.checked_type == relay.scalar_type("float64")
 
-    prog, env = b.get()
-    assert_has_type(prog, scalar_type('float64'))
 
 def test_dual_op():
     """Program:
@@ -41,31 +24,29 @@ def test_dual_op():
          return t1;
        }
     """
-    b = IRBuilder()
-    with b.function(('x', tensor_type(10, 10))) as func:
-        x, = func.param_ids()
-        t1 = b.let('t1', log(x))
-        t2 = b.let('t2', add(t1, x))
-        b.ret(t2)
-
-    assert_has_type(func.to_func(),
-                    func_type([tensor_type(10, 10)], tensor_type(10, 10)))
+    tp = relay.TensorType((10, 10), "float32")
+    x = relay.var("x", tp)
+    sb = relay.ScopeBuilder()
+    t1 = sb.let("t1", relay.log(x))
+    t2 = sb.let("t2", relay.add(t1, x))
+    sb.ret(t2)
+    f = relay.Function([x], sb.get())
+    fchecked = relay.ir_pass.infer_type(f)
+    assert fchecked.checked_type == relay.FuncType([tp], tp)
 
 
 def test_decl():
     """Program:
-       def f(x : Tensor[f32, (10, 10)]) {
-           let lx = log(x);
-           return lx;
+       def f(x : Tensor[(10, 10), f32]) {
+           return log(x);
        }
     """
-    b = IRBuilder()
-    x = b.param('x')
-    with b.decl('f', x):
-        lx = b.let('lx', log(x))
-        b.ret(lx)
-    _, env = b.get()
-    assert_decl_has_type(env, 'f', func_type(['float32'], 'float32'))
+    sb = relay.ScopeBuilder()
+    tp = relay.TensorType((10, 10))
+    x = relay.var("x", tp)
+    f = relay.Function([x], relay.log(x))
+    fchecked = relay.ir_pass.infer_type(f)
+    assert fchecked.checked_type == relay.FuncType([tp], tp)
 
 
 def test_recursion():
@@ -78,54 +59,44 @@ def f(n: i32, data: f32) -> f32 {
               return f(n - 1, log(data));
           }
        }
-       f(2, 10000);
     """
-    b = IRBuilder()
-    f = b.global_var('f')
-    n = b.param('n', ty='int32')
-    data = b.param('data', ty='float32')
-    with b.decl(f, n, data):
-        with b.if_scope(equal(n, convert(0))):
-            b.ret(data)
-        with b.else_scope():
-            b.ret(f(subtract(n, convert(1)), log(data)))
-    b.ret(f(convert(2.0), convert(10000.0)))
-    assert_decl_has_type(b.env, 'f', func_type(
-        ['int32', 'float32'], 'float32'))
-    # TODO(@jroesch): need evaluator or new runtime
-    # to execute this.
+    sb = relay.ScopeBuilder()
+    f = relay.GlobalVar("f")
+    ti32 = relay.scalar_type("int32")
+    tf32 = relay.scalar_type("float32")
+    n = relay.var("n", ti32)
+    data = relay.var("data", tf32)
+
+    with sb.if_scope(relay.equal(n, relay.const(0, ti32))):
+        sb.ret(data)
+    with sb.else_scope():
+        sb.ret(f(relay.subtract(n, relay.const(1, ti32)), relay.log(data)))
+    env = relay.Environment()
+    env[f] = relay.Function([n, data], sb.get())
+    assert "%3 = @f(%1, %2)" in env.astext()
+    assert env[f].checked_type == relay.FuncType([ti32, tf32], tf32)
 
-def test_concat():
-    """
-    Program:
-        def try_concat2(x: Float(3, 2), y: Float(2, 2)) -> Float(5, 2) {
-            return concatenate((x, y), axis=0);
-        }
-    """
-    ib = IRBuilder()
-    try_concat2 = ib.global_var('try_concat2')
-    x = ib.param('x', ty=tensor_type(3, 2))
-    y = ib.param('y', ty=tensor_type(2, 2))
-    with ib.decl(try_concat2, x, y):
-        ib.ret(concatenate((x, y), axis=0))
-    fn_ty = func_type([tensor_type(3, 2), tensor_type(2, 2)], tensor_type(5, 2))
-    assert_decl_has_type(ib.env, try_concat2, fn_ty)
 
 def test_tuple():
-    ib = IRBuilder()
-    dup = ib.global_var('dup')
-    x = ib.param('x')
-    with ib.decl(dup, x):
-        ib.ret(relay.Tuple([x, x]))
-    # todo: why is this not generalized?
-    fn_ty = func_type([tensor_type()], relay.TupleType([tensor_type(), tensor_type()]))
-    assert_decl_has_type(ib.env, dup, fn_ty)
+    tp = relay.TensorType((10,))
+    x = relay.var("x", tp)
+    res = relay.Tuple([x, x])
+    assert (relay.ir_pass.infer_type(res).checked_type ==
+            relay.TupleType([tp, tp]))
+
+
+def test_free_expr():
+    x = relay.var("x", "float32")
+    y = relay.add(x, x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.scalar_type("float32")
+
 
 if __name__ == "__main__":
+    test_free_expr()
     test_dual_op()
     test_recursion()
     test_monomorphic_let()
     test_decl()
     test_recursion()
-    test_concat()
     test_tuple()
diff --git a/tests/python/relay/test_type_solver.py b/tests/python/relay/test_type_solver.py
index c96ca59d2c8d..e8ff67756931 100644
--- a/tests/python/relay/test_type_solver.py
+++ b/tests/python/relay/test_type_solver.py
@@ -1,7 +1,5 @@
 import tvm
-
 from tvm import relay
-from tvm.relay.ir_builder import scalar_type, convert, tensor_type
 
 
 def make_rel(name, args, num_inputs=None, attrs=None):

From c4a1e9c070f290f64a1c9d1fe3e70ac9d695b0d9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 20 Oct 2018 07:45:42 -0700
Subject: [PATCH 251/529] Fix master CI due to stale push (#1943)

---
 tests/python/relay/test_pass_alpha_equal.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 7b8a0bbdfa49..7b27cb7ee2d4 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -194,9 +194,9 @@ def test_var_alpha_equal():
     v4 = relay.Var("v4", tt2)
     v5 = relay.Var("v5", tt3)
 
-    l4 = relay.Let(v3, convert(1), v3)
-    l5 = relay.Let(v4, convert(1), v4)
-    l6 = relay.Let(v5, convert(1), v5)
+    l4 = relay.Let(v3, relay.const(1), v3)
+    l5 = relay.Let(v4, relay.const(1), v4)
+    l6 = relay.Let(v5, relay.const(1), v5)
 
     # same annotations
     assert alpha_equal(l4, l5)

From d1cc91af055b7aba2e30ec57d2b95a27e38ebd9f Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Sat, 20 Oct 2018 21:53:41 -0700
Subject: [PATCH 252/529] Fix typo in module.py line 90 (#1947)

---
 python/tvm/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/module.py b/python/tvm/module.py
index 6cca6fb0f722..1ca09740aff4 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -87,7 +87,7 @@ def export_library(self,
             If fcompile has attribute object_format, will compile host library
             to that format. Otherwise, will use default format "o".
 
-        kwargs : dict, optiona;
+        kwargs : dict, optional
             Additional arguments passed to fcompile
         """
         if self.type_key == "stackvm":

From 40f3e295fdd4a3201bb4bda63951eee635431b04 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <moreau@cs.washington.edu>
Date: Sat, 20 Oct 2018 21:55:12 -0700
Subject: [PATCH 253/529] [VTA] pynq v2.1 -> v2.3 (#1945)

---
 apps/pynq_rpc/start_rpc_server.sh |  3 ++-
 cmake/modules/VTA.cmake           |  9 +++------
 docs/vta/install.md               |  2 +-
 vta/include/vta/driver.h          |  6 ------
 vta/python/vta/exec/rpc_server.py |  5 +++--
 vta/src/pynq/pynq_driver.cc       | 31 -------------------------------
 vta/src/pynq/pynq_driver.h        |  5 -----
 7 files changed, 9 insertions(+), 52 deletions(-)

diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/pynq_rpc/start_rpc_server.sh
index 30b3c9a90d6b..2dce74472414 100755
--- a/apps/pynq_rpc/start_rpc_server.sh
+++ b/apps/pynq_rpc/start_rpc_server.sh
@@ -2,4 +2,5 @@
 PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
 
 export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
-python -m vta.exec.rpc_server
+export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
+python3 -m vta.exec.rpc_server
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index 43fb700203c7..ea5bb5ae916a 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -38,13 +38,10 @@ elseif(PYTHON)
     set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
   endif(APPLE)
 
-  # PYNQ rules
+  # PYNQ rules for Pynq v2.3
   if(${VTA_TARGET} STREQUAL "pynq")
-    find_library(__sds_lib NAMES sds_lib PATHS /usr/lib)
-    find_library(__dma_lib NAMES dma PATHS
-      "/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/"
-      "/opt/python3.6/lib/python3.6/site-packages/pynq/lib/")
-    target_link_libraries(vta ${__sds_lib} ${__dma_lib})
+    find_library(__cma_lib NAMES cma PATH /usr/lib)
+    target_link_libraries(vta ${__cma_lib})
   endif()
 else()
   message(STATUS "Cannot found python in env, VTA build is skipped..")
diff --git a/docs/vta/install.md b/docs/vta/install.md
index ca5969386e80..4a05f9fd8318 100644
--- a/docs/vta/install.md
+++ b/docs/vta/install.md
@@ -67,7 +67,7 @@ This guide covers the following themes:
 
 Setup your Pynq board based on the [Pynq board getting started tutorial](http://pynq.readthedocs.io/en/latest/getting_started.html).
 You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point).
-* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.1](http://pynq-testing.readthedocs.io/en/image_v2.2/getting_started/pynq_image.html) (released 21 Feb 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
+* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.3](http://www.pynq.io/board.html) (released October 3rd 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
 * For this test setup, follow the ["Connect to a Computer"](http://pynq.readthedocs.io/en/latest/getting_started.html#connect-to-a-computer) Ethernet setup instructions. To be able to talk to the board, make sure to [assign your computer a static IP address](http://pynq.readthedocs.io/en/latest/appendix.html#assign-your-computer-a-static-ip)
 
 Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board:
diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h
index 269728c51cda..588819a5fe6b 100644
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
@@ -95,12 +95,6 @@ void VTAFlushCache(vta_phy_addr_t buf, int size);
  */
 void VTAInvalidateCache(vta_phy_addr_t buf, int size);
 
-/*!
- * \brief Programming the bit stream on the FPGA.
- * \param bitstream The path to the bit stream file.
- */
-void VTAProgram(const char* bitstream);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py
index 233d37ccad7c..768f6a00d451 100644
--- a/vta/python/vta/exec/rpc_server.py
+++ b/vta/python/vta/exec/rpc_server.py
@@ -10,9 +10,9 @@
 import ctypes
 import json
 import tvm
-from tvm._ffi.base import c_str
 from tvm import rpc
 from tvm.contrib import cc
+from pynq import Bitstream
 
 from ..environment import get_env
 from ..pkg_config import PkgConfig
@@ -51,7 +51,8 @@ def ext_dev_callback():
     @tvm.register_func("tvm.contrib.vta.init", override=True)
     def program_fpga(file_name):
         path = tvm.get_global_func("tvm.rpc.server.workpath")(file_name)
-        load_vta_dll().VTAProgram(c_str(path))
+        bitstream = Bitstream(path)
+        bitstream.download()
         logging.info("Program FPGA with %s", file_name)
 
     @tvm.register_func("tvm.rpc.server.shutdown", override=True)
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc
index 1909ed35c562..5c597d918b5f 100644
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
@@ -136,34 +136,3 @@ int VTADeviceRun(VTADeviceHandle handle,
   return static_cast<VTADevice*>(handle)->Run(
       insn_phy_addr, insn_count, wait_cycles);
 }
-
-void VTAProgram(const char* bitstream) {
-  int elem;
-  FILE *src, *dst, *partial;
-  partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
-  if (partial == NULL) {
-    printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
-        fclose(partial);
-        exit(1);
-  }
-  fputc('0', partial);
-  fclose(partial);
-  src = fopen(bitstream, "rb");
-  if (src == NULL) {
-    printf("Cannot open bitstream %s\n", bitstream);
-    exit(1);
-  }
-  dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
-  if (dst == NULL) {
-    printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
-    fclose(dst);
-    exit(1);
-  }
-  elem = fgetc(src);
-  while (elem != EOF) {
-    fputc(elem, dst);
-    elem = fgetc(src);
-  }
-  fclose(src);
-  fclose(dst);
-}
diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h
index 7aba00441abd..4a0b03ac25bd 100644
--- a/vta/src/pynq/pynq_driver.h
+++ b/vta/src/pynq/pynq_driver.h
@@ -37,11 +37,6 @@ void VTAUnmapRegister(void *vta, size_t length);
 void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
 uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
 
-/*! \brief (Pynq only) Partial bitstream status file path */
-#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
-/*! \brief (Pynq only) Bitstream destination file path */
-#define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
-
 /*! \brief (Pynq only) Path to /dev/mem */
 #define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
 /*! \brief (Pynq only) MMIO driver constant */

From 7e67e976ca056c8fc5af31a375b8522c3faa172c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sat, 20 Oct 2018 21:58:37 -0700
Subject: [PATCH 254/529] [Relay] format text_printer.cc (#1946)

---
 src/relay/ir/text_printer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 5bbcb0608e6f..66ef86641fae 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -121,7 +121,7 @@ class TextMetaDataContext {
 };
 
 class TextPrinter :
-    public ExprFunctor<TextValue(const Expr&)> ,
+    public ExprFunctor<TextValue(const Expr&)>,
     public TypeFunctor<void (const Type&, std::ostream& os)>,  // NOLINT(*)
     public AttrFunctor<void (const NodeRef&, std::ostream& os)> { // NOLINT(*)
  public:

From e312964857625ebb15eb60e1a8038ee234f5660e Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Sun, 21 Oct 2018 08:06:45 +0300
Subject: [PATCH 255/529] [TOPI] Specify non-zero absolute tolerance in tests
 (#1925)

---
 apps/extension/tests/test_ext.py              |  2 +-
 docs/deploy/aocl_fpga.md                      |  2 +-
 docs/deploy/aws_fpga.md                       |  2 +-
 nnvm/python/nnvm/testing/check_computation.py | 10 ++---
 nnvm/tests/python/compiler/test_build.py      | 18 ++++----
 .../python/compiler/test_compiler_cache.py    |  2 +-
 nnvm/tests/python/compiler/test_fold_axis.py  |  3 +-
 .../tests/python/compiler/test_nhwc_layout.py |  2 +-
 nnvm/tests/python/compiler/test_op_fusion.py  | 14 +++----
 nnvm/tests/python/compiler/test_optimizer.py  |  2 +-
 nnvm/tests/python/compiler/test_param_dict.py |  2 +-
 nnvm/tests/python/compiler/test_rpc_exec.py   |  2 +-
 nnvm/tests/python/compiler/test_top_assign.py |  4 +-
 nnvm/tests/python/compiler/test_top_level2.py | 28 ++++++-------
 nnvm/tests/python/compiler/test_top_level4.py | 32 +++++++-------
 .../python/frontend/coreml/test_forward.py    | 18 ++++----
 .../python/frontend/darknet/test_forward.py   |  4 +-
 .../python/frontend/keras/test_forward.py     |  2 +-
 .../python/frontend/mxnet/test_forward.py     |  4 +-
 .../python/frontend/onnx/test_forward.py      | 42 +++++++++----------
 .../frontend/tensorflow/test_forward.py       | 14 +++----
 python/tvm/__init__.py                        |  1 +
 python/tvm/testing.py                         | 12 ++++++
 tests/python/contrib/test_cblas.py            |  2 +-
 tests/python/contrib/test_cublas.py           |  2 +-
 tests/python/contrib/test_dlpack.py           |  2 +-
 tests/python/contrib/test_miopen.py           |  2 +-
 tests/python/contrib/test_mps.py              |  2 +-
 tests/python/contrib/test_mxnet_bridge.py     |  2 +-
 tests/python/contrib/test_nnpack.py           |  8 ++--
 tests/python/contrib/test_rocblas.py          |  2 +-
 tests/python/contrib/test_sort.py             |  4 +-
 tests/python/contrib/test_sparse.py           |  6 +--
 tests/python/integration/test_dot.py          |  2 +-
 tests/python/integration/test_ewise.py        | 12 +++---
 tests/python/integration/test_ewise_fpga.py   |  4 +-
 tests/python/integration/test_gemm.py         |  2 +-
 tests/python/integration/test_reduce.py       | 14 +++----
 tests/python/integration/test_scan.py         |  2 +-
 .../unittest/test_codegen_cross_llvm.py       |  2 +-
 tests/python/unittest/test_codegen_cuda.py    |  6 +--
 tests/python/unittest/test_codegen_device.py  |  4 +-
 tests/python/unittest/test_codegen_extern.py  |  8 ++--
 tests/python/unittest/test_codegen_llvm.py    | 24 +++++------
 tests/python/unittest/test_hybrid_script.py   |  4 +-
 tests/python/unittest/test_ir_builder.py      |  4 +-
 .../unittest/test_lang_tensor_overload_op.py  |  8 ++--
 tests/python/unittest/test_runtime_ndarray.py |  2 +-
 tests/python/unittest/test_runtime_rpc.py     |  2 +-
 .../integration/test_codegen_verilog.py       |  2 +-
 tests/webgl/test_local_gemm.py                |  2 +-
 tests/webgl/test_local_multi_stage.py         |  2 +-
 tests/webgl/test_local_save_load.py           |  2 +-
 tests/webgl/test_local_topi_conv2d_nchw.py    |  4 +-
 tests/webgl/test_local_topi_dense.py          |  2 +-
 tests/webgl/test_local_topi_pooling.py        |  4 +-
 tests/webgl/test_local_topi_softmax.py        |  4 +-
 tests/webgl/test_remote_save_load.py          |  2 +-
 topi/recipe/broadcast/test_broadcast_map.py   |  4 +-
 topi/recipe/conv/depthwise_conv2d_test.py     | 12 +++---
 topi/recipe/conv/test_conv2d_hwcn_map.py      |  4 +-
 topi/recipe/gemm/cuda_gemm_square.py          |  2 +-
 topi/recipe/gemm/gemm_int8.py                 |  2 +-
 topi/recipe/reduce/test_reduce_map.py         |  2 +-
 topi/recipe/rnn/matexp.py                     |  2 +-
 .../python/test_topi_bitserial_conv2d.py      |  4 +-
 topi/tests/python/test_topi_bnn.py            |  2 +-
 topi/tests/python/test_topi_broadcast.py      |  4 +-
 topi/tests/python/test_topi_clip.py           |  2 +-
 topi/tests/python/test_topi_conv2d_hwcn.py    |  4 +-
 topi/tests/python/test_topi_conv2d_int8.py    |  2 +-
 topi/tests/python/test_topi_conv2d_nchw.py    |  2 +-
 topi/tests/python/test_topi_conv2d_nhwc.py    |  2 +-
 .../python/test_topi_conv2d_transpose_nchw.py |  4 +-
 .../tests/python/test_topi_conv2d_winograd.py |  2 +-
 topi/tests/python/test_topi_dense.py          |  2 +-
 .../python/test_topi_depthwise_conv2d.py      | 12 +++---
 .../test_topi_depthwise_conv2d_back_input.py  |  2 +-
 .../test_topi_depthwise_conv2d_back_weight.py |  2 +-
 topi/tests/python/test_topi_dilate.py         |  2 +-
 topi/tests/python/test_topi_l2norm.py         |  2 +-
 topi/tests/python/test_topi_lrn.py            |  2 +-
 topi/tests/python/test_topi_math.py           |  2 +-
 topi/tests/python/test_topi_matmul.py         |  2 +-
 topi/tests/python/test_topi_pooling.py        |  4 +-
 topi/tests/python/test_topi_reduce.py         |  6 +--
 topi/tests/python/test_topi_region.py         |  2 +-
 topi/tests/python/test_topi_relu.py           |  6 +--
 topi/tests/python/test_topi_reorg.py          |  2 +-
 topi/tests/python/test_topi_resize.py         |  2 +-
 topi/tests/python/test_topi_shortcut.py       |  2 +-
 topi/tests/python/test_topi_softmax.py        |  4 +-
 topi/tests/python/test_topi_sparse.py         |  8 ++--
 topi/tests/python/test_topi_tensor.py         |  6 +--
 topi/tests/python/test_topi_transform.py      | 20 ++++-----
 topi/tests/python/test_topi_upsampling.py     |  2 +-
 topi/tests/python/test_topi_vision.py         |  6 +--
 topi/tests/python_cpp/test_topi_bnn.py        |  2 +-
 topi/tests/python_cpp/test_topi_clip.py       |  2 +-
 topi/tests/python_cpp/test_topi_dense.py      |  2 +-
 topi/tests/python_cpp/test_topi_dilate.py     |  2 +-
 topi/tests/python_cpp/test_topi_l2norm.py     |  2 +-
 topi/tests/python_cpp/test_topi_lrn.py        |  2 +-
 topi/tests/python_cpp/test_topi_pooling.py    |  4 +-
 topi/tests/python_cpp/test_topi_reduce.py     |  6 +--
 topi/tests/python_cpp/test_topi_region.py     |  2 +-
 topi/tests/python_cpp/test_topi_relu.py       |  6 +--
 topi/tests/python_cpp/test_topi_reorg.py      |  2 +-
 topi/tests/python_cpp/test_topi_softmax.py    |  4 +-
 topi/tests/python_cpp/test_topi_tensor.py     |  6 +--
 topi/tests/python_cpp/test_topi_transform.py  | 20 ++++-----
 topi/tests/python_cpp/test_topi_yolo.py       |  2 +-
 tutorials/autotvm/tune_conv2d_cuda.py         |  2 +-
 tutorials/autotvm/tune_simple_template.py     |  2 +-
 tutorials/get_started.py                      |  8 ++--
 tutorials/language/extern_op.py               |  4 +-
 tutorials/language/reduction.py               |  2 +-
 tutorials/language/scan.py                    |  2 +-
 tutorials/language/tensorize.py               |  4 +-
 tutorials/nnvm/using_external_lib.py          |  2 +-
 tutorials/optimize/opt_gemm.py                | 14 +++----
 tutorials/topi/intro_topi.py                  |  2 +-
 .../python/integration/test_benchmark_gemm.py |  2 +-
 .../integration/test_benchmark_topi_conv2d.py |  4 +-
 vta/tutorials/convolution_opt.py              |  2 +-
 125 files changed, 330 insertions(+), 316 deletions(-)
 create mode 100644 python/tvm/testing.py

diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index 628602f0baea..b7b97897a0fa 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -22,7 +22,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
     check_llvm()
 
 
diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
index f29fc9ef1ace..c9c50dc56be6 100644
--- a/docs/deploy/aocl_fpga.md
+++ b/docs/deploy/aocl_fpga.md
@@ -52,7 +52,7 @@ b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
 c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
 
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 ```
 
 Setup
diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
index 86918ca90a80..9d8af7d97a94 100644
--- a/docs/deploy/aws_fpga.md
+++ b/docs/deploy/aws_fpga.md
@@ -55,7 +55,7 @@ b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
 c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
 
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 ```
 
 Setup
diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
index 76d7b66b140f..aab3f916e19f 100644
--- a/nnvm/python/nnvm/testing/check_computation.py
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -281,10 +281,10 @@ def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
         Additional parameters for `check_numerical_grads`.
 
     atol : float, optional
-        Absolute tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+        Absolute tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.
 
     rtol : float, optional
-        Relative tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+        Relative tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.
 
     quiet : bool, optional
         Don't dump additional information to stdout on failure.
@@ -466,7 +466,7 @@ def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
                                      .format(len(numpy_res), out_len))
 
                 for i in range(out_len):
-                    np.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
+                    tvm.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
 
             if backward is not None:
                 nothing_was_done = False
@@ -495,8 +495,8 @@ def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
                                          .format(set(grad_var_names) - set(numpy_grads)))
 
                 for x_name in numpy_grads:
-                    np.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
-                                               atol=atol, rtol=rtol)
+                    tvm.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
+                                                atol=atol, rtol=rtol)
 
             if numerical_grads:
                 nothing_was_done = False
diff --git a/nnvm/tests/python/compiler/test_build.py b/nnvm/tests/python/compiler/test_build.py
index 7697497d3dbc..387225f550ab 100644
--- a/nnvm/tests/python/compiler/test_build.py
+++ b/nnvm/tests/python/compiler/test_build.py
@@ -27,7 +27,7 @@ def verify(graph, lib):
         # get outputs
         out = tvm.nd.empty(shape, dtype)
         get_output(0, out)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
 
     graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
@@ -49,7 +49,7 @@ def test_run():
     nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     res = _run_graph(z, {"x": nx, "y": ny})
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         res[0].asnumpy(), np.exp(nx.asnumpy() + ny.asnumpy()))
 
 
@@ -73,7 +73,7 @@ def test_precompute_prune():
     m["load_params"](nnvm.compiler.save_param_dict(params))
     m.run()
     out = m.get_output(0, out=res)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         res.asnumpy(), nx.asnumpy() + 1 + ny.asnumpy() + na.asnumpy())
 
 
@@ -92,7 +92,7 @@ def test_dtypes():
         m.run(x=data)
         data = (data > 0) * data
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5)
 
 def test_ndarray_output():
     x = sym.Variable("x")
@@ -110,7 +110,7 @@ def test_ndarray_output():
     m.set_input("y", ny)
     m.run()
     out = m.get_output(0)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         out.asnumpy(), nx.asnumpy() + ny.asnumpy())
 
 def test_ndarray_input():
@@ -131,12 +131,12 @@ def test_ndarray_input():
     in_y = tvm.nd.empty(shape, dtype)
     m.get_input("x", in_x)
     m.get_input("y", in_y)
-    np.testing.assert_allclose(nx.asnumpy(), in_x.asnumpy())
-    np.testing.assert_allclose(ny.asnumpy(), in_y.asnumpy())
+    tvm.testing.assert_allclose(nx.asnumpy(), in_x.asnumpy())
+    tvm.testing.assert_allclose(ny.asnumpy(), in_y.asnumpy())
     in_nx = m.get_input("x")
     in_ny = m.get_input("y")
-    np.testing.assert_allclose(nx.asnumpy(), in_nx.asnumpy())
-    np.testing.assert_allclose(ny.asnumpy(), in_ny.asnumpy())
+    tvm.testing.assert_allclose(nx.asnumpy(), in_nx.asnumpy())
+    tvm.testing.assert_allclose(ny.asnumpy(), in_ny.asnumpy())
 
 def test_num_outputs():
     x = sym.Variable('x')
diff --git a/nnvm/tests/python/compiler/test_compiler_cache.py b/nnvm/tests/python/compiler/test_compiler_cache.py
index 970b193a6875..623f05048348 100644
--- a/nnvm/tests/python/compiler/test_compiler_cache.py
+++ b/nnvm/tests/python/compiler/test_compiler_cache.py
@@ -19,7 +19,7 @@ def verify(graph, lib):
         m.run(x=na, y=nb)
         # get outputs
         out = m.get_output(0, tvm.nd.empty(shape, dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
 
     engine = nnvm.compiler.engine
diff --git a/nnvm/tests/python/compiler/test_fold_axis.py b/nnvm/tests/python/compiler/test_fold_axis.py
index ab90cd723989..a7611fbde797 100644
--- a/nnvm/tests/python/compiler/test_fold_axis.py
+++ b/nnvm/tests/python/compiler/test_fold_axis.py
@@ -1,4 +1,5 @@
 """Unittest cases for fold_axis"""
+import tvm
 import nnvm
 import nnvm.testing.resnet
 import numpy as np
@@ -147,7 +148,7 @@ def run_prune(graph, params, opt_level):
 
     x = run_prune(graph, params, 0)
     y = run_prune(graph, params, 3)
-    np.testing.assert_allclose(y[0].asnumpy(), x[0].asnumpy())
+    tvm.testing.assert_allclose(y[0].asnumpy(), x[0].asnumpy())
 
 
 if __name__ == "__main__":
diff --git a/nnvm/tests/python/compiler/test_nhwc_layout.py b/nnvm/tests/python/compiler/test_nhwc_layout.py
index 96a8135435c3..f1aced94a0b3 100644
--- a/nnvm/tests/python/compiler/test_nhwc_layout.py
+++ b/nnvm/tests/python/compiler/test_nhwc_layout.py
@@ -50,7 +50,7 @@ def test_nhwc():
     oshape_nhwc = (1, 224, 224, out_channel)
     nchw_output = build_and_run(nchw_sym, nchw_params, data, oshape)
     nhwc_output = build_and_run(nhwc_sym, nhwc_params, data.transpose(0, 2, 3, 1), oshape_nhwc)
-    np.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2), rtol=1e-5, atol=1e-5)
+    tvm.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2), rtol=1e-5, atol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 288f112f1063..4c4773773d47 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -22,7 +22,7 @@ def test_ewise_injective():
         x_np = np.random.uniform(size=dshape).astype(dtype)
         m.run(x=x_np)
         out = m.get_output(0, tvm.nd.empty((10, 6)))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),  x_np.reshape(out.shape) * 2 + 1,
             atol=1e-5, rtol=1e-5)
 
@@ -54,7 +54,7 @@ def test_conv_ewise_injective():
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + 1
         c_np = c_np.reshape(c_np.shape[0], np.prod(c_np.shape[1:])) + 1
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_injective_reduce_injective():
@@ -74,7 +74,7 @@ def test_injective_reduce_injective():
         c_np = np.sum(data.reshape(32, 18 * 18) + 1, axis=1)
         # get output
         out = m.get_output(0, tvm.nd.empty(c_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_injective_conv2d():
@@ -107,7 +107,7 @@ def test_injective_conv2d():
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         weight = np.mean(data.asnumpy(), axis=(2, 3))
         c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_concatenate_conv2d():
@@ -140,7 +140,7 @@ def test_concatenate_conv2d():
         conv = topi.testing.conv2d_nchw_python(
             concat, kernel.asnumpy(), (1,1), 'SAME')
         ref = concat + conv
-        np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
 
 
 def test_residual_block_layout_transform():
@@ -178,7 +178,7 @@ def test_residual_block_layout_transform():
     conv2 = topi.testing.conv2d_nchw_python(
         conv1, kernel2.asnumpy(), (1,1), 'SAME')
     ref = np.maximum(conv1 + conv2, 0)
-    np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
 
 
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
@@ -218,7 +218,7 @@ def get_sym(out_channel):
         _, params2 = utils.create_workload(sym2, 1, dshape[1:], seed=0)
         output1, g1 = build_and_run(sym1, params1, data, oshape, target, ctx, opt_level=2)
         output2, g2 = build_and_run(sym2, params2, data, oshape, target, ctx, opt_level=0)
-        np.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5)
         # data, conv weight, bias, batch norm gamma, batch norm beta, conv op
         assert g1.index.num_nodes == 6
 
diff --git a/nnvm/tests/python/compiler/test_optimizer.py b/nnvm/tests/python/compiler/test_optimizer.py
index fd620271d861..413227d88091 100644
--- a/nnvm/tests/python/compiler/test_optimizer.py
+++ b/nnvm/tests/python/compiler/test_optimizer.py
@@ -27,7 +27,7 @@ def helper(symbol, inputs, params, update_func, run_times, target, ctx, dtype="f
         m.run()
     y_np = update_func(**np_inputs)
     out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
 
 
 def test_sgd():
diff --git a/nnvm/tests/python/compiler/test_param_dict.py b/nnvm/tests/python/compiler/test_param_dict.py
index a6605123fa0d..447db305d98c 100644
--- a/nnvm/tests/python/compiler/test_param_dict.py
+++ b/nnvm/tests/python/compiler/test_param_dict.py
@@ -68,7 +68,7 @@ def verify_nnvm(remote, target, shape, dtype):
         m.load_params(nnvm.compiler.save_param_dict(params))
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
-        np.testing.assert_allclose(a + 1, out.asnumpy())
+        tvm.testing.assert_allclose(a + 1, out.asnumpy())
 
     print("Test RPC connection to PowerPC...")
     remote = rpc.connect(host, port)
diff --git a/nnvm/tests/python/compiler/test_rpc_exec.py b/nnvm/tests/python/compiler/test_rpc_exec.py
index 111ba724e196..8177f1b153ab 100644
--- a/nnvm/tests/python/compiler/test_rpc_exec.py
+++ b/nnvm/tests/python/compiler/test_rpc_exec.py
@@ -43,7 +43,7 @@ def test_rpc_executor():
     # get outputs
     out = tvm.nd.empty(shape, dtype, ctx)
     get_output(0, out)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
     server.terminate()
 
diff --git a/nnvm/tests/python/compiler/test_top_assign.py b/nnvm/tests/python/compiler/test_top_assign.py
index e411385712f5..95c16c96c443 100644
--- a/nnvm/tests/python/compiler/test_top_assign.py
+++ b/nnvm/tests/python/compiler/test_top_assign.py
@@ -27,11 +27,11 @@ def check(target, ctx):
         m.set_input("w", data)
         m.run()
         out = m.get_input("w2", tvm.nd.empty(dshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 2, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 2, rtol=1e-5)
 
         m.run()
         out = m.get_input("w2", tvm.nd.empty(dshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 3, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 3, rtol=1e-5)
 
     for target, ctx in ctx_list():
         check(target, ctx)
diff --git a/nnvm/tests/python/compiler/test_top_level2.py b/nnvm/tests/python/compiler/test_top_level2.py
index c26f5356557f..0585f3c974b7 100644
--- a/nnvm/tests/python/compiler/test_top_level2.py
+++ b/nnvm/tests/python/compiler/test_top_level2.py
@@ -22,7 +22,7 @@ def run_test_conv2d(sym, dtype, dshape, kshape, oshape, shape_dict, padding):
             c_np = topi.testing.conv2d_nchw_python(
                 data.asnumpy(), kernel.asnumpy(), 1, padding)
             c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-            np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
     x = sym.Variable("x")
     y = sym.conv2d(x, channels=10, kernel_size=(3,3),
@@ -71,7 +71,7 @@ def test_mixed_precision():
         c_np = topi.testing.conv2d_nchw_python(
             data.asnumpy().astype(out_dtype),
             kernel.asnumpy().astype(out_dtype), 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_dilated_conv2d():
@@ -97,7 +97,7 @@ def test_dilated_conv2d():
         c_np = topi.testing.conv2d_nchw_python(
             data.asnumpy(), dkernel_np, 1, 1)
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_grouped_conv2d_nchw():
@@ -120,7 +120,7 @@ def test_grouped_conv2d_nchw():
         c_np = topi.testing.depthwise_conv2d_python_nchw(
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 def test_grouped_conv2d_nhwc():
     x = sym.Variable("x")
@@ -142,7 +142,7 @@ def test_grouped_conv2d_nhwc():
         c_np = topi.testing.depthwise_conv2d_python_nhwc(
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(1, 1, kshape[2])
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_conv2d_transpose():
@@ -167,7 +167,7 @@ def test_conv2d_transpose():
         c_np = c_np + bias.asnumpy().reshape(kshape[1], 1, 1)
         d_np = np.zeros(shape=oshape)
         d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np
-        np.testing.assert_allclose(out.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), d_np, rtol=1e-5)
 
 
 def test_max_pool2d():
@@ -185,7 +185,7 @@ def test_max_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.max(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_avg_pool2d():
@@ -202,7 +202,7 @@ def test_avg_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.mean(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_avg_pool2d_no_count_pad():
@@ -237,7 +237,7 @@ def test_avg_pool2d_no_count_pad():
         data = tvm.nd.array(a_np)
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty((n, oc, oh, ow), dtype))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_global_max_pool2d():
@@ -254,7 +254,7 @@ def test_global_max_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.max(data.asnumpy(), axis=(2,3), keepdims=True)
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_global_avg_pool2d():
@@ -271,7 +271,7 @@ def test_global_avg_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.mean(data.asnumpy(), axis=(2,3), keepdims=True)
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_upsampling_nearest_neighbor():
@@ -290,7 +290,7 @@ def test_upsampling_nearest_neighbor():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.upsampling_python(a_np, scale, "NCHW")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 def test_upsampling_bilinear():
     x = sym.Variable("x")
@@ -309,7 +309,7 @@ def test_upsampling_bilinear():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.bilinear_resize_python(a_np, (32*scale, 32*scale), "NCHW")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
 def test_resize_bilinear():
     x = sym.Variable("x")
@@ -327,7 +327,7 @@ def test_resize_bilinear():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.bilinear_resize_python(a_np, (60, 60), "NHWC")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
 if __name__ == "__main__":
     test_mixed_precision()
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 16b02f956ccc..46383e73657e 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -24,7 +24,7 @@ def verify_transpose(dshape, axes):
         m.run(x=data)
         out_np = np.transpose(data.asnumpy(), axes=axes) + 1
         out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float32', **kwargs):
     """ Verify reduce operations by comparign its result with `result` """
@@ -43,7 +43,7 @@ def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float
         out = m.get_output(0, tvm.nd.empty(oshape, dtype=otype))
         if isinstance(result, np.ndarray):
             np.testing.assert_equal(out.asnumpy().shape, result.shape)
-            np.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+            tvm.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
         else:
             tvm_out = out.asnumpy()
             assert abs(result - tvm_out) <= (1e-5 + 1e-5 * abs(tvm_out))
@@ -68,7 +68,7 @@ def verify_collapse(dshape, target_shape, fnp):
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(target_shape))
         out_np = fnp(data)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 
 def test_transpose():
@@ -149,7 +149,7 @@ def verify_flip(ishape, axis):
         m = graph_runtime.create(graph, lib, ctx)
         m.run(x=x_np)
         out = m.get_output(0, tvm.nd.empty(res.shape))
-        np.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
 
 
 def test_flip():
@@ -174,7 +174,7 @@ def verify_reshape(dshape, oshape):
         m.run(x=data)
         out_np = data.asnumpy().reshape(oshape) + 1
         out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 
 def test_reshape():
@@ -435,7 +435,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=value, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -445,7 +445,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=1, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -455,7 +455,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=0, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -465,7 +465,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=value, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -475,7 +475,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=1, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -485,7 +485,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=0, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -534,7 +534,7 @@ def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1),
     m.set_input("data", np.random.uniform(size=dshape).astype(dtype))
     m.run()
     out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
 
 def test_multibox_prior():
     verify_multibox_prior((1, 3, 50, 50))
@@ -571,7 +571,7 @@ def test_multibox_transform_loc():
     m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)})
     m.run()
     out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
 
 def test_nms():
     dshape = (1, 5, 6)
@@ -599,7 +599,7 @@ def test_nms():
     m.set_input(**{"data": np_data, "valid_count": np_valid_count})
     m.run()
     out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
-    np.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
 
 def np_slice_like(np_data, np_shape_like, axis=[]):
     begin_idx = [0 for _ in np_data.shape]
@@ -634,7 +634,7 @@ def verify_slice_like(np_data, np_shape_like, axis=[]):
         m.set_input(**{"data1": np_data, "data2": np_shape_like})
         m.run()
         out = m.get_output(0, tvm.nd.empty(np_result.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
 
 def test_slice_like():
     np_data = np.random.uniform(size=(3, 4, 5))
@@ -673,7 +673,7 @@ def verify_where(condition, x, y):
         m.set_input(**{"condition": condition, "x": x, "y": y})
         m.run()
         out = m.get_output(0, tvm.nd.empty(x.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
 
 def test_where():
     shape = (13, 8, 224, 224, 6)
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index a33a7c5a5ed7..214c917cb96d 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -109,7 +109,7 @@ def verify_AddLayerParams(input_dim, alpha=2):
                            ['input1', 'input2'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_AddLayerParams():
     verify_AddLayerParams((1, 2, 2), 0)
@@ -139,7 +139,7 @@ def verify_MultiplyLayerParams(input_dim, alpha):
                            ['input1', 'input2'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_MultiplyLayerParams():
     verify_MultiplyLayerParams((1, 2, 2), 0)
@@ -168,7 +168,7 @@ def verify_ConcatLayerParams(input1_dim, input2_dim):
                            ['input1', 'input2'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_ConcatLayerParams():
     verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2))
@@ -198,7 +198,7 @@ def verify_UpsampleLayerParams(input_dim, scale, mode):
     model = cm.models.MLModel(builder.spec)
     for target, ctx in ctx_list():
         out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_UpsampleLayerParams():
     verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN')
@@ -218,7 +218,7 @@ def verify_l2_normalize(input_dim, eps):
     model = cm.models.MLModel(builder.spec)
     for target, ctx in ctx_list():
         out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001)
@@ -243,7 +243,7 @@ def verify_lrn(input_dim, size, bias, alpha, beta):
     model = cm.models.MLModel(builder.spec)
     for target, ctx in ctx_list():
         out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_lrn():
     verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5)
@@ -271,7 +271,7 @@ def verify_average(input_dim1, input_dim2, axis=0):
                            ['input1', 'input2'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_average():
     verify_average((1, 3, 20, 20), (1, 3, 20, 20))
@@ -303,7 +303,7 @@ def verify_max(input_dim):
                            ['input1', 'input2', 'input3'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_max():
     verify_max((1, 3, 20, 20))
@@ -334,7 +334,7 @@ def verify_min(input_dim):
                            ['input1', 'input2', 'input3'],
                            b_np.shape,
                            dtype)
-        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
 
 def test_forward_min():
     verify_min((1, 3, 20, 20))
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index b1d5e735611a..f836ca477dda 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -139,7 +139,7 @@ def get_darknet_output(net, img):
 
     tvm_out = _get_tvm_output(net, data, build_dtype)
     for tvm_outs, darknet_out in zip(tvm_out, darknet_output):
-        np.testing.assert_allclose(darknet_out, tvm_outs, rtol=1e-3, atol=1e-3)
+        tvm.testing.assert_allclose(darknet_out, tvm_outs, rtol=1e-3, atol=1e-3)
 
 def test_rnn_forward(net):
     '''Test network with given input data on both darknet and tvm'''
@@ -158,7 +158,7 @@ def get_darknet_network_predict(net, data):
     last_layer = net.layers[net.n-1]
     darknet_outshape = (last_layer.batch, last_layer.outputs)
     darknet_out = darknet_out.reshape(darknet_outshape)
-    np.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4)
+    tvm.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4)
 
 def test_forward_extraction():
     '''test extraction model'''
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 2e1c378d27cd..96c51a94ff69 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -52,7 +52,7 @@ def to_channels_last(arr):
         for kout, tout in zip(keras_out, tvm_out):
             if need_transpose:
                 tout = to_channels_last(tout)
-            np.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
 
 def test_forward_elemwise_add():
     r = []
diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py
index dcab7d8fcde7..653af1a63154 100644
--- a/nnvm/tests/python/frontend/mxnet/test_forward.py
+++ b/nnvm/tests/python/frontend/mxnet/test_forward.py
@@ -62,13 +62,13 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
         gluon_out, gluon_sym = get_gluon_output(name, x)
         for target, ctx in ctx_list():
             tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
-            np.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
     else:
         mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
         assert "data" not in args
         for target, ctx in ctx_list():
             tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
-            np.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_mlp():
     mlp = model_zoo.mx_mlp
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 7ca520a88b12..e0d77277f98b 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -70,7 +70,7 @@ def verify_onnx_forward_impl(graph_file, data_shape, out_shape):
     c2_out = get_caffe2_output(model, x, dtype)
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype)
-        np.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_super_resolution_example():
     verify_onnx_forward_impl(super_resolution, (1, 1, 224, 224), (1, 1, 672, 672))
@@ -112,7 +112,7 @@ def test_reshape():
         x = np.random.uniform(size=in_shape).astype('int32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
-    np.testing.assert_allclose(ref_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(ref_shape, tvm_out.shape)
 
 def test_reshape_like():
     in_shape = (4, 3, 3, 4)
@@ -142,7 +142,7 @@ def test_reshape_like():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
-    np.testing.assert_allclose(ref_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(ref_shape, tvm_out.shape)
 
 def _test_power_iteration(x_shape, y_shape):
     if isinstance(y_shape, int):
@@ -168,7 +168,7 @@ def _test_power_iteration(x_shape, y_shape):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape)
-        np.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_power():
     _test_power_iteration((1, 3), (1))
@@ -193,7 +193,7 @@ def test_squeeze():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
-    np.testing.assert_allclose(out_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(out_shape, tvm_out.shape)
 
 def test_unsqueeze():
     in_shape = (3, 3)
@@ -214,7 +214,7 @@ def test_unsqueeze():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
-    np.testing.assert_allclose(out_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(out_shape, tvm_out.shape)
 
 def verify_gather(in_shape, indices, axis, dtype):
     x = np.random.uniform(size=in_shape).astype(dtype)
@@ -235,7 +235,7 @@ def verify_gather(in_shape, indices, axis, dtype):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape)
-        np.testing.assert_allclose(out_np, tvm_out)
+        tvm.testing.assert_allclose(out_np, tvm_out)
 
 def test_gather():
     verify_gather((4,), [1], 0, 'int32')
@@ -263,7 +263,7 @@ def _test_slice_iteration(indata, outdata, starts, ends, axes=None):
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
 
-    np.testing.assert_allclose(outdata, tvm_out)
+    tvm.testing.assert_allclose(outdata, tvm_out)
 
 def test_slice():
     x = np.random.randn(20, 10, 5).astype(np.float32)
@@ -290,7 +290,7 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype)
 
-    np.testing.assert_allclose(outdata, tvm_out)
+    tvm.testing.assert_allclose(outdata, tvm_out)
 
 def test_floor():
     _test_onnx_op_elementwise((2, 4, 5, 6), np.floor, {}, 'float32', 'Floor', {})
@@ -329,7 +329,7 @@ def test_matmul():
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape)
-        np.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
     in_array = np.random.uniform(size=shape).astype(dtype)
@@ -376,7 +376,7 @@ def _get_python_lrn():
         # get outputs
         tvm_out = m.get_output(0, tvm.nd.empty(shape, dtype))
         py_out = _get_python_lrn()
-        np.testing.assert_allclose(py_out, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(py_out, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
 
 def test_lrn():
     verify_lrn((5, 5, 5, 5), 3, 'float32')
@@ -400,7 +400,7 @@ def _test_upsample_nearest():
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
-        np.testing.assert_allclose(out_array, tvm_out)
+        tvm.testing.assert_allclose(out_array, tvm_out)
 
 def _test_upsample_bilinear():
     scale = 2
@@ -420,7 +420,7 @@ def _test_upsample_bilinear():
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
-        np.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_upsample():
     _test_upsample_nearest()
@@ -447,7 +447,7 @@ def _test_softmax(inshape, axis):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, indata, target, ctx, outshape, 'float32')
-        np.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_softmax():
     _test_softmax((1, 10), None)
@@ -479,7 +479,7 @@ def verify_min(input_dim):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_min():
     verify_min((1, 3, 20, 20))
@@ -511,7 +511,7 @@ def verify_max(input_dim):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_max():
     verify_max((1, 3, 20, 20))
@@ -543,7 +543,7 @@ def verify_mean(input_dim):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_mean():
     verify_mean((1, 3, 20, 20))
@@ -569,7 +569,7 @@ def verify_hardsigmoid(input_dim, alpha, beta):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_hardsigmoid():
     verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6)
@@ -618,7 +618,7 @@ def _argmin_numpy(data, axis=0, keepdims=True):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_argmax(input_dim, axis=None, keepdims=None):
     def _argmax_numpy(data, axis=0, keepdims=True):
@@ -665,7 +665,7 @@ def _argmax_numpy(data, axis=0, keepdims=True):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
-        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_arg_min_max():
     '''Verify argmin and argmax'''
@@ -705,7 +705,7 @@ def verify_constantfill(is_shape, input_dim, out_dim, value, dtype, **kwargs):
         else:
             tvm_out = get_tvm_output(model, [input_a], target, ctx, out.shape)
 
-        np.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_constantfill():
     verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index d73080d1cb00..2ebc7b671ba5 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -120,7 +120,7 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False,
                 continue
 
             tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target=device)
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+            tvm.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
 
         sess.close()
 
@@ -580,7 +580,7 @@ def _get_tensorflow_output():
     out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
     out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
     tvm_out = [out, out_state_c, out_state_h]
-    np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
+    tvm.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
 
 def test_forward_lstm():
     '''test LSTM block cell'''
@@ -653,7 +653,7 @@ def test_forward_inception_v3():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', 'InceptionV3/Predictions/Reshape_1:0')
             tvm_output = run_tvm_graph(graph_def, data, 'input')
-            np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Inception V1
@@ -689,7 +689,7 @@ def test_forward_inception_v1():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0')
             tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents')
-            np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Mobilenet
@@ -707,7 +707,7 @@ def test_forward_mobilenet():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0')
             tvm_output = run_tvm_graph(graph_def, data, 'input')
-            np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # ResnetV2
@@ -726,7 +726,7 @@ def test_forward_resnetv2():
             with tf.Session() as sess:
                 tf_output = run_tf_graph(sess, data, 'input_tensor:0', out_node + ':0')
                 tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', tf_output.shape, 'float32')
-                np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+                tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # PTB
@@ -834,7 +834,7 @@ def _get_sample(data, state):
                                 in_state, cnt_sample)
         tf_sample_str = _pretty_print(tf_samples, False, id_to_word)
         inpt = tvm_sample_str
-        np.testing.assert_allclose(tf_samples, tvm_samples, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(tf_samples, tvm_samples, rtol=1e-5, atol=1e-5)
         assert(tvm_sample_str == tf_sample_str)
 
 #######################################################################
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index a028dfeddf36..e202c5adb967 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -17,6 +17,7 @@
 from . import target
 from . import generic
 from . import hybrid
+from . import testing
 
 from . import ndarray as nd
 from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
new file mode 100644
index 000000000000..5c0b9b9da4ae
--- /dev/null
+++ b/python/tvm/testing.py
@@ -0,0 +1,12 @@
+""" TVM testing utilities """
+import numpy as np
+
+def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
+    """ Version of np.testing.assert_allclose with `atol` and `rtol` fields set
+    in reasonable defaults.
+
+    Arguments `actual` and `desired` are not interchangable, since the function
+    compares the `abs(actual-desired)` with `atol+rtol*abs(desired)`.  Since we
+    often allow `desired` to be close to zero, we generally want non-zero `atol`.
+    """
+    np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True)
diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py
index 6c9f24711896..890820ba4519 100644
--- a/tests/python/contrib/test_cblas.py
+++ b/tests/python/contrib/test_cblas.py
@@ -27,7 +27,7 @@ def verify(target="llvm"):
         d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
         bb = 10.0
         f(a, b, d, bb)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
index 40fc20aec049..07c7e9224fcb 100644
--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -24,7 +24,7 @@ def verify(target="cuda"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index 9a8ea34e69d5..f97b002368ab 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -34,7 +34,7 @@ def test():
         f_pytorch = to_pytorch_func(f)
         zz2 = torch.empty(137,137)
         f_pytorch(xx, yy, zz2)
-        np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
+        tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
 
     except ImportError:
         pass
diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py
index 4e13b052e616..0d9e6dda2d7a 100644
--- a/tests/python/contrib/test_miopen.py
+++ b/tests/python/contrib/test_miopen.py
@@ -56,7 +56,7 @@ def verify():
         y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
         f_ref(x, w, y_ref)
         print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy())))
-        np.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)
+        tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)
 
     verify()
 
diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py
index 6187d01b2ab8..635724921708 100644
--- a/tests/python/contrib/test_mps.py
+++ b/tests/python/contrib/test_mps.py
@@ -41,7 +41,7 @@ def verify(A, B, D, s, target="metal"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5)
     verify(A, B, D, s)
 
diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py
index 2228f7305c6b..d511ec61d6a3 100644
--- a/tests/python/contrib/test_mxnet_bridge.py
+++ b/tests/python/contrib/test_mxnet_bridge.py
@@ -40,7 +40,7 @@ def mxnet_check():
     mxf(xx, yy, zz, 10.0)
 
 
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         zz.asnumpy(), (xx.asnumpy() + yy.asnumpy()) * 10)
 
 
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index af5c8e5dfa5c..a6c6b8158ff3 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -28,7 +28,7 @@ def verify(target="llvm"):
         d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
         bb = 10.0
         f(a, b, d, bb)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5)
     verify()
 
@@ -58,7 +58,7 @@ def verify(target="llvm"):
         d = tvm.nd.array(np.zeros((m, ), dtype=D.dtype), ctx)
         bb = 10.0
         f(a, b, d, bb)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5)
     verify()
 
@@ -142,7 +142,7 @@ def verify(target="llvm"):
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
         nd = np_conv(np.reshape(na, (1, IC, IH, IW)), nb, PAD, STRIDE)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             td.asnumpy(), nd.reshape(IC, IH, IW), rtol=1e-5)
     verify()
 
@@ -187,7 +187,7 @@ def verify(target="llvm"):
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
         nd = np_conv(na, nb, PAD)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             td.asnumpy(), nd, rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py
index 38b911f106c5..5f076a3e8963 100644
--- a/tests/python/contrib/test_rocblas.py
+++ b/tests/python/contrib/test_rocblas.py
@@ -24,7 +24,7 @@ def verify(target="rocm"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index 3a99779e58f0..f34dad9e41fb 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -28,7 +28,7 @@ def test_sort():
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
     f(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
 
 def test_sort_np():
     dshape = (1, 2, 3, 4, 5, 6)
@@ -55,7 +55,7 @@ def test_sort_np():
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
     f(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
 
 if __name__ == "__main__":
     test_sort()
diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py
index f7a0d1d137a5..ed46ba2ea74a 100644
--- a/tests/python/contrib/test_sparse.py
+++ b/tests/python/contrib/test_sparse.py
@@ -27,7 +27,7 @@ def test_static_tensor():
     c.indices = a.indices
     c.indptr = a.indptr
     f(a.data, c.data)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
 
 def test_dynamic_tensor():
     dtype = 'float32'
@@ -53,7 +53,7 @@ def test_dynamic_tensor():
     c.indices = a.indices
     c.indptr = a.indptr
     f(a.data.shape[0], a.data, c.data)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
 
 def test_sparse_array_tuple():
     dtype, itype = 'float32', 'int32'
@@ -91,7 +91,7 @@ def test_sparse_array_tuple():
     c.indices = a.indices
     c.indptr = a.indptr
     f(a.data.shape[0], a.data, c.data)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
 
 if __name__ == "__main__":
     test_static_tensor()
diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py
index 1073d43bceaa..15243c8b1235 100644
--- a/tests/python/integration/test_dot.py
+++ b/tests/python/integration/test_dot.py
@@ -46,7 +46,7 @@ def verify(target):
         b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
         c  = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)
 
     verify("llvm")
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index f16f15325735..0f58c2367576 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -31,7 +31,7 @@ def check_device(device, host="stackvm"):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         fexp(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
 
     check_device("opencl -device=intel_graphics")
@@ -75,7 +75,7 @@ def check_device(device, host="stackvm"):
         a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         func(a0, a1, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()),
             rtol=1e-5)
 
@@ -106,7 +106,7 @@ def test_log_pow_llvm():
     ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat)
     res = ftimer(a, b)
     assert(len(res.results) == repeat)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5)
 
 
@@ -136,7 +136,7 @@ def check_device(device):
             a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx)
             b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx)
             func(a, b)
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5)
 
         check_device("llvm")
@@ -186,7 +186,7 @@ def check_device(device):
             c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
             ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1)
             tcost = ftimer(a, b, c).mean
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6)
 
         check_device("opencl")
@@ -233,7 +233,7 @@ def check_device(device):
         a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), a.asnumpy() + 3, rtol=1e-6)
     check_device("cuda")
 
diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
index 2524e2d230b1..493cb5016cfe 100644
--- a/tests/python/integration/test_ewise_fpga.py
+++ b/tests/python/integration/test_ewise_fpga.py
@@ -37,7 +37,7 @@ def check_device(device, host="llvm"):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         fexp(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
 
     check_device("sdaccel")
@@ -78,7 +78,7 @@ def check_device(device, host="llvm"):
         c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx)
         d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx)
         fadd(a, b, c, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
 
     check_device("sdaccel")
diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
index 6e74052d8283..928ba187a4d3 100644
--- a/tests/python/integration/test_gemm.py
+++ b/tests/python/integration/test_gemm.py
@@ -85,7 +85,7 @@ def check_device(device):
         ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = ftimer(a, b, c).mean
         print("%s: exec=%g sec/op" % (ctx, tcost))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)
 
     check_device("vulkan")
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index c8fb98746bf6..e2285808ccd3 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -42,7 +42,7 @@ def check_device(device, host="stackvm"):
             npy[:2] = 0
             res = np_reducer(x.asnumpy(), axis=1)
             res[:2] = 0
-            np.testing.assert_allclose(npy, res, rtol=1e-4)
+            tvm.testing.assert_allclose(npy, res, rtol=1e-4)
 
         check_device("metal")
         check_device("vulkan")
@@ -78,7 +78,7 @@ def check_target(target="llvm"):
         b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target()
@@ -108,7 +108,7 @@ def check_target(target="llvm"):
         b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target()
@@ -155,7 +155,7 @@ def check_target(device, host="stackvm"):
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=1)
         res[:2] = 0
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target("vulkan")
@@ -206,7 +206,7 @@ def check_target(device, host="stackvm"):
         b  = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=1) + 2
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target("vulkan")
@@ -256,7 +256,7 @@ def check_target():
         nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
         nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
         fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        np.testing.assert_allclose(np_res, nd_res0.asnumpy())
+        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())
 
     check_target()
 
@@ -316,7 +316,7 @@ def check_target(device):
         nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
         nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
         fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        np.testing.assert_allclose(np_res, nd_res0.asnumpy())
+        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())
 
     check_target("cuda")
     check_target("vulkan")
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index 855f3e072133..49d1cf3b75ce 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -38,7 +38,7 @@ def check_device(device):
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), ctx)
         fscan(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.cumsum(a_np, axis=0))
 
     check_device("vulkan")
diff --git a/tests/python/unittest/test_codegen_cross_llvm.py b/tests/python/unittest/test_codegen_cross_llvm.py
index aa6f9d708a41..5b9c509aedf2 100644
--- a/tests/python/unittest/test_codegen_cross_llvm.py
+++ b/tests/python/unittest/test_codegen_cross_llvm.py
@@ -67,7 +67,7 @@ def build_arm():
             b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
             c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
             farm(a, b, c)
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + b.asnumpy())
             print("Verification finish on remote..")
 
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index 0b54863d1aab..a0b1cf445ba6 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -27,7 +27,7 @@ def check_cuda(dtype, n, lanes):
             np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, ctx)
         fun(a, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
         
     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
@@ -62,7 +62,7 @@ def check_cuda(dtype, n, lanes):
         c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
         d = tvm.nd.empty((n,), D.dtype, ctx)
         fun(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), np_d)
+        tvm.testing.assert_allclose(d.asnumpy(), np_d)
     check_cuda("int8", 64, 4)
 
 def test_cuda_vectorize_load():
@@ -83,7 +83,7 @@ def check_cuda(dtype, n, lanes):
         a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
         b = tvm.nd.empty((n,), B.dtype, ctx)
         fun(a,b)
-        np.testing.assert_allclose(a.asnumpy(), b.asnumpy())
+        tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy())
     check_cuda("int8", 64, 8)
     check_cuda("int8", 64, 16)
 
diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py
index 0bb072ebf0bd..ee7644cea677 100644
--- a/tests/python/unittest/test_codegen_device.py
+++ b/tests/python/unittest/test_codegen_device.py
@@ -51,7 +51,7 @@ def check_target(device, host="stackvm"):
         b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
         d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
         f(a, b, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)
 
     def check_module_save(device, host="stackvm"):
@@ -75,7 +75,7 @@ def check_module_save(device, host="stackvm"):
         b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
         d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
         f(a, b, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)
 
     check_target("cuda", host="stackvm")
diff --git a/tests/python/unittest/test_codegen_extern.py b/tests/python/unittest/test_codegen_extern.py
index dfbf1820c21d..7512f0d23634 100644
--- a/tests/python/unittest/test_codegen_extern.py
+++ b/tests/python/unittest/test_codegen_extern.py
@@ -46,7 +46,7 @@ def check_target(target):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
         
     check_target("llvm")
     check_target("opencl")
@@ -80,7 +80,7 @@ def check_target(target):
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
 
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy())
     check_target("stackvm")
     check_target("llvm")
@@ -112,12 +112,12 @@ def check_target(target):
         @tvm.register_func
         def my_extern_array_func2(aa, bb):
             assert aa.shape == a.shape
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 aa.asnumpy(), a.asnumpy() + 1)
             aa.copyto(bb)
 
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1)
 
     check_target("llvm")
diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py
index 9f282050df3e..66a7fc48c287 100644
--- a/tests/python/unittest/test_codegen_llvm.py
+++ b/tests/python/unittest/test_codegen_llvm.py
@@ -52,7 +52,7 @@ def check_llvm(use_file):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), a.asnumpy() + 1.0)
     check_llvm(use_file=True)
     check_llvm(use_file=False)
@@ -106,7 +106,7 @@ def check_llvm():
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
 
     with tvm.build_config(offset_factor=4):
@@ -138,7 +138,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(c.asnumpy(),
+        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                    rtol=1e-5)
 
@@ -164,7 +164,7 @@ def check_llvm(nn, base):
         a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy()[::-1][:n])
     check_llvm(4, 0)
     check_llvm(128, 8)
@@ -195,7 +195,7 @@ def check_llvm(n, lanes):
             np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), C.dtype, ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1)
     check_llvm(64, 2)
     check_llvm(512, 2)
@@ -220,7 +220,7 @@ def check_llvm(nn, base, stride):
         a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy()[base:] + 1)
     check_llvm(64, 0, 2)
     check_llvm(4, 0, 1)
@@ -247,7 +247,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1 + 1)
     check_llvm()
 
@@ -277,10 +277,10 @@ def check_llvm():
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd1(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
         fadd2(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_llvm()
 
@@ -302,7 +302,7 @@ def check_llvm(n, offset):
         f(a, c)
         c_np = a.asnumpy()
         c_np[:offset] = 0
-        np.testing.assert_allclose(c.asnumpy(), c_np)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np)
     check_llvm(64, 8)
 
 
@@ -321,7 +321,7 @@ def check_llvm(n):
         c = tvm.nd.empty((n,), C.dtype, ctx)
         f(a, c)
         c_np = a.asnumpy() == 1
-        np.testing.assert_allclose(c.asnumpy(), c_np)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np)
     check_llvm(64)
 
 
@@ -345,7 +345,7 @@ def check_llvm(n):
         d = tvm.nd.empty((), D.dtype, ctx)
         f(a, sc, d)
         d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
-        np.testing.assert_allclose(d.asnumpy(), d_np)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np)
     check_llvm(64)
 
 
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index ef0bcf8f72e5..3124586ca343 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -38,7 +38,7 @@ def tvm_val_2_py_val(val):
     module(*nd_args)
 
     for nd, np in to_check:
-        numpy.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
 
 
 @script
@@ -257,7 +257,7 @@ def intrin_real(a):
     tvm_a = tvm.ndarray.array(a)
     func(tvm_a)
     intrin_real(a)
-    numpy.testing.assert_allclose(a, tvm_a.asnumpy(), rtol=1e-5)
+    tvm.testing.assert_allclose(a, tvm_a.asnumpy(), rtol=1e-5)
 
     @script
     def intrin_int(a):
diff --git a/tests/python/unittest/test_ir_builder.py b/tests/python/unittest/test_ir_builder.py
index 864455d01ad9..a257571752e4 100644
--- a/tests/python/unittest/test_ir_builder.py
+++ b/tests/python/unittest/test_ir_builder.py
@@ -84,7 +84,7 @@ def check_target(target):
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd(a, b, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("llvm")
 
 def test_gpu():
@@ -125,7 +125,7 @@ def check_target(target):
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd(a, b, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("opencl")
     check_target("cuda")
 
diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py
index 14853e89188a..95cceaac338e 100644
--- a/tests/python/unittest/test_lang_tensor_overload_op.py
+++ b/tests/python/unittest/test_lang_tensor_overload_op.py
@@ -66,7 +66,7 @@ def test_combination():
     c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), ctx)
     d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
     foo(x, a, b, c, d)
-    np.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() / x)
+    tvm.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() / x)
 
 
 def verify_tensor_scalar_bop(shape, typ="add"):
@@ -111,7 +111,7 @@ def check_device(device):
         a_nd = tvm.nd.array(a_npy, ctx)
         b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
         foo(a_nd, b_nd, k_, *shape)
-        np.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
+        tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
@@ -160,7 +160,7 @@ def check_device(device):
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
         for _ in range(1):
             foo(lhs_nd, rhs_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
@@ -213,7 +213,7 @@ def check_device(device):
         b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
         c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx)
         foo(a_nd, w_nd, b_nd, c_nd)
-        np.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py
index 7be538199a58..935f8f38a804 100644
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -53,7 +53,7 @@ def test_fp16_conversion():
         expected = x_tvm.asnumpy().astype(dst)
         real = y_tvm.asnumpy()
 
-        np.testing.assert_allclose(expected, real)
+        tvm.testing.assert_allclose(expected, real)
 
 if __name__ == "__main__":
     test_nd_create()
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index 756b2867184d..eb7458555979 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -31,7 +31,7 @@ def verify_rpc(remote, target, shape, dtype):
         remote.upload(path_dso)
         f = remote.load_module("dev_lib.o")
         f(a, b)
-        np.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
+        tvm.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
 
     print("Test RPC connection to PowerPC...")
     remote = rpc.connect(host, port)
diff --git a/tests/verilog/integration/test_codegen_verilog.py b/tests/verilog/integration/test_codegen_verilog.py
index 26c0a9e36c9d..7ce264797012 100644
--- a/tests/verilog/integration/test_codegen_verilog.py
+++ b/tests/verilog/integration/test_codegen_verilog.py
@@ -60,7 +60,7 @@ def check_target(device, host="stackvm"):
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, b, c)
         print("Check correctness...")
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("verilog")
 
diff --git a/tests/webgl/test_local_gemm.py b/tests/webgl/test_local_gemm.py
index 0dd1c0fc7376..e3b9c862a5f9 100644
--- a/tests/webgl/test_local_gemm.py
+++ b/tests/webgl/test_local_gemm.py
@@ -35,7 +35,7 @@ def test_local_gemm():
     c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
     f(a, b, c)
 
-    np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
+    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
 
 if __name__ == "__main__":
     test_local_gemm()
diff --git a/tests/webgl/test_local_multi_stage.py b/tests/webgl/test_local_multi_stage.py
index 47fa5c76c7aa..1791241d68ee 100644
--- a/tests/webgl/test_local_multi_stage.py
+++ b/tests/webgl/test_local_multi_stage.py
@@ -24,7 +24,7 @@ def test_local_multi_stage():
     c = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
     f(a, c)
 
-    np.testing.assert_allclose(c.asnumpy(), (a.asnumpy() + 1) * 2)
+    tvm.testing.assert_allclose(c.asnumpy(), (a.asnumpy() + 1) * 2)
 
 if __name__ == "__main__":
     test_local_multi_stage()
diff --git a/tests/webgl/test_local_save_load.py b/tests/webgl/test_local_save_load.py
index 5ed058a7461c..bcf9f0a8d5bf 100644
--- a/tests/webgl/test_local_save_load.py
+++ b/tests/webgl/test_local_save_load.py
@@ -30,7 +30,7 @@ def test_local_save_load():
     f.export_library(path_so)
     f1 = tvm.module.load(path_so)
     f1(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 if __name__ == "__main__":
     test_local_save_load()
diff --git a/tests/webgl/test_local_topi_conv2d_nchw.py b/tests/webgl/test_local_topi_conv2d_nchw.py
index 106534505694..598446456b4e 100644
--- a/tests/webgl/test_local_topi_conv2d_nchw.py
+++ b/tests/webgl/test_local_topi_conv2d_nchw.py
@@ -49,8 +49,8 @@ def check_device(device):
             func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
             func1(a, w, b)
             func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_dense.py b/tests/webgl/test_local_topi_dense.py
index f2e7dfc1331c..75f6dac5d1f8 100644
--- a/tests/webgl/test_local_topi_dense.py
+++ b/tests/webgl/test_local_topi_dense.py
@@ -45,7 +45,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_pooling.py b/tests/webgl/test_local_topi_pooling.py
index 813fcd227e2f..35e893b94e6e 100644
--- a/tests/webgl/test_local_topi_pooling.py
+++ b/tests/webgl/test_local_topi_pooling.py
@@ -60,7 +60,7 @@ def check_device(device):
 
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
@@ -98,7 +98,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_softmax.py b/tests/webgl/test_local_topi_softmax.py
index 34f8bfb8d8f5..45c0c18098ed 100644
--- a/tests/webgl/test_local_topi_softmax.py
+++ b/tests/webgl/test_local_topi_softmax.py
@@ -32,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["opengl"]:
         check_device(device)
@@ -63,7 +63,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["opengl"]:
         check_device(device)
diff --git a/tests/webgl/test_remote_save_load.py b/tests/webgl/test_remote_save_load.py
index b1db6ce741c5..f14b2f2a2eae 100644
--- a/tests/webgl/test_remote_save_load.py
+++ b/tests/webgl/test_remote_save_load.py
@@ -73,7 +73,7 @@ def try_remote_save_load():
     b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
     c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
     fhost(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 if __name__ == "__main__":
     try_remote_save_load()
diff --git a/topi/recipe/broadcast/test_broadcast_map.py b/topi/recipe/broadcast/test_broadcast_map.py
index 9c4e521ddd0d..11a4a34647db 100644
--- a/topi/recipe/broadcast/test_broadcast_map.py
+++ b/topi/recipe/broadcast/test_broadcast_map.py
@@ -48,7 +48,7 @@ def test_broadcast_to(in_shape, out_shape):
     out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())
     for _ in range(2):
         fcuda(data_nd, out_nd)
-    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
 
 def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
@@ -95,7 +95,7 @@ def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
     out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
     for _ in range(2):
         fcuda(lhs_nd, rhs_nd, out_nd)
-    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
 
 if __name__ == "__main__":
diff --git a/topi/recipe/conv/depthwise_conv2d_test.py b/topi/recipe/conv/depthwise_conv2d_test.py
index d02f088e989a..cce36517a5ea 100644
--- a/topi/recipe/conv/depthwise_conv2d_test.py
+++ b/topi/recipe/conv/depthwise_conv2d_test.py
@@ -106,9 +106,9 @@ def check_device(device):
         for c in range(in_channel * channel_multiplier):
             scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
@@ -195,9 +195,9 @@ def check_device(device):
         for c in range(in_channel * channel_multiplier):
             scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c]
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py
index 8c8471d7af9c..c92dcc5d8fe7 100644
--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -64,10 +64,10 @@ def check_device(device):
                               unroll_explicit=device == 'rocm'):
             func1 = tvm.build(s1, [A, W, B], device)
             func1(a, w, b)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
             func2 = tvm.build(s2, [A, W, C], device)
             func2(a, w, c)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'rocm']:
         check_device(device)
diff --git a/topi/recipe/gemm/cuda_gemm_square.py b/topi/recipe/gemm/cuda_gemm_square.py
index f2cabb26bb66..2a47e22e0b59 100644
--- a/topi/recipe/gemm/cuda_gemm_square.py
+++ b/topi/recipe/gemm/cuda_gemm_square.py
@@ -118,7 +118,7 @@ def check_device(device):
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         for i in range(2):
             f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)
 
         num_flops = 2 * nn * nn * nn
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index ed735dad9cd9..43029094a25c 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -140,7 +140,7 @@ def block_size_filter(entity):
     c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx)
     f(a, b, c)
 
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         c.asnumpy(),
         np.dot(
             a_np.astype('int32'),
diff --git a/topi/recipe/reduce/test_reduce_map.py b/topi/recipe/reduce/test_reduce_map.py
index 6e9befaff2ec..5fadf10d94f9 100644
--- a/topi/recipe/reduce/test_reduce_map.py
+++ b/topi/recipe/reduce/test_reduce_map.py
@@ -67,7 +67,7 @@ def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
 
     for _ in range(2):
         fcuda(data_tvm, out_tvm)
-    np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 4E-4, 4E-4)
+    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
 
 if __name__ == "__main__":
     test_reduce_map(in_shape=(128, 24, 128, 24),
diff --git a/topi/recipe/rnn/matexp.py b/topi/recipe/rnn/matexp.py
index 13f6357209c6..dddadb8ba5f3 100644
--- a/topi/recipe/rnn/matexp.py
+++ b/topi/recipe/rnn/matexp.py
@@ -145,7 +145,7 @@ def check_device(target):
                 for j in range(n_num_hidden):
                     if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5:
                         print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j]))
-            np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
+            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
     check_device("cuda")
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py
index 82af0006c2ef..6979cf1ce437 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d.py
@@ -46,7 +46,7 @@ def get_ref_data():
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     func = tvm.build(s, [A, W, B], "llvm")
     func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
                         activation_bits, weight_bits, dorefa):
@@ -85,7 +85,7 @@ def get_ref_data():
     func = tvm.build(s, [A, W, B], 'llvm')
 
     func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def test_bitserial_conv2d():
     in_size = 56
diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py
index cf9f377e9e1c..7d132bfff0fe 100644
--- a/topi/tests/python/test_topi_bnn.py
+++ b/topi/tests/python/test_topi_bnn.py
@@ -44,7 +44,7 @@ def get_ref_data():
     f1(a, bnn_a)
     f2(b, bnn_b)
     f3(bnn_a, bnn_b, bnn_c)
-    np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
+    tvm.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
 
 def test_binary_dense():
     verify_binary_dense(1, 4096, 1024)
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py
index 4ed5b31708e4..3be938852fdf 100644
--- a/topi/tests/python/test_topi_broadcast.py
+++ b/topi/tests/python/test_topi_broadcast.py
@@ -23,7 +23,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for target in get_all_backend():
         check_device(target)
@@ -77,7 +77,7 @@ def check_device(device):
         out_npy = fnumpy(lhs_npy, rhs_npy)
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
         foo(lhs_nd, rhs_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
     for target in get_all_backend():
         check_device(target)
diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py
index f1367463eb4f..128a45c46f60 100644
--- a/topi/tests/python/test_topi_clip.py
+++ b/topi/tests/python/test_topi_clip.py
@@ -33,7 +33,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device, name="clip")
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py
index af1afcb9ea9a..bbd8dc3a6db9 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -47,8 +47,8 @@ def check_device(device):
         func2 = tvm.build(s2, [A, W, C], device)
         func1(a, w, b)
         func2(a, w, c)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
index 2b85b2b97cb1..af2d9e2046c4 100644
--- a/topi/tests/python/test_topi_conv2d_int8.py
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -83,7 +83,7 @@ def check_device(device):
         else:
             func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
             func(a, w, c)
-        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ["cuda"]:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index 14aa0b742a8a..45dded7953d4 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -66,7 +66,7 @@ def check_device(device):
         else:
             func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
             func(a, w, c)
-        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in get_all_backend():
         with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py
index 7e41517c5d61..ba52251c4f5b 100644
--- a/topi/tests/python/test_topi_conv2d_nhwc.py
+++ b/topi/tests/python/test_topi_conv2d_nhwc.py
@@ -42,7 +42,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, W, B], device)
         func(a, w, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
index 5f65c038be60..296772f4e9f5 100644
--- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
@@ -48,8 +48,8 @@ def check_device(device):
         func2 = tvm.build(s2, [A, W, C], device)
         func1(a, w, b)
         func2(a, w, c)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py
index 575e75ce2272..1666bc24991c 100644
--- a/topi/tests/python/test_topi_conv2d_winograd.py
+++ b/topi/tests/python/test_topi_conv2d_winograd.py
@@ -65,7 +65,7 @@ def check_device(device):
         else:
             func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
             func(a, w, c)
-        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
 
     for device in ['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']:
diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py
index 92f95f3e0497..60ef4be4c8e0 100644
--- a/topi/tests/python/test_topi_dense.py
+++ b/topi/tests/python/test_topi_dense.py
@@ -44,7 +44,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index b03916b9ba09..51f2c418c121 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -97,9 +97,9 @@ def get_ref_data():
         # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
         timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
         tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
         with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
@@ -197,9 +197,9 @@ def get_ref_data():
         timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
         tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
         with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
index f7c027344840..78b01ef42167 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
@@ -80,7 +80,7 @@ def get_ref_data():
         # launch the kernel
         timer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean
-        np.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
+        tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
 
     check_device("opencl")
     check_device("cuda")
diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
index da5b0351ae3c..50838a7c863f 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
@@ -73,7 +73,7 @@ def get_ref_data():
         # launch the kernel
         timer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean
-        np.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4)
+        tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4)
 
     check_device("opencl")
     check_device("cuda")
diff --git a/topi/tests/python/test_topi_dilate.py b/topi/tests/python/test_topi_dilate.py
index 9cc44719745a..d1e157f5e52f 100644
--- a/topi/tests/python/test_topi_dilate.py
+++ b/topi/tests/python/test_topi_dilate.py
@@ -19,7 +19,7 @@ def _test_dilate(input_size, strides):
         output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
         f = tvm.build(schedule, [Input, Output], target)
         f(input_tvm, output_tvm)
-        np.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
+        tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
 
     _test_dilate((32,), (2,))
     _test_dilate((32,32), (2,2))
diff --git a/topi/tests/python/test_topi_l2norm.py b/topi/tests/python/test_topi_l2norm.py
index 75dc57057893..2bf799407398 100644
--- a/topi/tests/python/test_topi_l2norm.py
+++ b/topi/tests/python/test_topi_l2norm.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py
index 478054ddb134..2f96a86f164e 100644
--- a/topi/tests/python/test_topi_lrn.py
+++ b/topi/tests/python/test_topi_lrn.py
@@ -28,7 +28,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 5d606d507387..22713aa6cfdd 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -37,7 +37,7 @@ def check_device(device):
             a = tvm.nd.array(a_np, ctx)
             b = tvm.nd.array(np.zeros_like(b_np), ctx)
             foo(a, b)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
         for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel',
                        'aocl_sw_emu']:
diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py
index 407a2859b467..bd79bc4cba41 100644
--- a/topi/tests/python/test_topi_matmul.py
+++ b/topi/tests/python/test_topi_matmul.py
@@ -27,7 +27,7 @@ def verify_matmul(sa, sb, transp_a, transp_b):
     c1 = np.matmul(np.transpose(a) if transp_a else a,
                    np.transpose(b) if transp_b else b)
     c2 = with_tvm(lambda A,B: topi.matmul(A,B,transp_a,transp_b), a,b)
-    np.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
+    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
 
 def test_matmul():
     verify_matmul((1,1),(1,1),False,False)
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index 578adf60094a..273320fce727 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -64,7 +64,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
@@ -109,7 +109,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 0be652948060..3b3472f538b7 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -87,11 +87,11 @@ def check_device(device):
                 sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                 out_tvm_val = in_npy_map[sel_indices]
             if type == "argmax":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
             elif type == "argmin":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
-            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
+            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
     for device in get_all_backend():
         check_device(device)
 
diff --git a/topi/tests/python/test_topi_region.py b/topi/tests/python/test_topi_region.py
index a2835339e8eb..3357382b232e 100644
--- a/topi/tests/python/test_topi_region.py
+++ b/topi/tests/python/test_topi_region.py
@@ -37,7 +37,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device)
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 3e38e707a6da..a7ff64f0f759 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -27,7 +27,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="relu")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
@@ -45,7 +45,7 @@ def verify_leaky_relu(m, alpha):
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
     foo(a, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 
 def verify_prelu(x, w, axis, weight_reshape):
@@ -68,7 +68,7 @@ def _prelu_numpy(x, W):
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
     out_np = _prelu_numpy(x_np, w_np)
-    np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
 
 def test_relu():
     verify_relu(10, 128)
diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py
index 5b15b9f6c5aa..339cafe3ba41 100644
--- a/topi/tests/python/test_topi_reorg.py
+++ b/topi/tests/python/test_topi_reorg.py
@@ -38,7 +38,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device)
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_resize.py b/topi/tests/python/test_topi_resize.py
index cb2a69caf22b..6926a3a2a73c 100644
--- a/topi/tests/python/test_topi_resize.py
+++ b/topi/tests/python/test_topi_resize.py
@@ -38,7 +38,7 @@ def check_device(device):
         f = tvm.build(s, [A, B], device)
         f(a, b)
 
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
 
     for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_shortcut.py b/topi/tests/python/test_topi_shortcut.py
index b5840fe8e7b2..f89aa46a1e66 100644
--- a/topi/tests/python/test_topi_shortcut.py
+++ b/topi/tests/python/test_topi_shortcut.py
@@ -36,7 +36,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A1, A2, B], device)
         func(a1, a2, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py
index cad30fa00e5b..1990a9e99d65 100644
--- a/topi/tests/python/test_topi_softmax.py
+++ b/topi/tests/python/test_topi_softmax.py
@@ -32,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
@@ -63,7 +63,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in get_all_backend():
         check_device(device)
diff --git a/topi/tests/python/test_topi_sparse.py b/topi/tests/python/test_topi_sparse.py
index 51c1bf3227d4..16a5ad33f201 100644
--- a/topi/tests/python/test_topi_sparse.py
+++ b/topi/tests/python/test_topi_sparse.py
@@ -47,7 +47,7 @@ def check_device(device):
         assert a.indptr.dtype == A.indptr.dtype
         f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmv")
         f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     for device in ["llvm"]:
         check_device(device)
@@ -89,7 +89,7 @@ def check_device(device):
         f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
 
         f(_nr, a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2, atol=1e-2)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2, atol=1e-2)
 
     for device in ["llvm"]:
         check_device(device)
@@ -127,7 +127,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
         f(a.data, a.indices, a.indptr, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     check_device('llvm')
 
@@ -164,7 +164,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
         f(a, b.data, b.indices, b.indptr, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
 
     check_device('llvm')
 
diff --git a/topi/tests/python/test_topi_tensor.py b/topi/tests/python/test_topi_tensor.py
index 3d563c21b5c4..f54472716521 100644
--- a/topi/tests/python/test_topi_tensor.py
+++ b/topi/tests/python/test_topi_tensor.py
@@ -32,7 +32,7 @@ def check_device(device):
         tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
         f(*tvm_nd)
         np_out = np.sum(np.array(np_nd), axis=0)
-        np.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
@@ -59,11 +59,11 @@ def check_device(device):
         out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         f = tvm.build(s1, [A, B], device, name="full_like")
         f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
         f = tvm.build(s2, [C], device, name="full")
         f(out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 8da7f0828c2f..5c810f85e4c6 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -22,7 +22,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -45,7 +45,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -68,7 +68,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -96,7 +96,7 @@ def check_device(device):
             out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -121,7 +121,7 @@ def check_device(device):
         data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -146,7 +146,7 @@ def check_device(device):
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in get_all_backend():
         check_device(device)
@@ -181,7 +181,7 @@ def check_device(device):
         tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
         out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
         f(tvm_input, tvm_shape_like, out)
-        np.testing.assert_allclose(out.asnumpy(), input)
+        tvm.testing.assert_allclose(out.asnumpy(), input)
 
     for device in ["llvm"]:
         check_device(device)
@@ -204,7 +204,7 @@ def check_device(device):
         data_nd = tvm.nd.array(x_np, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
@@ -243,7 +243,7 @@ def check_device(device):
         indices_nd = tvm.nd.array(indices_src, ctx)
         out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
     for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
@@ -270,7 +270,7 @@ def check_device(device):
         data_nd = tvm.nd.array(x_np, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py
index ec657d490fb6..c10ce6e61b5a 100644
--- a/topi/tests/python/test_topi_upsampling.py
+++ b/topi/tests/python/test_topi_upsampling.py
@@ -43,7 +43,7 @@ def check_device(device):
         f = tvm.build(s, [A, B], device)
         f(a, b)
 
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
     for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index 959b10f82ca5..547d7bdcfbf6 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -41,7 +41,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
         f = tvm.build(s, [data, valid_count, out], device)
         f(tvm_data, tvm_valid_count, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
@@ -100,7 +100,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx)
         f = tvm.build(s, [data, out], device)
         f(tvm_input_data, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
@@ -148,7 +148,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), ctx)
         f = tvm.build(s, [cls_prob, loc_preds, anchors, out], device)
         f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_bnn.py b/topi/tests/python_cpp/test_topi_bnn.py
index 3fa5cfc4a0a7..83d880311eff 100644
--- a/topi/tests/python_cpp/test_topi_bnn.py
+++ b/topi/tests/python_cpp/test_topi_bnn.py
@@ -44,7 +44,7 @@ def get_ref_data():
     f1(a, bnn_a)
     f2(b, bnn_b)
     f3(bnn_a, bnn_b, bnn_c)
-    np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
+    tvm.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
 
 def test_binary_dense():
     verify_binary_dense(1, 4096, 1024)
diff --git a/topi/tests/python_cpp/test_topi_clip.py b/topi/tests/python_cpp/test_topi_clip.py
index fe00408642f5..d1aca4cb904c 100644
--- a/topi/tests/python_cpp/test_topi_clip.py
+++ b/topi/tests/python_cpp/test_topi_clip.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device, name="clip")
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_dense.py b/topi/tests/python_cpp/test_topi_dense.py
index f2369af4319a..636257de7919 100644
--- a/topi/tests/python_cpp/test_topi_dense.py
+++ b/topi/tests/python_cpp/test_topi_dense.py
@@ -47,7 +47,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_dilate.py b/topi/tests/python_cpp/test_topi_dilate.py
index f1924239cc77..1f7f1d8bceeb 100644
--- a/topi/tests/python_cpp/test_topi_dilate.py
+++ b/topi/tests/python_cpp/test_topi_dilate.py
@@ -19,7 +19,7 @@ def _test_dilate(input_size, strides):
         output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
         f = tvm.build(schedule, [Input, Output], target)
         f(input_tvm, output_tvm)
-        np.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
+        tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
 
     _test_dilate((32,), (2,))
     _test_dilate((32,32), (2,2))
diff --git a/topi/tests/python_cpp/test_topi_l2norm.py b/topi/tests/python_cpp/test_topi_l2norm.py
index 08799f76c5c3..fef2710b8d79 100644
--- a/topi/tests/python_cpp/test_topi_l2norm.py
+++ b/topi/tests/python_cpp/test_topi_l2norm.py
@@ -30,7 +30,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="l2_normalize")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_lrn.py b/topi/tests/python_cpp/test_topi_lrn.py
index d685643a9406..14a0eaa27781 100644
--- a/topi/tests/python_cpp/test_topi_lrn.py
+++ b/topi/tests/python_cpp/test_topi_lrn.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-1)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-1)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_pooling.py b/topi/tests/python_cpp/test_topi_pooling.py
index 42232c8e4848..9997fb6738c2 100644
--- a/topi/tests/python_cpp/test_topi_pooling.py
+++ b/topi/tests/python_cpp/test_topi_pooling.py
@@ -67,7 +67,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -115,7 +115,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py
index b17176938d82..dbfa3683fa66 100644
--- a/topi/tests/python_cpp/test_topi_reduce.py
+++ b/topi/tests/python_cpp/test_topi_reduce.py
@@ -92,11 +92,11 @@ def check_device(device):
                 sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                 out_tvm_val = in_npy_map[sel_indices]
             if type == "argmax":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
             elif type == "argmin":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
-            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
+            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
     for device in ["cuda", "opencl", "metal", "llvm", "rocm"]:
         check_device(device)
 
diff --git a/topi/tests/python_cpp/test_topi_region.py b/topi/tests/python_cpp/test_topi_region.py
index a37cf6610a0f..28e984b70244 100644
--- a/topi/tests/python_cpp/test_topi_region.py
+++ b/topi/tests/python_cpp/test_topi_region.py
@@ -39,7 +39,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="region")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_relu.py b/topi/tests/python_cpp/test_topi_relu.py
index 6677c1bf5551..3b1b00ec8f67 100644
--- a/topi/tests/python_cpp/test_topi_relu.py
+++ b/topi/tests/python_cpp/test_topi_relu.py
@@ -28,7 +28,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="relu")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -48,7 +48,7 @@ def verify_leaky_relu(m, alpha):
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [A, B], device, name="leaky_relu")
     foo(a, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def verify_prelu(x, w, axis, weight_reshape):
     X = tvm.placeholder((x), name='X')
@@ -71,7 +71,7 @@ def _prelu_numpy(x, W):
     b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
-    np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
 
 def test_relu():
     for dtype in ['float32', 'float64', 'int32', 'int16', 'int8', 'int64']:
diff --git a/topi/tests/python_cpp/test_topi_reorg.py b/topi/tests/python_cpp/test_topi_reorg.py
index e5b8aa7f8b31..f7767967c699 100644
--- a/topi/tests/python_cpp/test_topi_reorg.py
+++ b/topi/tests/python_cpp/test_topi_reorg.py
@@ -39,7 +39,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="reorg")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_softmax.py b/topi/tests/python_cpp/test_topi_softmax.py
index 4d4ac387bccf..09f838ef57ec 100644
--- a/topi/tests/python_cpp/test_topi_softmax.py
+++ b/topi/tests/python_cpp/test_topi_softmax.py
@@ -32,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -66,7 +66,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["cuda", "opencl", "metal", "rocm"]:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_tensor.py b/topi/tests/python_cpp/test_topi_tensor.py
index 1a0a7c92db7e..762ee045e38a 100644
--- a/topi/tests/python_cpp/test_topi_tensor.py
+++ b/topi/tests/python_cpp/test_topi_tensor.py
@@ -30,7 +30,7 @@ def check_device(device):
         tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
         f(*tvm_nd)
         np_out = np.sum(np.array(np_nd), axis=0)
-        np.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
@@ -56,11 +56,11 @@ def check_device(device):
         out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         f = tvm.build(s1, [A, B], device, name="full_like")
         f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
         f = tvm.build(s2, [C], device, name="full")
         f(out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index 3f7bdbfdd499..492f1d94c341 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -23,7 +23,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -50,7 +50,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -76,7 +76,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -106,7 +106,7 @@ def check_device(device):
             out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -133,7 +133,7 @@ def check_device(device):
         data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -162,7 +162,7 @@ def check_device(device):
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -201,7 +201,7 @@ def check_device(device):
         indices_nd = tvm.nd.array(indices_src, ctx)
         out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
     for device in ["llvm", "opencl"]:
         check_device(device)
@@ -230,7 +230,7 @@ def check_device(device):
         tvm_out = tvm.nd.empty(x.shape, ctx=ctx, dtype=dtype)
         foo(tvm.nd.array(condition, ctx), tvm.nd.array(x, ctx),
             tvm.nd.array(y, ctx), tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_out)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -261,7 +261,7 @@ def check_device(device):
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys_split]
         foo(*(data_nds + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys_split):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -295,7 +295,7 @@ def check_device(device):
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
         for _ in range(1):
             foo(*(data_nds + [rhs_nd] + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
     for device in ["llvm", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_yolo.py b/topi/tests/python_cpp/test_topi_yolo.py
index ed234b7bd134..293de4fca087 100644
--- a/topi/tests/python_cpp/test_topi_yolo.py
+++ b/topi/tests/python_cpp/test_topi_yolo.py
@@ -36,7 +36,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="yolo")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
         check_device(device)
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index f2ee98a61f66..a09c7d51869e 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -211,7 +211,7 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
 c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
 func(a_tvm, w_tvm, c_tvm)
 
-np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
 
 # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
 # and the overhead of kernel launch. You can also use nvprof to validate the result.
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index 6673c0db9466..15b883dcbd73 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -305,4 +305,4 @@ def matmul(N, L, M, dtype):
 c_tvm = tvm.nd.empty(c_np.shape)
 func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
 
-np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
diff --git a/tutorials/get_started.py b/tutorials/get_started.py
index de94827ab1e9..022d087d4d9b 100644
--- a/tutorials/get_started.py
+++ b/tutorials/get_started.py
@@ -138,7 +138,7 @@
 b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
 c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Inspect the Generated Code
@@ -217,7 +217,7 @@
     fadd1_dev = tvm.module.load(temp.relpath("myadd.ptx"))
     fadd1.import_module(fadd1_dev)
 fadd1(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Pack Everything into One Library
@@ -231,7 +231,7 @@
 fadd.export_library(temp.relpath("myadd_pack.so"))
 fadd2 = tvm.module.load(temp.relpath("myadd_pack.so"))
 fadd2(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # .. note:: Runtime API and Thread-Safety
@@ -264,7 +264,7 @@
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd_cl(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Summary
diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py
index 298a2cc7dd8b..59efe5000f03 100644
--- a/tutorials/language/extern_op.py
+++ b/tutorials/language/extern_op.py
@@ -59,7 +59,7 @@
 d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
 bb = 10.0
 f(a, b, d, bb)
-np.testing.assert_allclose(
+tvm.testing.assert_allclose(
     d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
 
 ######################################################################
@@ -98,7 +98,7 @@ def my_tvm_addone(x, y):
 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
 b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
 f(a, b)
-np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
+tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
 
 ######################################################################
 # Summary
diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py
index 531283e15213..8be614b2f6ea 100644
--- a/tutorials/language/reduction.py
+++ b/tutorials/language/reduction.py
@@ -123,7 +123,7 @@
 a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
 b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
 fcuda(a, b)
-np.testing.assert_allclose(
+tvm.testing.assert_allclose(
     b.asnumpy(),  np.sum(a.asnumpy(), axis=1), rtol=1e-4)
 
 ######################################################################
diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py
index 6cdb0a0ff38e..8b8f848ffa13 100644
--- a/tutorials/language/scan.py
+++ b/tutorials/language/scan.py
@@ -72,7 +72,7 @@
 a = tvm.nd.array(a_np, ctx)
 b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
 fscan(a, b)
-np.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
+tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
 
 ######################################################################
 # Multi-Stage Scan Cell
diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
index 675306de064e..762068457e4b 100644
--- a/tutorials/language/tensorize.py
+++ b/tutorials/language/tensorize.py
@@ -163,7 +163,7 @@ def gemv_impl():
 b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
 c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
 func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
-np.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
 
 ######################################################################
 # We compare the tensorize version with that :code:`numpy.dot` produces,
@@ -270,7 +270,7 @@ def _reduce_update():
 b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
 c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
 func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
-np.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
 
 ######################################################################
 # Summary
diff --git a/tutorials/nnvm/using_external_lib.py b/tutorials/nnvm/using_external_lib.py
index fd00768b93be..272dcbb2b808 100644
--- a/tutorials/nnvm/using_external_lib.py
+++ b/tutorials/nnvm/using_external_lib.py
@@ -195,7 +195,7 @@
 # -----------------
 # We can check that the results of two runs match.
 
-np.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
+tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
 
 #####################################################################
 # Conclusion
diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 6a0a25228910..ecb8707d399b 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -93,7 +93,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
 print('Baseline: %f' % evaluator(a, b, c).mean)
@@ -128,7 +128,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,
 # we can see big speedup compared with the baseline.
@@ -164,7 +164,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt2: %f' % evaluator(a, b, c).mean)
@@ -197,7 +197,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt3: %f' % evaluator(a, b, c).mean)
@@ -252,7 +252,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt4: %f' % evaluator(a, b, c).mean)
@@ -298,7 +298,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt5: %f' % evaluator(a, b, c).mean)
@@ -341,7 +341,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=50)
 opt6_time = evaluator(a, b, c).mean
diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py
index 339a688dc1ed..c8ecbf848792 100644
--- a/tutorials/topi/intro_topi.py
+++ b/tutorials/topi/intro_topi.py
@@ -89,7 +89,7 @@
 b_nd = tvm.nd.array(b_np, ctx)
 g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
 func(a_nd, b_nd, g_nd)
-np.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)
+tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)
 
 ######################################################################
 # TOPI also provides common neural nets operations such as _softmax_ with optimized schedule
diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py
index 7201038b7be0..da867c9b8270 100644
--- a/vta/tests/python/integration/test_benchmark_gemm.py
+++ b/vta/tests/python/integration/test_benchmark_gemm.py
@@ -94,7 +94,7 @@ def verify(s, check_correctness=True):
                                                    env.BATCH,
                                                    env.BLOCK_OUT)
             if check_correctness:
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def run_schedule(load_inp,
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index 0661d292f4e5..4bc0a8844a4b 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -87,7 +87,7 @@ def verify(s, check_correctness):
                 padding = wl.hpad
                 res_ref = res_ref >> 8
                 res_ref = np.clip(res_ref, 0, 127).astype("int8")
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def conv_normal(print_ir):
@@ -219,7 +219,7 @@ def verify(s, check_correctness):
                 res_ref = res_ref >> 8
                 res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                 res_ref = np.clip(res_ref, 0, 127).astype("int8")
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def conv_normal(print_ir):
diff --git a/vta/tutorials/convolution_opt.py b/vta/tutorials/convolution_opt.py
index 8e4b77d8b491..f4d3997b0146 100644
--- a/vta/tutorials/convolution_opt.py
+++ b/vta/tutorials/convolution_opt.py
@@ -413,7 +413,7 @@
                            env.BLOCK_OUT,
                            fout_height,
                            fout_width)).transpose((0, 2, 4, 5, 1, 3))
-np.testing.assert_allclose(res_ref, res_nd.asnumpy())
+tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
 print("Successful 2D convolution test!")
 
 ######################################################################

From 66e4af5e8038cb4a7620ce24d0cedb6db14e8034 Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Sun, 21 Oct 2018 14:08:11 +0900
Subject: [PATCH 256/529] [Frontend][MXNet] ones zeros ones_like zeros_like ops
 support (#1814)

---
 nnvm/python/nnvm/frontend/mxnet.py            | 19 +++++++++++---
 .../python/frontend/mxnet/test_forward.py     | 26 +++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index f0217fc1ec85..87b169a1cfbc 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -273,6 +273,14 @@ def _lrn(inputs, attrs):
     new_attrs['size'] = _required_attr(attrs, 'nsize')
     return _get_nnvm_op(op_name)(*inputs, **new_attrs)
 
+def _ones(_, attrs):
+    op_name = "ones"
+    return _get_nnvm_op(op_name)(**attrs)
+
+def _zeros(_, attrs):
+    op_name = "zeros"
+    return _get_nnvm_op(op_name)(**attrs)
+
 _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__',
                   '__div_symbol__', '__mul_scalar__', '__mul_symbol__',
                   '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__',
@@ -281,8 +289,8 @@ def _lrn(inputs, attrs):
                   'broadcast_sub', 'broadcast_to', 'cast', 'elemwise_add',
                   'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp',
                   'flatten', 'log', 'log_softmax', 'max', 'min', 'negative',
-                  'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh',
-                  'transpose']
+                  'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax',
+                  'sum', 'tanh', 'transpose', 'zeros_like']
 
 _convert_map = {
     '_copy'         : _rename('copy'),
@@ -294,6 +302,8 @@ def _lrn(inputs, attrs):
     '_rminus_scalar': _rename('__rsub_scalar__'),
     '_contrib_MultiBoxPrior' : _rename('multibox_prior'),
     '_contrib_MultiBoxDetection' : _contrib_multibox_detection,
+    '_ones' : _ones,
+    '_zeros' : _zeros,
     'Activation'    : _activations,
     'BatchNorm'     : _batch_norm,
     'BatchNorm_v1'  : _batch_norm,
@@ -397,13 +407,14 @@ def _from_mxnet_impl(symbol, graph):
     if node:
         return node[output_index]
     attr = symbol.list_attr()
-    # op_name = symbol.attr('op_name')
+    op_name = symbol.attr('op_name')
     childs = symbol.get_children()
     if childs is not None:
-        op_name = symbol.attr('op_name')
         childs = [_from_mxnet_impl(childs[i], graph) for i in range(len(childs.list_outputs()))]
         childs = [x for y in childs for x in _as_list(y)]  # expand group symbol
         node = _convert_symbol(op_name, childs, attr)
+    elif op_name != 'null':
+        node = _convert_symbol(op_name, [], attr)   # no input symbol
     else:
         op_name = json.loads(symbol.tojson())['nodes'][0]['op']
         node = _sym.Variable(name=name, **attr)
diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py
index 653af1a63154..dbd93e710491 100644
--- a/nnvm/tests/python/frontend/mxnet/test_forward.py
+++ b/nnvm/tests/python/frontend/mxnet/test_forward.py
@@ -153,6 +153,28 @@ def test_forward_lrn():
     mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
     verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
 
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+    
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+    
 if __name__ == '__main__':
     test_forward_mlp()
     test_forward_vgg()
@@ -168,3 +190,7 @@ def test_forward_lrn():
     test_forward_expand_dims()
     test_forward_pooling()
     test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()

From 6bc4f87fbe75cf82c8c097006b7f028769c6428e Mon Sep 17 00:00:00 2001
From: Gaoxiong <xiong.gao@huawei.com>
Date: Sun, 21 Oct 2018 13:43:36 +0800
Subject: [PATCH 257/529] Fix non-zero extent of access_ptr out of range
 (#1937) (#1939)

---
 src/lang/buffer.cc                        |  4 ++--
 tests/python/unittest/test_lang_buffer.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc
index 183a52f785bd..524cad2eeac6 100644
--- a/src/lang/buffer.cc
+++ b/src/lang/buffer.cc
@@ -357,9 +357,9 @@ Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, Expr
   } else if (self->strides.size() == self->shape.size()) {
     int highest_dim = 0;
     extent = arith::ComputeExpr<ir::Mul>(
-        self->strides[highest_dim], self->shape[highest_dim]);
+        self->strides[highest_dim], self->shape[highest_dim]) - offset;
   } else {
-    extent = arith::ComputeReduce<ir::Mul>(self->shape, Expr());
+    extent = arith::ComputeReduce<ir::Mul>(self->shape, Expr()) - offset;
   }
   Expr elem_offset = self->elem_offset + offset;
   if (content_lanes > 1) {
diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py
index 51f1e3abb7e9..85c9fbeee53e 100644
--- a/tests/python/unittest/test_lang_buffer.py
+++ b/tests/python/unittest/test_lang_buffer.py
@@ -41,6 +41,18 @@ def test_buffer_access_ptr_offset():
     assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v))
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
 
+def test_buffer_access_ptr_extent():
+    m = tvm.var('m')
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((m, n), tvm.float32)
+    aptr = Ab.access_ptr("rw")
+    assert tvm.ir_pass.Equal(aptr.args[3], m * n)
+    aptr = Ab.access_ptr("rw", offset=100)
+    assert tvm.ir_pass.Equal(aptr.args[3], m * n - 100)
+    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1 , 1])
+    aptr = Ab.access_ptr("rw", offset=100)
+    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m - 100)
+
 def test_buffer_vload():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -84,5 +96,6 @@ def assert_simplified_equal(index_simplified, index_direct):
     test_buffer()
     test_buffer_access_ptr()
     test_buffer_access_ptr_offset()
+    test_buffer_access_ptr_extent()
     test_buffer_vload()
     test_buffer_index_merge_mult_mod()

From 125f4551a903be5728743912fe66d265e33110b5 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sun, 21 Oct 2018 22:16:32 +0530
Subject: [PATCH 258/529] [RELAY]Reduce ops sum/max/min/mean/prod (#1927)

---
 docs/langref/relay_op.rst            |  10 ++
 python/tvm/relay/op/reduce.py        | 152 ++++++++++++++++++++++++++-
 src/relay/op/tensor/reduce.cc        | 116 +++++++++++++++++++-
 tests/python/relay/test_op_level4.py |  59 +++++++----
 4 files changed, 308 insertions(+), 29 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 56558272f2a3..a36f8e6c71cf 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -108,6 +108,11 @@ This level enables additional math and transform operators.
    tvm.relay.where
    tvm.relay.argmax
    tvm.relay.argmin
+   tvm.relay.sum
+   tvm.relay.max
+   tvm.relay.min
+   tvm.relay.mean
+   tvm.relay.prod
 
 
 **Level 5: Vision/Image Operators**
@@ -187,6 +192,11 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.where
 .. autofunction:: tvm.relay.argmax
 .. autofunction:: tvm.relay.argmin
+.. autofunction:: tvm.relay.sum
+.. autofunction:: tvm.relay.max
+.. autofunction:: tvm.relay.min
+.. autofunction:: tvm.relay.mean
+.. autofunction:: tvm.relay.prod
 
 
 Level 5 Definitions
diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
index a2a4519512ea..73c5f270e8bf 100644
--- a/python/tvm/relay/op/reduce.py
+++ b/python/tvm/relay/op/reduce.py
@@ -30,7 +30,6 @@ def argmax(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
-
     return _make.argmax(data, axis, keepdims, exclude)
 
 def argmin(data, axis=None, keepdims=False, exclude=False):
@@ -60,5 +59,154 @@ def argmin(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
-
     return _make.argmin(data, axis, keepdims, exclude)
+
+
+def sum(data, axis=None, keepdims=False, exclude=False):
+    """Computes the sum of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.sum(data, axis, keepdims, exclude)
+
+
+def max(data, axis=None, keepdims=False, exclude=False):
+    """ Computes the max of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.max(data, axis, keepdims, exclude)
+
+
+def min(data, axis=None, keepdims=False, exclude=False):
+    """Computes the min of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.min(data, axis, keepdims, exclude)
+
+
+def mean(data, axis=None, keepdims=False, exclude=False):
+    """Computes the mean of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.mean(data, axis, keepdims, exclude)
+
+
+def prod(data, axis=None, keepdims=False, exclude=False):
+    """Computes the products of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.prod(data, axis, keepdims, exclude)
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index 017ef1e5dfec..0a955fad631b 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -7,6 +7,7 @@
 #include <tvm/relay/op.h>
 #include <numeric>
 #include <limits>
+#include "../op_common.h"
 #include "../type_relations.h"
 
 namespace tvm {
@@ -19,7 +20,7 @@ struct ReduceAttrs : public tvm::AttrsNode<ReduceAttrs> {
   bool exclude;
 
   TVM_DECLARE_ATTRS(ReduceAttrs, "relay.attrs.ReduceAttrs") {
-    TVM_ATTR_FIELD(axis).set_default(Array<IndexExpr>({}))
+    TVM_ATTR_FIELD(axis).set_default(NullValue<Array<IndexExpr>>())
         .describe(R"code(The axis or axes along which to perform the reduction.
 
       The default, `axis=()`, will compute over all elements into a
@@ -158,10 +159,7 @@ bool ArgReduceRel(const Array<Type>& types,
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;
   CHECK(static_cast<int>(data->shape.size()) != 0);
-  std::vector<IndexExpr> in_shape;
-  for (auto i : data->shape) {
-    in_shape.push_back(i);
-  }
+  std::vector<IndexExpr>&& in_shape = AsVector(data->shape);
 
   const ReduceAttrs* param = attrs.as<ReduceAttrs>();
   CHECK(param != nullptr);
@@ -172,6 +170,31 @@ bool ArgReduceRel(const Array<Type>& types,
   return true;
 }
 
+/*!
+* \brief ReduceRel Output type and shape relation evaluation function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return false if This relation cannot be resolved. true if this relation has been resolved.
+*/
+bool ReduceRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+  std::vector<IndexExpr>&& in_shape = AsVector(data->shape);
+
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+
+  // assign output type and shape
+  auto oshape = ReduceShapeImpl(in_shape, param, reporter);
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
 
 #define RELAY_REGISTER_REDUCE_OP(OpName)                           \
   TVM_REGISTER_API("relay.op._make." OpName)                       \
@@ -213,5 +236,88 @@ values over a given axis.
 .set_support_level(4)
 .add_type_rel("ArgReduce", ArgReduceRel);
 
+
+RELAY_REGISTER_REDUCE_OP("sum")
+.describe(R"code(Computes the sum of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  sum(data, axis=1)
+  [[  4.   8.]
+   [ 10.   9.]
+   [ 21.   6.]]
+
+  sum(data, axis=[1,2])
+  [ 12.  19.  27.]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel);
+
+
+RELAY_REGISTER_REDUCE_OP("max")
+.describe(R"code(Computes the max of array elements over given axes.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel);
+
+
+RELAY_REGISTER_REDUCE_OP("min")
+.describe(R"code(Computes the min of array elements over given axes.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel);
+
+
+RELAY_REGISTER_REDUCE_OP("mean")
+.describe(R"code(Computes the mean of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data)
+  [3.22]
+
+  mean(data, axis=[1,2])
+  [ 2.  3.16666667  4.5]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel);
+
+
+RELAY_REGISTER_REDUCE_OP("prod")
+.describe(R"code(Computes the products of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data, axis=1)
+  [35562240]
+
+  mean(data, axis=[1,2])
+  [ 36  480  2058]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index c2b685affab4..2dc643cfd7e4 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -46,27 +46,6 @@ def test_binary_int_broadcast():
         assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
 
 
-def test_arg_reduce():
-    for op in [relay.argmax, relay.argmin]:
-        n, c , h, w = 10, 20, 3, 4
-        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
-        z = relay.argmax(x, axis=(1,))
-        "axis="  in z.astext()
-        zz = relay.ir_pass.infer_type(z)
-        assert zz.checked_type == relay.ty.TensorType((n, h, w), "int32")
-        n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
-        z = relay.argmax(x, axis=(2,), keepdims=True)
-        zz = relay.ir_pass.infer_type(z)
-        assert zz.checked_type == relay.ty.TensorType((n, c , 1, w), "int32")
-
-        n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-        x = relay.var("x", relay.ty.TensorType((n, c , h, w), "float32"))
-        z = relay.argmax(x, axis=(2,), keepdims=True, exclude=True)
-        zz = relay.ir_pass.infer_type(z)
-        assert zz.checked_type == relay.ty.TensorType((1, 1 , h, 1), "int32")
-
-
 def test_where():
     cond = relay.var("cond", relay.TensorType((3, 4), "float32"))
     x = relay.var("x", relay.TensorType((3, 4), "float32"))
@@ -76,9 +55,45 @@ def test_where():
     assert zz.checked_type == relay.TensorType((3, 4), "float32")
 
 
+def verify_reduce(test_func, data, axis, keepdims, exclude, output):
+    x = relay.var("x", relay.TensorType(data, "float32"))
+    z = test_func(x, axis, keepdims, exclude)
+    zz = relay.ir_pass.infer_type(z)
+    if axis:
+        assert "axis=" in z.astext()
+    if keepdims:
+        assert "keepdims=" in z.astext()
+    if exclude:
+        assert "exclude=" in z.astext()
+    out_type = "int32" if test_func in [relay.argmin, relay.argmax] else "float32"
+    assert zz.checked_type == relay.ty.TensorType(output, out_type)
+
+def test_reduce_functions():
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    for func in [relay.sum,
+                 relay.max,
+                 relay.min,
+                 relay.mean,
+                 relay.prod,
+                 relay.argmin,
+                 relay.argmax]:
+        verify_reduce(func, (d1, d2, d3, d4), (2,), True, False, (d1, d2, 1, d4))
+        verify_reduce(func, (d1, d2, d3), (1,), True, False, (d1, 1, d3))
+        verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1))
+        verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3))
+        verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4))
+        verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ())
+        verify_reduce(func, (4, 4, 3), None, True, False, (1, 1, 1))
+        verify_reduce(func, (4, 4, 3), None, False, True, ())
+        verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,))
+        verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,))
+        verify_reduce(func, (128, 24, 128), (0, 2), False, False, (24,))
+        verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128))
+        verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1))
+
 if __name__ == "__main__":
     test_binary_op()
     test_cmp_type()
     test_binary_int_broadcast()
     test_where()
-    test_arg_reduce()
+    test_reduce_functions()

From f3a11d380534eee5d1c265d7b1bbcd2bc716daaf Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Sun, 21 Oct 2018 09:47:36 -0700
Subject: [PATCH 259/529] up (#1940)

---
 src/pass/inject_virtual_thread.cc | 3 ++-
 src/pass/vectorize_loop.cc        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc
index 833513756053..f1aed09d47da 100644
--- a/src/pass/inject_virtual_thread.cc
+++ b/src/pass/inject_virtual_thread.cc
@@ -430,7 +430,8 @@ class VTInjector : public IRMutator {
     } else {
       // insert a for loop
       Var idx(var_->name_hint + ".s", var_->type);
-      stmt = Substitute(stmt, {{var_, idx}});
+      Map<Var, Expr> values{{var_, idx}};
+      stmt = Substitute(stmt, values);
       return For::make(idx, make_zero(idx.type()),
                        make_const(idx.type(), num_threads_),
                        ForType::Serial, DeviceAPI::None, stmt);
diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc
index fe2f819809fd..19874a803657 100644
--- a/src/pass/vectorize_loop.cc
+++ b/src/pass/vectorize_loop.cc
@@ -355,7 +355,8 @@ class Vectorizer : public IRMutator {
   // scalarize the statment
   Stmt Scalarize(Stmt stmt) {
     Var idx(var_->name_hint + ".s", var_->type);
-    stmt = Substitute(stmt, {{var_, idx}});
+    Map<Var, Expr> values{{var_, idx}};
+    stmt = Substitute(stmt, values);
     return For::make(idx, 0, var_lanes_, ForType::Serial, DeviceAPI::None, stmt);
   }
 

From 7a641cf8c8ac02f1a5247728272d78ccca43e8cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sun, 21 Oct 2018 10:30:04 -0700
Subject: [PATCH 260/529] [Relay] fix format in ty.py (#1948)

---
 python/tvm/relay/ty.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 824b0f20e281..0835ca3c903b 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -163,17 +163,17 @@ class TypeRelation(TypeConstraint):
 
     Parameters
     ----------
-    func : EnvFunc
+    func: EnvFunc
         User defined relation function.
 
-    args : list of types
+    args: list of types
         List of types to the func.
 
     num_inputs: int
         Number of input arguments in args,
         this act as a hint for type inference.
 
-    attrs : Attrs
+    attrs: Attrs
         The attribute attached to the relation information
     """
     def __init__(self, func, args, num_inputs, attrs):
@@ -188,7 +188,7 @@ def scalar_type(dtype):
 
     Parameters
     ----------
-    dtype : str
+    dtype: str
         The content data type.
 
     Returns

From cc597ce9442c8ed7a829b97d5f5d0188eb379eeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sun, 21 Oct 2018 17:33:29 -0700
Subject: [PATCH 261/529] [Relay] fix doc in ty.py (#1949)

---
 python/tvm/relay/ty.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 0835ca3c903b..088f076abb75 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -102,7 +102,7 @@ def __init__(self, fields):
 
         Parameters
         ----------
-        fields: list of tvm.Type
+        fields: List[tvm.relay.Type]
 
         Returns
         -------

From dc1ed30a14a203a4ff26e95ab49d1361be45a037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Mon, 22 Oct 2018 09:31:59 -0700
Subject: [PATCH 262/529] [Relay][Op]BroadcastToLike CollapseSumLike (#1886)

---
 docs/langref/relay_op.rst                   | 18 ++++++
 python/tvm/relay/op/transform.py            | 38 +++++++++++++
 src/relay/op/tensor/transform.cc            | 61 +++++++++++++++++++++
 tests/python/relay/test_op_level10.py       | 23 ++++++++
 tests/python/relay/test_pass_alpha_equal.py |  1 +
 5 files changed, 141 insertions(+)
 create mode 100644 tests/python/relay/test_op_level10.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index a36f8e6c71cf..6eba6b25d9fd 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -123,6 +123,17 @@ This level enables additional math and transform operators.
    tvm.relay.image.resize
 
 
+**Level 10: Temporary Operators**
+
+This level support backpropagation of broadcast operators. It is temporary.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.broadcast_to_like
+   tvm.relay.collapse_sum_like
+
+
 Level 1 Definitions
 -------------------
 .. autofunction:: tvm.relay.log
@@ -199,6 +210,13 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.prod
 
 
+
 Level 5 Definitions
 -------------------
 .. autofunction:: tvm.relay.image.resize
+
+
+Level 10 Definitions
+--------------------
+.. autofunction:: tvm.relay.broadcast_to_like
+.. autofunction:: tvm.relay.collapse_sum_like
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index c2036f509133..84e2398f0a9e 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -242,3 +242,41 @@ def where(condition, x, y):
     Note that the shape of condition, x, and y needs to be the same.
     """
     return _make.where(condition, x, y)
+
+
+def broadcast_to_like(data, broadcast_type):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    broadcast_type : relay.Expr
+        Provide the type to broadcast to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.broadcast_to_like(data, broadcast_type)
+
+
+def collapse_sum_like(data, collapse_type):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    collapse_type : relay.Expr
+        Provide the type to collapse to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.collapse_sum_like(data, collapse_type)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 61ee2778d0a2..e3c8bcef217e 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -718,5 +718,66 @@ RELAY_REGISTER_OP("squeeze")
 .set_support_level(3)
 .add_type_rel("Squeeze", SqueezeRel);
 
+// Have no idea how to assert the constraint.
+// CollapseSumLike: <A, B> -> B where BroadCast(A, B) = A
+bool CollapseSumLikeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  reporter->Assign(types[2], types[1]);
+  return true;
+}
+
+Expr MakeCollapseSumLike(Expr data,
+                         Expr collapse_type) {
+  static const Op& op = Op::Get("collapse_sum_like");
+  return CallNode::make(op, {data, collapse_type}, Attrs(), {});
+}
+
+TVM_REGISTER_API("relay.op._make.collapse_sum_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeCollapseSumLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("collapse_sum_like")
+.describe(R"code(Collapse the first input to match the shape of the second input.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("collapse_type", "Tensor", "Provide the type to collapse to.")
+.set_support_level(10)
+.add_type_rel("CollapseSumLike", CollapseSumLikeRel);
+
+// BroadCastToLike: <A, B> -> B where BroadCast(A, B) = B
+bool BroadCastToLikeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  reporter->Assign(types[2], types[1]);
+  return true;
+}
+
+Expr MakeBroadCastToLike(Expr data,
+                         Expr broadcast_type) {
+  static const Op& op = Op::Get("broadcast_to_like");
+  return CallNode::make(op, {data, broadcast_type}, Attrs(), {});
+}
+
+TVM_REGISTER_API("relay.op._make.broadcast_to_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeBroadCastToLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("broadcast_to_like")
+.describe(R"code(Broadcast the first input to match the shape of the second input.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("broadcast_type", "Tensor", "Provide the type to broadcast to.")
+.set_support_level(10)
+.add_type_rel("BroadCastToLike", BroadCastToLikeRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
new file mode 100644
index 000000000000..9486d029876d
--- /dev/null
+++ b/tests/python/relay/test_op_level10.py
@@ -0,0 +1,23 @@
+""" Support level10 operator test cases.
+"""
+import tvm
+from tvm import relay
+
+def test_collapse_sum_like():
+    x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8"))
+    y = relay.Var("y", relay.ty.TensorType((4, 1, 6), "int8"))
+    z = relay.collapse_sum_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType((4, 1, 6), "int8")
+
+
+def test_broadcast_to_like():
+    x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8"))
+    y = relay.Var("y", relay.ty.TensorType((4, 1, 6), "int8"))
+    z = relay.broadcast_to_like(y, x)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType((3, 4, 5, 6), "int8")
+
+if __name__ == "__main__":
+    test_collapse_sum_like()
+    test_broadcast_to_like()
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 7b27cb7ee2d4..de4df7c84b9f 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -461,3 +461,4 @@ def test_op_alpha_equal():
     test_let_alpha_equal()
     test_if_alpha_equal()
     test_op_alpha_equal()
+    test_var_alpha_equal()

From 13f8d2f2a6b91564936edff494eb9946e88f2985 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Mon, 22 Oct 2018 21:55:16 -0700
Subject: [PATCH 263/529] Add tophub for x86 (#1955)

---
 python/tvm/autotvm/tophub.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 41e2b4c4683c..7798d5522036 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -21,6 +21,7 @@
 # the version of each package
 PACKAGE_VERSION = {
     'arm_cpu': "v0.03",
+    'llvm':    "v0.01",
 
     'cuda':    "v0.03",
     'rocm':    "v0.01",

From 14f917dafc185490f7905632f21c09e72594e67a Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Mon, 22 Oct 2018 21:55:31 -0700
Subject: [PATCH 264/529] [AUTOTVM] Fix measurement for CPU (#1956)

---
 python/tvm/autotvm/measure/measure_methods.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index ec3eb7e611e0..975faf71b5a0 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -460,7 +460,11 @@ def run_through_rpc(measure_input, build_result,
         if ref_input:
             args = [nd.array(x, ctx=ctx) for x in ref_input]
         else:
+            # create empty arrays on the remote device and copy them once.
+            # This can avoid some memory issues that make the measurment results unreliable.
             args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
+            args = [nd.array(x, ctx=ctx) for x in args]
+            ctx.sync()
 
         costs = time_f(*args).results
         if len(costs) > 2:  # remove largest and smallest value to reduce variance

From 1e5afe87ba5efd93124eb1e0456e68198900da49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Mon, 22 Oct 2018 22:33:37 -0700
Subject: [PATCH 265/529] [Relay] Fix format (#1957)

* save

* fix format
---
 src/relay/ir/base.cc          |  6 +++---
 src/relay/ir/environment.cc   |  4 ++--
 src/relay/ir/expr.cc          | 34 +++++++++++++++++-----------------
 src/relay/ir/type.cc          | 28 ++++++++++++++--------------
 src/relay/pass/alpha_eq.cc    | 11 ++++++++---
 src/relay/pass/kind_check.cc  |  6 +++---
 src/relay/pass/type_infer.cc  |  2 +-
 src/relay/pass/util.cc        | 14 +++++++-------
 src/relay/pass/well_formed.cc | 12 ++++++------
 9 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
index 4e71444bf1ae..a68910e56b71 100644
--- a/src/relay/ir/base.cc
+++ b/src/relay/ir/base.cc
@@ -33,7 +33,7 @@ SourceName SourceName::Get(const std::string& name) {
 }
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<SourceNameNode>([](const SourceNameNode *node, tvm::IRPrinter *p) {
+.set_dispatch<SourceNameNode>([](const SourceNameNode* node, tvm::IRPrinter* p) {
     p->stream << "SourceName(" << node->name << ", " << node << ")";
   });
 
@@ -54,12 +54,12 @@ Span SpanNode::make(SourceName source, int lineno, int col_offset) {
 TVM_REGISTER_NODE_TYPE(SpanNode);
 
 TVM_REGISTER_API("relay._make.Span")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = SpanNode::make(args[0], args[1], args[2]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<SpanNode>([](const SpanNode *node, tvm::IRPrinter *p) {
+.set_dispatch<SpanNode>([](const SpanNode* node, tvm::IRPrinter* p) {
     p->stream << "SpanNode(" << node->source << ", " << node->lineno << ", "
               << node->col_offset << ")";
   });
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index 6dfaa0b24a53..dddad82c8afc 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -73,12 +73,12 @@ Function EnvironmentNode::Lookup(const GlobalVar& var) {
   return (*it).second;
 }
 
-Function EnvironmentNode::Lookup(const std::string &name) {
+Function EnvironmentNode::Lookup(const std::string& name) {
   GlobalVar id = this->GetGlobalVar(name);
   return this->Lookup(id);
 }
 
-void EnvironmentNode::Update(const Environment &env) {
+void EnvironmentNode::Update(const Environment& env) {
   for (auto pair : env->functions) {
     this->Update(pair.first, pair.second);
   }
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 2d373b769559..c75c414c8ce9 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -20,12 +20,12 @@ Constant ConstantNode::make(runtime::NDArray data) {
 TVM_REGISTER_NODE_TYPE(ConstantNode);
 
 TVM_REGISTER_API("relay._make.Constant")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = ConstantNode::make(args[0]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<ConstantNode>([](const ConstantNode *node, tvm::IRPrinter *p) {
+.set_dispatch<ConstantNode>([](const ConstantNode* node, tvm::IRPrinter* p) {
     p->stream << "Constant(TODO)";
   });
 
@@ -49,12 +49,12 @@ Tuple TupleNode::make(tvm::Array<relay::Expr> fields) {
 TVM_REGISTER_NODE_TYPE(TupleNode);
 
 TVM_REGISTER_API("relay._make.Tuple")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = TupleNode::make(args[0]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TupleNode>([](const TupleNode *node, tvm::IRPrinter *p) {
+.set_dispatch<TupleNode>([](const TupleNode* node, tvm::IRPrinter* p) {
     p->stream << "Tuple(" << node->fields << ")";
   });
 
@@ -68,12 +68,12 @@ Var VarNode::make(std::string name_hint, Type type_annotation) {
 TVM_REGISTER_NODE_TYPE(VarNode);
 
 TVM_REGISTER_API("relay._make.Var")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = VarNode::make(args[0], args[1]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<VarNode>([](const VarNode *node, tvm::IRPrinter *p) {
+.set_dispatch<VarNode>([](const VarNode* node, tvm::IRPrinter* p) {
     p->stream << "Var(" << node->name_hint;
     if (node->type_annotation.defined()) {
       p->stream << ", ty=";
@@ -91,12 +91,12 @@ GlobalVar GlobalVarNode::make(std::string name_hint) {
 TVM_REGISTER_NODE_TYPE(GlobalVarNode);
 
 TVM_REGISTER_API("relay._make.GlobalVar")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = GlobalVarNode::make(args[0]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<GlobalVarNode>([](const GlobalVarNode *node, tvm::IRPrinter *p) {
+.set_dispatch<GlobalVarNode>([](const GlobalVarNode* node, tvm::IRPrinter* p) {
     p->stream << "GlobalVar(" << node->name_hint << ")";
   });
 
@@ -124,13 +124,13 @@ FuncType FunctionNode::func_type_annotation() const {
 TVM_REGISTER_NODE_TYPE(FunctionNode);
 
 TVM_REGISTER_API("relay._make.Function")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
   *ret = FunctionNode::make(args[0], args[1], args[2], args[3]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<FunctionNode>([](const FunctionNode *node,
-                                   tvm::IRPrinter *p) {
+.set_dispatch<FunctionNode>([](const FunctionNode* node,
+                                   tvm::IRPrinter* p) {
       p->stream << "FunctionNode(" << node->params << ", " << node->ret_type
                 << ", " << node->body << ", " << node->type_params << ")";
 });
@@ -148,12 +148,12 @@ Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
 TVM_REGISTER_NODE_TYPE(CallNode);
 
 TVM_REGISTER_API("relay._make.Call")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
   *ret = CallNode::make(args[0], args[1], args[2], args[3]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<CallNode>([](const CallNode *node, tvm::IRPrinter *p) {
+.set_dispatch<CallNode>([](const CallNode* node, tvm::IRPrinter* p) {
   p->stream << "CallNode(" << node->op << ", " << node->args << ", "
     << node->attrs << ", " << node->type_args << ")";
 });
@@ -169,12 +169,12 @@ Let LetNode::make(Var var, Expr value, Expr body) {
 TVM_REGISTER_NODE_TYPE(LetNode);
 
 TVM_REGISTER_API("relay._make.Let")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = LetNode::make(args[0], args[1], args[2]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<LetNode>([](const LetNode *node, tvm::IRPrinter *p) {
+.set_dispatch<LetNode>([](const LetNode* node, tvm::IRPrinter* p) {
   p->stream << "LetNode(" << node->var << ", " << node->value
             << ", " << node->body << ")";
 });
@@ -189,12 +189,12 @@ If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
 
 TVM_REGISTER_NODE_TYPE(IfNode);
 
-TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue *ret) {
+TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue* ret) {
   *ret = IfNode::make(args[0], args[1], args[2]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<IfNode>([](const IfNode *node, tvm::IRPrinter *p) {
+.set_dispatch<IfNode>([](const IfNode* node, tvm::IRPrinter* p) {
   p->stream << "IfNode(" << node->cond << ", " << node->true_branch
             << ", " << node->false_branch << ")";
 });
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index 39347adced92..d6fc2e85b2d8 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -25,14 +25,14 @@ TensorType TensorTypeNode::Scalar(DataType dtype) {
 TVM_REGISTER_NODE_TYPE(TensorTypeNode);
 
 TVM_REGISTER_API("relay._make.TensorType")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
   Array<IndexExpr> shape = args[0];
   *ret = TensorTypeNode::make(shape, args[1]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TensorTypeNode>([](const TensorTypeNode *node,
-                                 tvm::IRPrinter *p) {
+.set_dispatch<TensorTypeNode>([](const TensorTypeNode* node,
+                                 tvm::IRPrinter* p) {
   p->stream << "TensorType(" << node->shape << ", " << node->dtype << ")";
 });
 
@@ -46,15 +46,15 @@ TypeVar TypeVarNode::make(std::string name, TypeVarNode::Kind kind) {
 TVM_REGISTER_NODE_TYPE(TypeVarNode);
 
 TVM_REGISTER_API("relay._make.TypeVar")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
   int kind = args[1];
   *ret =
     TypeVarNode::make(args[0], static_cast<TypeVarNode::Kind>(kind));
     });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TypeVarNode>([](const TypeVarNode *node,
-                                    tvm::IRPrinter *p) {
+.set_dispatch<TypeVarNode>([](const TypeVarNode* node,
+                                    tvm::IRPrinter* p) {
   p->stream << "TypeVarNode(" << node->var->name_hint << ", "
     << node->kind << ")";
 });
@@ -95,13 +95,13 @@ FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
 TVM_REGISTER_NODE_TYPE(FuncTypeNode);
 
 TVM_REGISTER_API("relay._make.FuncType")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
   *ret = FuncTypeNode::make(args[0], args[1], args[2], args[3]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<FuncTypeNode>([](const FuncTypeNode *node,
-                                   tvm::IRPrinter *p) {
+.set_dispatch<FuncTypeNode>([](const FuncTypeNode* node,
+                                   tvm::IRPrinter* p) {
   p->stream << "FuncTypeNode(" << node->type_params << ", "
             << node->arg_types << ", " << node->ret_type << ", "
             << node->type_constraints << ")";
@@ -122,12 +122,12 @@ TypeRelation TypeRelationNode::make(TypeRelationFn func,
 TVM_REGISTER_NODE_TYPE(TypeRelationNode);
 
 TVM_REGISTER_API("relay._make.TypeRelation")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = TypeRelationNode::make(args[0], args[1], args[2], args[3]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TypeRelationNode>([](const TypeRelationNode *node, tvm::IRPrinter *p) {
+.set_dispatch<TypeRelationNode>([](const TypeRelationNode* node, tvm::IRPrinter* p) {
     p->stream << "TypeRelationNode("
               << node->func->name
               << ", " << node->args << ")";
@@ -142,13 +142,13 @@ TupleType TupleTypeNode::make(Array<Type> fields) {
 TVM_REGISTER_NODE_TYPE(TupleTypeNode);
 
 TVM_REGISTER_API("relay._make.TupleType")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
+.set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = TupleTypeNode::make(args[0]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<TupleTypeNode>([](const TupleTypeNode *node,
-                                    tvm::IRPrinter *p) {
+.set_dispatch<TupleTypeNode>([](const TupleTypeNode* node,
+                                tvm::IRPrinter* p) {
   p->stream << "TupleTypeNode(" << node->fields << ")";
 });
 
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
index 56aeefda78f1..41ec3f1e090b 100644
--- a/src/relay/pass/alpha_eq.cc
+++ b/src/relay/pass/alpha_eq.cc
@@ -193,11 +193,13 @@ struct TypeAlphaEq : TypeVisitor<const Type&> {
 };
 
 bool AlphaEqual(const Type& t1, const Type& t2) {
-  if (t1.defined() != t2.defined())
+  if (t1.defined() != t2.defined()) {
     return false;
+  }
 
-  if (!t1.defined())
+  if (!t1.defined()) {
     return true;
+  }
 
   TypeAlphaEq aeq;
   aeq.VisitType(t1, t2);
@@ -273,7 +275,10 @@ struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
       for (size_t i = 0; i < func1->params.size(); ++i) {
         MergeVarDecl(func1->params[i], func2->params[i]);
       }
-      if (!equal) return;
+
+      if (!equal) {
+        return;
+      }
 
       for (size_t i = 0U; i < func1->type_params.size(); i++) {
         equal = equal && AlphaEqual(func1->type_params[i], func2->type_params[i]);
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 3f4d81b7e24f..8fd77a71ec4b 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -29,11 +29,11 @@ struct KindChecker : TypeVisitor<> {
 
   // checks if t is an incomplete node of kind k or a type param of kind k
   bool MatchKind(const Type& t, Kind k) {
-    if (const IncompleteTypeNode *tv = t.as<IncompleteTypeNode>()) {
+    if (const IncompleteTypeNode* tv = t.as<IncompleteTypeNode>()) {
       return tv->kind == k;
     }
 
-    if (const TypeVarNode *tp = t.as<TypeVarNode>()) {
+    if (const TypeVarNode* tp = t.as<TypeVarNode>()) {
       return tp->kind == k;
     }
 
@@ -93,7 +93,7 @@ struct KindChecker : TypeVisitor<> {
     }
   }
 
-  bool Check(const Type &t) {
+  bool Check(const Type& t) {
     this->VisitType(t);
     return valid;
   }
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 3e233274af2e..0cbce833aed9 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -379,7 +379,7 @@ class TypeInferencer::Resolver : public ExprMutator {
     return new_e;
   }
 
-  Type VisitType(const Type &t) final {
+  Type VisitType(const Type& t) final {
     return solver_->Resolve(t);
   }
 
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index 8ebac921203f..ff4bb55b7b3c 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -14,10 +14,10 @@ namespace relay {
 
 class FreeVar;
 class FreeTypeVar : private TypeVisitor<> {
-  std::unordered_set<TypeVar, NodeHash, NodeEqual> * free_vars;
-  std::unordered_set<TypeVar, NodeHash, NodeEqual> * bound_vars;
-  FreeTypeVar(std::unordered_set<TypeVar, NodeHash, NodeEqual> * free_vars,
-              std::unordered_set<TypeVar, NodeHash, NodeEqual> * bound_vars) :
+  std::unordered_set<TypeVar, NodeHash, NodeEqual>* free_vars;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars;
+  FreeTypeVar(std::unordered_set<TypeVar, NodeHash, NodeEqual>* free_vars,
+              std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars) :
     free_vars(free_vars), bound_vars(bound_vars) { }
 
   void VisitType_(const TypeVarNode* tp) final {
@@ -45,7 +45,7 @@ class FreeTypeVar : private TypeVisitor<> {
 };
 
 class FreeVar : public ExprVisitor {
-  void VisitExpr_(const VarNode *v) final {
+  void VisitExpr_(const VarNode* v) final {
     auto var = GetRef<Var>(v);
     if (bound_vars.count(var) == 0) {
       free_vars.insert(var);
@@ -55,7 +55,7 @@ class FreeVar : public ExprVisitor {
     }
   }
 
-  void VisitExpr_(const FunctionNode *f) final {
+  void VisitExpr_(const FunctionNode* f) final {
     for (const auto& tp : f->type_params) {
       bound_types.insert(tp);
     }
@@ -66,7 +66,7 @@ class FreeVar : public ExprVisitor {
     VisitType(f->ret_type);
   }
 
-  void VisitExpr_(const LetNode *l) final {
+  void VisitExpr_(const LetNode* l) final {
     bound_vars.insert(l->var);
     VisitExpr(l->value);
     VisitExpr(l->body);
diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc
index e008a72e5d90..a37969f9e317 100644
--- a/src/relay/pass/well_formed.cc
+++ b/src/relay/pass/well_formed.cc
@@ -18,14 +18,14 @@ class WellFormedChecker : private ExprVisitor {
 
   std::unordered_set<Var, NodeHash, NodeEqual> s;
 
-  void Check(const Var & v) {
+  void Check(const Var& v) {
     if (s.count(v) != 0) {
       well_formed = false;
     }
     s.insert(v);
   }
 
-  void VisitExpr_(const LetNode * l) final {
+  void VisitExpr_(const LetNode* l) final {
     // we do letrec only for FunctionNode,
     // but shadowing let in let binding is likely programming error, and we should forbidden it.
     Check(l->var);
@@ -33,21 +33,21 @@ class WellFormedChecker : private ExprVisitor {
     CheckWellFormed(l->body);
   }
 
-  void VisitExpr_(const FunctionNode * f) final {
-    for (const Var & param : f->params) {
+  void VisitExpr_(const FunctionNode* f) final {
+    for (const Var& param : f->params) {
       Check(param);
     }
     CheckWellFormed(f->body);
   }
 
  public:
-  bool CheckWellFormed(const Expr & e) {
+  bool CheckWellFormed(const Expr& e) {
     this->VisitExpr(e);
     return well_formed;
   }
 };
 
-bool WellFormed(const Expr & e) {
+bool WellFormed(const Expr& e) {
   return WellFormedChecker().CheckWellFormed(e);
 }
 

From 6142d4a86bc5308910b0fe8cb6c75cad2a577b99 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 22 Oct 2018 23:08:32 -0700
Subject: [PATCH 266/529] Add link to the reviewers

---
 .github/PULL_REQUEST_TEMPLATE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 313b776b0824..849e4606834e 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1 +1 @@
-Thanks for contributing to TVM!   Please refer to guideline https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from others in the community.
+Thanks for contributing to TVM!   Please refer to guideline https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md#reviewers).

From 75ca3d56cbe18607a9777381e0b2785c14889e2f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 23 Oct 2018 10:54:20 -0700
Subject: [PATCH 267/529] [RELAY] Refactor AlphaEqual to support deep
 comparison of Attrs. (#1958)

---
 include/tvm/attrs.h                           | 197 +++++----
 src/api/api_pass.cc                           |   8 +-
 src/lang/attr_functor.h                       | 124 +++++-
 src/lang/attrs.cc                             | 316 ++++++++-----
 src/relay/ir/alpha_equal.cc                   | 384 ++++++++++++++++
 src/relay/ir/text_printer.cc                  |   7 +-
 .../type_visitor.h => ir/type_functor.h}      |  94 +++-
 src/relay/pass/alpha_eq.cc                    | 418 ------------------
 src/relay/pass/kind_check.cc                  |  16 +-
 src/relay/pass/type_functor.h                 |  94 ----
 src/relay/pass/type_subst.cc                  |   2 +-
 src/relay/pass/util.cc                        |   2 +-
 tests/python/relay/test_pass_alpha_equal.py   |  43 +-
 .../unittest/test_pass_attrs_hash_equal.py    |   6 +
 14 files changed, 980 insertions(+), 731 deletions(-)
 create mode 100644 src/relay/ir/alpha_equal.cc
 rename src/relay/{pass/type_visitor.h => ir/type_functor.h} (52%)
 delete mode 100644 src/relay/pass/alpha_eq.cc
 delete mode 100644 src/relay/pass/type_functor.h

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 3b7beaa37838..33d84cecec6a 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -108,6 +108,90 @@ class AttrFieldInfoNode : public Node {
 /*! \brief AttrFieldInfo */
 TVM_DEFINE_NODE_REF(AttrFieldInfo, AttrFieldInfoNode);
 
+class AttrsHashHandler;
+class AttrsEqualHandler;
+/*!
+ * \brief Content-aware Equality comparator for attrs.
+ *
+ * This comparator will recursively deep compare the following Attributes.
+ *
+ * - IntImm, UIntImm, FloatImm, StringImm
+ * - Any subclass of BaseAttrsNode
+ * - Array of Attributes.
+ * - Map from string to Attributes.
+ */
+class AttrsEqual {
+ public:
+  bool operator()(const double& lhs, const double& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int64_t& lhs, const int64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const uint64_t& lhs, const uint64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int& lhs, const int& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const bool& lhs, const bool& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const std::string& lhs, const std::string& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const Type& lhs, const Type& rhs) const {
+    return lhs == rhs;
+  }
+  // node comparator
+  TVM_DLL bool operator()(const NodeRef& lhs, const NodeRef& rhs) const;
+
+ protected:
+  friend class AttrsEqualHandler;
+  /*! \brief internal handle. */
+  AttrsEqualHandler* handler_{nullptr};
+};
+
+/*!
+ * \brief Content-aware hash function.
+ *
+ * This hash functor will recursively hash the content of the Attributes.
+ * It is guaranteed that if AttrsEqual(a, b) == true, then AttrsHash(a) == AttrsHash(b);
+ */
+class AttrsHash {
+ public:
+  size_t operator()(const double& value) const {
+    return std::hash<double>()(value);
+  }
+  size_t operator()(const int64_t& value) const {
+    return std::hash<int64_t>()(value);
+  }
+  size_t operator()(const uint64_t& value) const {
+    return std::hash<uint64_t>()(value);
+  }
+  size_t operator()(const int& value) const {
+    return std::hash<int>()(value);
+  }
+  size_t operator()(const bool& value) const {
+    return std::hash<bool>()(value);
+  }
+  size_t operator()(const std::string& value) const {
+    return std::hash<std::string>()(value);
+  }
+  size_t operator()(const Type& value) const {
+    return std::hash<int>()(
+        static_cast<int>(value.code()) |
+        (static_cast<int>(value.bits()) << 8) |
+        (static_cast<int>(value.lanes()) << 16));
+  }
+  TVM_DLL size_t operator()(const NodeRef& value) const;
+
+ private:
+  friend class AttrsHashHandler;
+  /*! \brief internal handle. */
+  AttrsHashHandler* handler_{nullptr};
+};
+
 /*!
  * \brief Base class of all attribute class
  * \note Do not subclass AttrBaseNode directly,
@@ -153,14 +237,17 @@ class BaseAttrsNode : public Node {
   /*!
    * \brief Whether this attribute's content equals to another node.
    * \param other The pointer to another node.
+   * \param equal The equal comparator
    * \return The comparison result.
    */
-  TVM_DLL virtual bool ContentEqual(const Node* other) const = 0;
+  TVM_DLL virtual bool ContentEqual(
+      const Node* other, AttrsEqual equal) const = 0;
   /*!
    * \brief Content aware hash.
+   * \param hasher The hasher to run the hash.
    * \return the hash result.
    */
-  TVM_DLL virtual size_t ContentHash() const = 0;
+  TVM_DLL virtual size_t ContentHash(AttrsHash hasher) const = 0;
 
   static constexpr const char* _type_key = "Attrs";
   TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node);
@@ -209,92 +296,13 @@ class DictAttrsNode : public BaseAttrsNode {
   void VisitNonDefaultAttrs(AttrVisitor* v) final;
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
   Array<AttrFieldInfo> ListFieldInfo() const final;
-  bool ContentEqual(const Node* other) const final;
-  size_t ContentHash() const final;
+  bool ContentEqual(const Node* other, AttrsEqual equal) const final;
+  size_t ContentHash(AttrsHash hasher) const final;
   // type info
   static constexpr const char* _type_key = "DictAttrs";
   TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
 };
 
-/*!
- * \brief Content-aware Equality comparator for attrs.
- *
- * This comparator will recursively deep compare the following Attributes.
- *
- * - IntImm, UIntImm, FloatImm, StringImm
- * - Any subclass of BaseAttrsNode
- * - Array of Attributes.
- * - Map from string to Attributes.
- */
-class AttrsEqual {
- public:
-  bool operator()(const double& lhs, const double& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const int64_t& lhs, const int64_t& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const uint64_t& lhs, const uint64_t& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const int& lhs, const int& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const bool& lhs, const bool& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const std::string& lhs, const std::string& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const Type& lhs, const Type& rhs) const {
-    return lhs == rhs;
-  }
-  bool operator()(const NodeRef& lhs, const NodeRef& rhs) const {
-    return AttrsEqual::Equal(lhs, rhs);
-  }
-
-  // comparator of NodeRef types.
-  static TVM_DLL bool Equal(const NodeRef& lhs, const NodeRef& rhs);
-};
-
-/*!
- * \brief Content-aware hash function.
- *
- * This hash functor will recursively hash the content of the Attributes.
- * It is guaranteed that if AttrsEqual(a, b) == true, then AttrsHash(a) == AttrsHash(b);
- */
-class AttrsHash {
- public:
-  size_t operator()(const double& value) const {
-    return std::hash<double>()(value);
-  }
-  size_t operator()(const int64_t& value) const {
-    return std::hash<int64_t>()(value);
-  }
-  size_t operator()(const uint64_t& value) const {
-    return std::hash<uint64_t>()(value);
-  }
-  size_t operator()(const int& value) const {
-    return std::hash<int>()(value);
-  }
-  size_t operator()(const bool& value) const {
-    return std::hash<bool>()(value);
-  }
-  size_t operator()(const std::string& value) const {
-    return std::hash<std::string>()(value);
-  }
-  size_t operator()(const Type& value) const {
-    return std::hash<int>()(
-        static_cast<int>(value.code()) |
-        (static_cast<int>(value.bits()) << 8) |
-        (static_cast<int>(value.lanes()) << 16));
-  }
-  size_t operator()(const NodeRef& value) const {
-    return AttrsHash::Hash(value);
-  }
-  // hash function of the attribute and attribute fields.
-  static TVM_DLL size_t Hash(const NodeRef& lhs);
-};
 
 // Namespace containing detail implementations
 namespace detail {
@@ -342,8 +350,8 @@ class AttrsEqualVisitor {
  public:
   bool result_{true};
   // constructor
-  AttrsEqualVisitor(const Node* lhs, const Node* rhs)
-      : lhs_(lhs), rhs_(rhs) {
+  AttrsEqualVisitor(const Node* lhs, const Node* rhs, const AttrsEqual& equal)
+      : lhs_(lhs), rhs_(rhs), equal_(equal) {
   }
   template<typename T>
   AttrNopEntry operator()(const char* key, T* lhs_value) {
@@ -353,7 +361,7 @@ class AttrsEqualVisitor {
             reinterpret_cast<const char*>(rhs_) +
             (reinterpret_cast<const char*>(lhs_value) -
              reinterpret_cast<const char*>(lhs_)));
-    if (!AttrsEqual()(*lhs_value, *rhs_value)) {
+    if (!equal_(*lhs_value, *rhs_value)) {
       result_ = false;
     }
     return AttrNopEntry();
@@ -362,17 +370,24 @@ class AttrsEqualVisitor {
  private:
   const Node* lhs_;
   const Node* rhs_;
+  const AttrsEqual& equal_;
 };
 
 class AttrsHashVisitor {
  public:
+  explicit AttrsHashVisitor(const AttrsHash& hasher)
+      : hasher_(hasher) {}
+
   size_t result_{0};
 
   template<typename T>
   AttrNopEntry operator()(const char* key, T* value) {
-    result_ = dmlc::HashCombine(result_, AttrsHash()(*value));
+    result_ = dmlc::HashCombine(result_, hasher_(*value));
     return AttrNopEntry();
   }
+
+ private:
+  const AttrsHash& hasher_;
 };
 
 // helper entry that does initialization, set default.
@@ -793,18 +808,18 @@ class AttrsNode : public BaseAttrsNode {
     return visitor.fields_;
   }
 
-  bool ContentEqual(const Node* other) const final {
+  bool ContentEqual(const Node* other, AttrsEqual equal) const final {
     DerivedType* pself = self();
     if (pself == other) return true;
     if (other == nullptr) return false;
     if (pself->type_index() != other->type_index()) return false;
-    detail::AttrsEqualVisitor visitor(pself, other);
+    detail::AttrsEqualVisitor visitor(pself, other, equal);
     self()->__VisitAttrs__(visitor);
     return visitor.result_;
   }
 
-  size_t ContentHash() const final {
-    detail::AttrsHashVisitor visitor;
+  size_t ContentHash(AttrsHash hasher) const final {
+    detail::AttrsHashVisitor visitor(hasher);
     visitor.result_ = std::hash<std::string>()(this->type_key());
     self()->__VisitAttrs__(visitor);
     return visitor.result_;
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index 66e4529acaf1..1e571ca0dc41 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -68,10 +68,14 @@ TVM_REGISTER_API("ir_pass.Equal")
 
 
 TVM_REGISTER_API("ir_pass.AttrsEqual")
-.set_body_typed<bool(const NodeRef&, const NodeRef&)>(AttrsEqual::Equal);
+.set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
+    return AttrsEqual()(lhs, rhs);
+  });
 
 TVM_REGISTER_API("ir_pass.AttrsHash")
-.set_body_typed<int64_t(const NodeRef&)>(AttrsHash::Hash);
+.set_body_typed<int64_t(const NodeRef&)>([](const NodeRef &node) {
+    return AttrsHash()(node);
+  });
 
 
 TVM_REGISTER_API("ir_pass.ExprUseVar")
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index 8aa39a774315..ef1d061015c3 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -52,13 +52,33 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
       return VisitAttrDefault_(n.get(), std::forward<Args>(args)...);
     }
   }
+  virtual R VisitAttrDefault_(const Node* node, Args... args) = 0;
   virtual R VisitAttr_(const ArrayNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const StrMapNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::IntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::UIntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::FloatImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::StringImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
-  virtual R VisitAttrDefault_(const Node* node, Args... args) = 0;
+  // deep comparison of symbolic integer expressions.
+  virtual R VisitAttr_(const Variable* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Add* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Sub* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Mul* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Mod* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Min* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Max* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::GE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::GT* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::LT* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::LE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::EQ* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::NE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::And* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Or* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Not* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Cast* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Call* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Select* op, Args... args) ATTR_FUNCTOR_DEFAULT;
 
  private:
   // initialize the vtable.
@@ -72,9 +92,111 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
     ATTR_FUNCTOR_DISPATCH(UIntImm);
     ATTR_FUNCTOR_DISPATCH(FloatImm);
     ATTR_FUNCTOR_DISPATCH(StringImm);
+    ATTR_FUNCTOR_DISPATCH(Variable);
+    ATTR_FUNCTOR_DISPATCH(Add);
+    ATTR_FUNCTOR_DISPATCH(Sub);
+    ATTR_FUNCTOR_DISPATCH(Mul);
+    ATTR_FUNCTOR_DISPATCH(Min);
+    ATTR_FUNCTOR_DISPATCH(Max);
+    ATTR_FUNCTOR_DISPATCH(GE);
+    ATTR_FUNCTOR_DISPATCH(GT);
+    ATTR_FUNCTOR_DISPATCH(LE);
+    ATTR_FUNCTOR_DISPATCH(LT);
+    ATTR_FUNCTOR_DISPATCH(EQ);
+    ATTR_FUNCTOR_DISPATCH(NE);
+    ATTR_FUNCTOR_DISPATCH(And);
+    ATTR_FUNCTOR_DISPATCH(Or);
+    ATTR_FUNCTOR_DISPATCH(Not);
+    ATTR_FUNCTOR_DISPATCH(Cast);
+    ATTR_FUNCTOR_DISPATCH(Call);
+    ATTR_FUNCTOR_DISPATCH(Select);
     return vtable;
   }
 };
 
+class AttrsEqualHandler :
+      protected AttrFunctor<bool(const NodeRef&, const NodeRef&)> {
+ public:
+  /*!
+   * \brief Check if lhs equals rhs
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   */
+  bool Equal(const NodeRef& lhs, const NodeRef& rhs);
+
+ protected:
+  bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::IntImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::UIntImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::FloatImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::StringImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Add* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Sub* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Mul* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Mod* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Min* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Max* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::GE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::GT* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::LT* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::LE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::EQ* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::NE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::And* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Or* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Not* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Cast* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Call* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Select* lhs, const NodeRef& other) final;
+};
+
+class AttrsHashHandler :
+      protected AttrFunctor<size_t(const NodeRef&)> {
+ public:
+  /*!
+   * \brief Get hash value of node
+   * \param node The node to be hashed.
+   */
+  size_t Hash(const NodeRef& node) {
+    return this->VisitAttr(node);
+  }
+
+ protected:
+  size_t VisitAttrDefault_(const Node* lhs) final;
+  size_t VisitAttr_(const ir::IntImm* lhs) final;
+  size_t VisitAttr_(const ir::UIntImm* lhs) final;
+  size_t VisitAttr_(const ir::FloatImm* lhs) final;
+  size_t VisitAttr_(const ir::StringImm* lhs) final;
+  size_t VisitAttr_(const ArrayNode* lhs) final;
+  size_t VisitAttr_(const StrMapNode* lhs) final;
+  size_t VisitAttr_(const ir::Add* op) final;
+  size_t VisitAttr_(const ir::Sub* op) final;
+  size_t VisitAttr_(const ir::Mul* op) final;
+  size_t VisitAttr_(const ir::Mod* op) final;
+  size_t VisitAttr_(const ir::Min* op) final;
+  size_t VisitAttr_(const ir::Max* op) final;
+  size_t VisitAttr_(const ir::GE* op) final;
+  size_t VisitAttr_(const ir::GT* op) final;
+  size_t VisitAttr_(const ir::LE* op) final;
+  size_t VisitAttr_(const ir::LT* op) final;
+  size_t VisitAttr_(const ir::EQ* op) final;
+  size_t VisitAttr_(const ir::NE* op) final;
+  size_t VisitAttr_(const ir::And* op) final;
+  size_t VisitAttr_(const ir::Or* op) final;
+  size_t VisitAttr_(const ir::Not* op) final;
+  size_t VisitAttr_(const ir::Cast* op) final;
+  size_t VisitAttr_(const ir::Call* op) final;
+  size_t VisitAttr_(const ir::Select* op) final;
+  /*!
+   * \brief alias of dmlc::HashCombine
+   * \param lhs The first hash value.
+   * \param rhs The second hash value.
+   */
+  static size_t Combine(size_t lhs, size_t rhs) {
+    return dmlc::HashCombine(lhs, rhs);
+  }
+};
 }  // namespace tvm
 #endif  // TVM_LANG_ATTR_FUNCTOR_H_
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index e467018add11..9aa067c09679 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -51,156 +51,272 @@ TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
 
 
 using namespace ir;
+// Equal handler.
+bool AttrsEqualHandler::Equal(const NodeRef& lhs, const NodeRef& rhs) {
+  if (lhs.same_as(rhs)) return true;
+  if (!lhs.defined() || !rhs.defined()) return false;
+  return this->VisitAttr(lhs, rhs);
+}
 
-class AttrsEqualChecker :
-      public AttrFunctor<bool(const NodeRef&, const NodeRef&)> {
- public:
-  bool Check(const NodeRef& lhs, const NodeRef& rhs) {
-    if (!equal_) return false;
-    if (lhs.same_as(rhs)) return true;
-    if (!lhs.defined() || !rhs.defined()) return false;
-    if (!this->VisitAttr(lhs, rhs)) {
-      equal_ = false;
-    }
-    return equal_;
+bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other) {
+  if (lhs->derived_from<BaseAttrsNode>()) {
+    AttrsEqual equal;
+    equal.handler_ = this;
+    return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(
+        other.get(), equal);
   }
+  return lhs == other.get();
+}
 
-  bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final {
-    if (lhs->derived_from<BaseAttrsNode>()) {
-      return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(other.get());
-    }
-    return lhs == other.get();
+bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<IntImm>()) {
+    return lhs->value == rhs->value;
   }
+  return false;
+}
 
-  bool VisitAttr_(const IntImm* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<IntImm>()) {
-      return lhs->value == rhs->value;
-    }
-    return false;
+bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<UIntImm>()) {
+    return lhs->value == rhs->value;
   }
+  return false;
+}
 
-  bool VisitAttr_(const UIntImm* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<UIntImm>()) {
-      return lhs->value == rhs->value;
-    }
-    return false;
+bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<FloatImm>()) {
+    return lhs->value == rhs->value;
   }
+  return false;
+}
 
-  bool VisitAttr_(const FloatImm* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<FloatImm>()) {
-      return lhs->value == rhs->value;
-    }
-    return false;
+bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<StringImm>()) {
+    return lhs->value == rhs->value;
   }
+  return false;
+}
 
-  bool VisitAttr_(const StringImm* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<StringImm>()) {
-      return lhs->value == rhs->value;
+bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<ArrayNode>()) {
+    if (rhs->data.size() != lhs->data.size()) return false;
+    for (size_t  i = 0; i < lhs->data.size(); ++i) {
+      if (!Equal(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false;
     }
-    return false;
   }
+  return true;
+}
 
-  bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<ArrayNode>()) {
-      if (rhs->data.size() != lhs->data.size()) return false;
-      for (size_t  i = 0; i < lhs->data.size(); ++i) {
-        if (!Check(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false;
-      }
+bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<StrMapNode>()) {
+    if (rhs->data.size() != lhs->data.size()) return false;
+    for (const auto& kv : lhs->data) {
+      auto it = rhs->data.find(kv.first);
+      if (it == rhs->data.end()) return false;
+      if (!Equal(NodeRef(kv.second), NodeRef(it->second))) return false;
     }
-    return true;
   }
+  return true;
+}
 
-  bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final {
-    if (const auto* rhs = other.as<StrMapNode>()) {
-      if (rhs->data.size() != lhs->data.size()) return false;
-      for (const auto& kv : lhs->data) {
-        auto it = rhs->data.find(kv.first);
-        if (it == rhs->data.end()) return false;
-        if (!Check(NodeRef(kv.second), NodeRef(it->second))) return false;
-      }
-    }
-    return true;
+#define TVM_DEFINE_ATTRS_BINOP_EQUAL(NodeName)                          \
+  bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const NodeRef& other) { \
+    if (const auto* rhs = other.as<NodeName>()) {                       \
+      if (!Equal(lhs->a, rhs->a)) return false;                         \
+      if (!Equal(lhs->b, rhs->b)) return false;                         \
+      return true;                                                      \
+    } else {                                                            \
+      return false;                                                     \
+    }                                                                   \
+  }                                                                     \
+
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Add);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Sub);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Mul);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Mod);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Max);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Min);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(GE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(GT);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(LE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(LT);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(EQ);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(NE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(And);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Or);
+
+bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Not>()) {
+    return Equal(lhs->a, rhs->a);
+  } else {
+    return false;
   }
+}
 
- private:
-  bool equal_{true};
-};
-
-class AttrContentHasher :
-      public AttrFunctor<void(const NodeRef&)> {
- public:
-  size_t result_{0};
-
-  void VisitAttrDefault_(const Node* value) final {
-    if (value->derived_from<BaseAttrsNode>()) {
-      Update(static_cast<const BaseAttrsNode*>(value)->ContentHash());
-    } else {
-      Update(NodeHash()(GetRef<NodeRef>(value)));
-    }
+bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Cast>()) {
+    if (lhs->type != rhs->type) return false;
+    return Equal(lhs->value, rhs->value);
+  } else {
+    return false;
   }
+}
 
-  void VisitAttr_(const IntImm* op) final {
-    Update(std::hash<int64_t>()(op->value));
+bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Call>()) {
+    return
+        lhs->name == rhs->name &&
+        lhs->type == rhs->type &&
+        lhs->call_type == rhs->call_type &&
+        Equal(lhs->args, rhs->args);
+  } else {
+    return false;
   }
+}
 
-  void VisitAttr_(const UIntImm* op) final {
-    Update(std::hash<uint64_t>()(op->value));
+bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Select>()) {
+    return
+        Equal(lhs->condition, rhs->condition) &&
+        Equal(lhs->true_value, rhs->true_value) &&
+        Equal(lhs->false_value, rhs->false_value);
+  } else {
+    return false;
   }
+}
 
-  void VisitAttr_(const FloatImm* op) final {
-    Update(std::hash<double>()(op->value));
+// Hash Handler.
+size_t AttrsHashHandler::VisitAttrDefault_(const Node* value) {
+  if (value->derived_from<BaseAttrsNode>()) {
+    AttrsHash hasher;
+    hasher.handler_ = this;
+    return static_cast<const BaseAttrsNode*>(value)->ContentHash(hasher);
+  } else {
+    return NodeHash()(GetRef<NodeRef>(value));
   }
+}
 
-  void VisitAttr_(const StringImm* op) final {
-    Update(std::hash<std::string>()(op->value));
-  }
+size_t AttrsHashHandler::VisitAttr_(const IntImm* op) {
+  return std::hash<int64_t>()(op->value);
+}
 
-  void VisitAttr_(const ArrayNode* op) final {
-    Update(op->data.size());
-    for (size_t  i = 0; i < op->data.size(); ++i) {
-      this->VisitAttr(NodeRef(op->data[i]));
-    }
+size_t AttrsHashHandler::VisitAttr_(const UIntImm* op) {
+  return std::hash<uint64_t>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const FloatImm* op) {
+  return std::hash<double>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const StringImm* op) {
+  return std::hash<std::string>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const ArrayNode* op) {
+  size_t result = op->data.size();
+  for (size_t  i = 0; i < op->data.size(); ++i) {
+    result = Combine(result, this->Hash(NodeRef(op->data[i])));
   }
+  return result;
+}
 
-  void VisitAttr_(const StrMapNode* lhs) final {
+size_t AttrsHashHandler::VisitAttr_(const StrMapNode* lhs) {
     using Entry = std::pair<std::string, NodePtr<Node> >;
     std::vector<Entry> data(lhs->data.begin(), lhs->data.end());
     std::sort(data.begin(), data.end(), [](const Entry& a, const Entry& b) {
         return a.first < b.first;
       });
+    size_t result = 0;
     for (const Entry& kv : data) {
-      Update(std::hash<std::string>()(kv.first));
-      this->VisitAttr(NodeRef(kv.second));
+      result = Combine(result, std::hash<std::string>()(kv.first));
+      result = Combine(result, this->Hash(NodeRef(kv.second)));
     }
-  }
+    return result;
+}
 
-  void Update(size_t value) {
-    result_ = dmlc::HashCombine(result_, value);
-  }
-};
 
-bool AttrsEqual::Equal(const NodeRef& lhs, const NodeRef& rhs) {
+#define TVM_DEFINE_ATTRS_BINOP_HASH(NodeName)                           \
+  size_t AttrsHashHandler::VisitAttr_(const NodeName* op) {             \
+    static size_t key = std::hash<std::string>()(NodeName::_type_key);  \
+    return Combine(key, Combine(Hash(op->a), Hash(op->b)));             \
+  }                                                                     \
+
+TVM_DEFINE_ATTRS_BINOP_HASH(Add);
+TVM_DEFINE_ATTRS_BINOP_HASH(Sub);
+TVM_DEFINE_ATTRS_BINOP_HASH(Mul);
+TVM_DEFINE_ATTRS_BINOP_HASH(Mod);
+TVM_DEFINE_ATTRS_BINOP_HASH(Max);
+TVM_DEFINE_ATTRS_BINOP_HASH(Min);
+TVM_DEFINE_ATTRS_BINOP_HASH(GE);
+TVM_DEFINE_ATTRS_BINOP_HASH(GT);
+TVM_DEFINE_ATTRS_BINOP_HASH(LE);
+TVM_DEFINE_ATTRS_BINOP_HASH(LT);
+TVM_DEFINE_ATTRS_BINOP_HASH(EQ);
+TVM_DEFINE_ATTRS_BINOP_HASH(NE);
+TVM_DEFINE_ATTRS_BINOP_HASH(And);
+TVM_DEFINE_ATTRS_BINOP_HASH(Or);
+
+size_t AttrsHashHandler::VisitAttr_(const Not* op) {
+  static size_t key = std::hash<std::string>()(Not::_type_key);
+  return Combine(key, Hash(op->a));
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Cast* op) {
+  static size_t key = std::hash<std::string>()(Cast::_type_key);
+  AttrsHash hasher;
+  size_t res = key;
+  res = Combine(res, hasher(op->type));
+  res = Combine(res, Hash(op->value));
+  return res;
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Call* op) {
+  static size_t key = std::hash<std::string>()(Call::_type_key);
+  AttrsHash hasher;
+  size_t res = key;
+  res = Combine(res, hasher(op->name));
+  res = Combine(res, hasher(op->type));
+  res = Combine(res, Hash(op->args));
+  return res;
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Select* op) {
+  static size_t key = std::hash<std::string>()(Select::_type_key);
+  size_t res = key;
+  res = Combine(res, Hash(op->condition));
+  res = Combine(res, Hash(op->true_value));
+  res = Combine(res, Hash(op->false_value));
+  return res;
+}
+
+
+// Default case
+bool AttrsEqual::operator()(const NodeRef& lhs, const NodeRef& rhs) const {
   if (lhs.same_as(rhs)) return true;
-  AttrsEqualChecker checker;
-  return checker.Check(lhs, rhs);
+  if (handler_ == nullptr) {
+    return AttrsEqualHandler().Equal(lhs, rhs);
+  } else {
+    return handler_->Equal(lhs, rhs);
+  }
 }
 
-size_t AttrsHash::Hash(const NodeRef& node) {
+size_t AttrsHash::operator()(const NodeRef& node) const {
   if (!node.defined()) return 0;
-  AttrContentHasher hasher;
-  hasher.VisitAttr(node);
-  return hasher.result_;
+  if (handler_ == nullptr) {
+    return AttrsHashHandler().Hash(node);
+  } else {
+    return handler_->Hash(node);
+  }
 }
 
-size_t DictAttrsNode::ContentHash() const {
-  return AttrsHash()(this->dict);
+size_t DictAttrsNode::ContentHash(AttrsHash hasher) const {
+  return hasher(this->dict);
 }
 
-bool DictAttrsNode::ContentEqual(const Node* other) const {
+bool DictAttrsNode::ContentEqual(const Node* other, AttrsEqual equal) const {
   if (this == other) return true;
   if (other == nullptr) return false;
   if (this->type_index() != other->type_index()) return false;
-  return AttrsEqual()(this->dict, static_cast<const DictAttrsNode*>(other)->dict);
+  return equal(this->dict, static_cast<const DictAttrsNode*>(other)->dict);
 }
 
 }  // namespace tvm
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
new file mode 100644
index 000000000000..f227970fc09b
--- /dev/null
+++ b/src/relay/ir/alpha_equal.cc
@@ -0,0 +1,384 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/ir/alpha_equal.cc
+ * \brief Alpha equality check by deep comparing two nodes.
+ */
+#include <tvm/ir_pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/relay/pass.h>
+#include "type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// Alpha equal handler for relay.
+class AlphaEqualHandler:
+      public AttrsEqualHandler,
+      public TypeFunctor<bool(const Type&, const Type&)>,
+      public ExprFunctor<bool(const Expr&, const Expr&)> {
+ public:
+  explicit AlphaEqualHandler(bool map_free_var)
+      : map_free_var_(map_free_var) {}
+
+  /*!
+   * Check equality of two nodes.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool Equal(const NodeRef& lhs, const NodeRef& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    if (lhs->derived_from<TypeNode>()) {
+      if (!rhs->derived_from<TypeNode>()) return false;
+      return TypeEqual(Downcast<Type>(lhs), Downcast<Type>(rhs));
+    }
+    if (lhs->derived_from<ExprNode>()) {
+      if (!rhs->derived_from<ExprNode>()) return false;
+      return ExprEqual(Downcast<Expr>(lhs), Downcast<Expr>(rhs));
+    }
+    return AttrEqual(lhs, rhs);
+  }
+
+  /*!
+   * Check equality of two attributes.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool AttrEqual(const NodeRef& lhs, const NodeRef& rhs) {
+    return AttrsEqualHandler::Equal(lhs, rhs);
+  }
+  /*!
+   * Check equality of two types.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool TypeEqual(const Type& lhs, const Type& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    return this->VisitType(lhs, rhs);
+  }
+  /*!
+   * Check equality of two expressions.
+   *
+   * \note We run graph structural equality checking when comparing two Exprs.
+   *   This means that AlphaEqualHandler can only be used once for each pair.
+   *   The equality checker checks data-flow equvalence of the Expr DAG.
+   *   This function also runs faster as it memomizes equal_map.
+   *
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool ExprEqual(const Expr& lhs, const Expr& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    auto it = equal_map_.find(lhs);
+    if (it != equal_map_.end()) {
+      return it->second.same_as(rhs);
+    }
+    if (this->VisitExpr(lhs, rhs)) {
+      equal_map_[lhs] = rhs;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+ protected:
+  /*!
+   * \brief Check if data type equals each other.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool DataTypeEqual(const DataType& lhs, const DataType& rhs) {
+    return lhs == rhs;
+  }
+  /*!
+   * \brief Check Equality of leaf node of the graph.
+   *  if map_free_var_ is set to true, try to map via equal node.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the compare result.
+   */
+  bool LeafNodeEqual(const NodeRef& lhs, const NodeRef& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    auto it = equal_map_.find(lhs);
+    if (it != equal_map_.end()) {
+      return it->second.same_as(rhs);
+    } else {
+      if (map_free_var_) {
+        if (lhs->type_index() != rhs->type_index()) return false;
+        equal_map_[lhs] = rhs;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+  using AttrsEqualHandler::VisitAttr_;
+  bool VisitAttr_(const Variable* lhs, const NodeRef& other) final {
+    return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+  }
+
+  // Type equality
+  bool VisitType_(const TensorTypeNode* lhs, const Type& other) final {
+    if (const TensorTypeNode* rhs = other.as<TensorTypeNode>()) {
+      return (lhs->dtype == rhs->dtype &&
+              AttrEqual(lhs->shape, rhs->shape));
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const IncompleteTypeNode* lhs, const Type& other) final {
+    return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+  }
+
+  bool VisitType_(const TypeVarNode* lhs, const Type& other) final {
+    if (const TypeVarNode* rhs = other.as<TypeVarNode>()) {
+      if (lhs->kind != rhs->kind) return false;
+      return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const FuncTypeNode* lhs, const Type& other) final {
+    if (const FuncTypeNode* rhs = other.as<FuncTypeNode>()) {
+      if (lhs->arg_types.size() != rhs->arg_types.size()) return false;
+      if (lhs->type_params.size() != rhs->type_params.size()) return false;
+      if (lhs->type_constraints.size() != rhs->type_constraints.size()) return false;
+      for (size_t i = 0; i < lhs->type_params.size(); ++i) {
+        if (lhs->type_params[i]->kind != rhs->type_params[i]->kind) {
+          return false;
+        }
+        equal_map_[lhs->type_params[i]] = rhs->type_params[i];
+        // set up type parameter equal
+        if (lhs->type_params[i]->kind == TypeVarNode::Kind::kShapeVar) {
+          // map variable
+          equal_map_[lhs->type_params[i]->var] = rhs->type_params[i]->var;
+        }
+      }
+      for (size_t i = 0; i < lhs->arg_types.size(); i++) {
+        if (!TypeEqual(lhs->arg_types[i], rhs->arg_types[i])) return false;
+      }
+      if (!TypeEqual(lhs->ret_type, rhs->ret_type)) return false;
+      for (size_t i = 0; i < lhs->type_constraints.size(); i++) {
+        if (!TypeEqual(lhs->type_constraints[i],
+                       rhs->type_constraints[i])) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const TypeRelationNode* lhs, const Type& other) final {
+    if (const TypeRelationNode* rhs = other.as<TypeRelationNode>()) {
+      if (!lhs->func.same_as(rhs->func)) return false;
+      if (lhs->num_inputs != rhs->num_inputs) return false;
+      if (!this->AttrEqual(lhs->attrs, rhs->attrs)) return false;
+      if (lhs->args.size() != rhs->args.size()) return false;
+      for (size_t i = 0; i < lhs->args.size(); ++i) {
+        if (!TypeEqual(lhs->args[i], rhs->args[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const TupleTypeNode* lhs, const Type& other) final {
+    if (const TupleTypeNode* rhs = other.as<TupleTypeNode>()) {
+      if (lhs->fields.size() != rhs->fields.size()) return false;
+      for (size_t i = 0; i < lhs->fields.size(); ++i) {
+        if (!TypeEqual(lhs->fields[i], rhs->fields[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+  // Expr equal checking.
+  bool NDArrayEqual(const runtime::NDArray& lhs,
+                    const runtime::NDArray& rhs) {
+    if (lhs.defined() != rhs.defined()) {
+      return false;
+    } else if (lhs.same_as(rhs)) {
+      return true;
+    } else {
+      auto ldt = lhs->dtype;
+      auto rdt = rhs->dtype;
+      CHECK_EQ(lhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+      CHECK_EQ(rhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+      if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
+        size_t data_size = runtime::GetDataSize(*lhs.operator->());
+        return std::memcmp(lhs->data, rhs->data, data_size) == 0;
+      } else {
+        return false;
+      }
+    }
+  }
+  // merge declaration of two variables together.
+  bool MergeVarDecl(const Var& lhs, const Var& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    if (!TypeEqual(lhs->type_annotation,
+                   rhs->type_annotation)) return false;
+    CHECK(!equal_map_.count(lhs))
+        << "Duplicated declaration of variable " <<  lhs;
+    equal_map_[lhs] = rhs;
+    return true;
+  }
+
+  bool VisitExpr_(const VarNode* lhs, const Expr& other) final {
+    if (const VarNode* rhs = other.as<VarNode>()) {
+      if (lhs->name_hint != rhs->name_hint) return false;
+      if (!TypeEqual(lhs->type_annotation, rhs->type_annotation)) return false;
+      return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const GlobalVarNode* lhs, const Expr& other) final {
+    if (const GlobalVarNode* rhs = other.as<GlobalVarNode>()) {
+      // use name equality for global var for now.
+      if (lhs->name_hint != rhs->name_hint) return false;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const TupleNode* lhs, const Expr& other) final {
+    if (const TupleNode* rhs = other.as<TupleNode>()) {
+      if (lhs->fields.size() != rhs->fields.size()) return false;
+      for (size_t i = 0; i < lhs->fields.size(); ++i) {
+        if (!ExprEqual(lhs->fields[i], rhs->fields[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const FunctionNode* lhs, const Expr& other) final {
+    if (const FunctionNode* rhs = other.as<FunctionNode>()) {
+      if (lhs->params.size() != rhs->params.size()) return false;
+      if (lhs->type_params.size() != rhs->type_params.size()) return false;
+      // map type parameter to be the same
+      for (size_t i = 0; i < lhs->type_params.size(); ++i) {
+        if (lhs->type_params[i]->kind != rhs->type_params[i]->kind) return false;
+        equal_map_[lhs->type_params[i]] = rhs->type_params[i];
+      }
+      // check parameter type annotations
+      for (size_t i = 0; i < lhs->params.size(); ++i) {
+        if (!MergeVarDecl(lhs->params[i], rhs->params[i])) return false;
+      }
+      // check return types.
+      if (!TypeEqual(lhs->ret_type, rhs->ret_type)) return false;
+      return ExprEqual(lhs->body, rhs->body);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const CallNode* lhs, const Expr& other) final {
+    if (const CallNode* rhs = other.as<CallNode>()) {
+      if (!ExprEqual(lhs->op, rhs->op)) return false;
+      if (lhs->args.size() != rhs->args.size()) return false;
+      if (lhs->type_args.size() != rhs->type_args.size()) return false;
+
+      for (size_t i = 0; i < lhs->args.size(); ++i) {
+        if (!ExprEqual(lhs->args[i], rhs->args[i])) return false;
+      }
+      for (size_t i = 0; i < lhs->type_args.size(); ++i) {
+        if (!TypeEqual(lhs->type_args[i], rhs->type_args[i])) return false;
+      }
+      return AttrEqual(lhs->attrs, rhs->attrs);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const LetNode* lhs, const Expr& other) final {
+    if (const LetNode* rhs = other.as<LetNode>()) {
+      if (!ExprEqual(lhs->value, rhs->value)) return false;
+      if (!MergeVarDecl(lhs->var, rhs->var)) return false;
+      return ExprEqual(lhs->body, rhs->body);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const IfNode* lhs, const Expr& other) final {
+    if (const IfNode* rhs = other.as<IfNode>()) {
+      return ExprEqual(lhs->cond, rhs->cond) &&
+          ExprEqual(lhs->true_branch, rhs->true_branch) &&
+          ExprEqual(lhs->false_branch, rhs->false_branch);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const OpNode* op, const Expr& other) final {
+    return op == other.get();
+  }
+
+  bool VisitExpr_(const ConstantNode* lhs, const Expr& other) final {
+    if (const ConstantNode* rhs = other.as<ConstantNode>()) {
+      return NDArrayEqual(lhs->data, rhs->data);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const TupleGetItemNode* lhs, const Expr& other) final {
+    if (const TupleGetItemNode* rhs = other.as<TupleGetItemNode>()) {
+      return ExprEqual(lhs->tuple, rhs->tuple) && lhs->index == rhs->index;
+    } else {
+      return false;
+    }
+  }
+
+ private:
+  // whether to map open terms.
+  bool map_free_var_{false};
+  // renaming of NodeRef to indicate two nodes equals to each other
+  std::unordered_map<NodeRef, NodeRef, NodeHash, NodeEqual> equal_map_;
+};
+
+bool AlphaEqual(const Type& lhs, const Type& rhs) {
+  return AlphaEqualHandler(false).TypeEqual(lhs, rhs);
+}
+
+bool AlphaEqual(const Expr& lhs, const Expr& rhs) {
+  return AlphaEqualHandler(false).ExprEqual(lhs, rhs);
+}
+
+// TODO(@jroesch): move to correct namespace?
+TVM_REGISTER_API("relay._make._alpha_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(false).Equal(args[0], args[1]);
+  });
+
+TVM_REGISTER_API("relay._make._type_alpha_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(false).TypeEqual(args[0], args[1]);
+  });
+
+TVM_REGISTER_API("relay._make._graph_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(true).Equal(args[0], args[1]);
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 66ef86641fae..0ebe111ab6b2 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -6,7 +6,7 @@
 #include <tvm/relay/environment.h>
 #include <tvm/relay/expr_functor.h>
 #include <sstream>
-#include "../pass/type_functor.h"
+#include "type_functor.h"
 #include "../../lang/attr_functor.h"
 
 namespace tvm {
@@ -245,6 +245,9 @@ class TextPrinter :
         stream_ << ", ";
       }
     }
+    if (fields.size() == 1) {
+      stream_ << ',';
+    }
     stream_ << ')';
     this->PrintEndInst("\n");
     return id;
@@ -648,7 +651,7 @@ class TextPrinter :
       name = "%" + name;
     }
     TextValue val(GetUniqueName(name));
-    CHECK(!memo_.count(var));
+    CHECK(!memo_.count(var)) << "Duplicated variable " << var;
     memo_[var] = val;
     return val;
   }
diff --git a/src/relay/pass/type_visitor.h b/src/relay/ir/type_functor.h
similarity index 52%
rename from src/relay/pass/type_visitor.h
rename to src/relay/ir/type_functor.h
index c1b2c3e1a3ad..03bb4db1f59e 100644
--- a/src/relay/pass/type_visitor.h
+++ b/src/relay/ir/type_functor.h
@@ -1,18 +1,97 @@
 /*!
  *  Copyright (c) 2018 by Contributors
- * \file type_visitor.h
- * \brief A wrapper around TypeFunctor for common use cases.
+ * \file type_functor.h
+ * \brief A way to defined arbitrary function signature with dispatch on types.
  */
-#ifndef TVM_RELAY_PASS_TYPE_VISITOR_H_
-#define TVM_RELAY_PASS_TYPE_VISITOR_H_
+#ifndef TVM_RELAY_IR_TYPE_FUNCTOR_H_
+#define TVM_RELAY_IR_TYPE_FUNCTOR_H_
 
+#include <tvm/node/ir_functor.h>
+#include <tvm/relay/expr.h>
+#include <string>
 #include <vector>
-#include "./type_functor.h"
 
 namespace tvm {
 namespace relay {
 
-/*! \brief A type visitor for vistiors which make use of internal
+template <typename FType>
+class TypeFunctor;
+
+// functions to be overriden.
+#define TYPE_FUNCTOR_DEFAULT \
+  { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
+
+
+#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                                   \
+  vtable.template set_dispatch<OP>(                                       \
+      [](const NodeRef& n, TSelf* self, Args... args) {                   \
+        return self->VisitType_(static_cast<const OP*>(n.node_.get()),    \
+                                std::forward<Args>(args)...);             \
+      });
+
+template <typename R, typename... Args>
+class TypeFunctor<R(const Type& n, Args...)> {
+ private:
+  using TSelf = TypeFunctor<R(const Type& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*! \brief virtual destructor */
+  virtual ~TypeFunctor() {}
+  /*!
+   * \brief Same as call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  R operator()(const Type& n, Args... args) {
+    return VisitType(n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitType(const Type& n, Args... args) {
+    static FType vtable = InitVTable();
+    return vtable(n, this, std::forward<Args>(args)...);
+  }
+  // Functions that can be overriden by subclass
+  virtual R VisitType_(const TensorTypeNode* op,
+                       Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeVarNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeConstraintNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeRelationNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TupleTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const IncompleteTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+
+  virtual R VisitTypeDefault_(const Node* op, Args...) {
+    LOG(FATAL) << "Do not have a default for " << op->type_key();
+    throw;  // unreachable, written to stop compiler warning
+  }
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    FType vtable;
+    // Set dispatch
+    RELAY_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeVarNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeConstraintNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeRelationNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TupleTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(IncompleteTypeNode);
+    return vtable;
+  }
+};
+
+/*!
+ * \brief A type visitor for vistiors which make use of internal
  * mutable state.
  *
  * We recursively visit each type contained inside the visitor.
@@ -118,7 +197,6 @@ struct TypeMutator : TypeFunctor<Type(const Type& n)> {
     return GetRef<Type>(op);
   }
 };
-
 }  // namespace relay
 }  // namespace tvm
-#endif  // TVM_RELAY_PASS_TYPE_VISITOR_H_
+#endif  // TVM_RELAY_IR_TYPE_FUNCTOR_H_
diff --git a/src/relay/pass/alpha_eq.cc b/src/relay/pass/alpha_eq.cc
deleted file mode 100644
index 41ec3f1e090b..000000000000
--- a/src/relay/pass/alpha_eq.cc
+++ /dev/null
@@ -1,418 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file src/tvm/relay/pass/alpha_eq.cc
- * \brief Check that two type are syntactically equal up to alpha equivalence.
- */
-#include <tvm/ir_pass.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/runtime/ndarray.h>
-#include "./type_visitor.h"
-#include "tvm/relay/pass.h"
-
-namespace tvm {
-namespace relay {
-
-using namespace tvm::runtime;
-
-bool SameNDArray(const NDArray& lhs, const NDArray& rhs) {
-  if (lhs.defined() != rhs.defined()) {
-    return false;
-  } else if (lhs.same_as(rhs)) {
-    return true;
-  } else {
-    auto ldt = lhs->dtype;
-    auto rdt = rhs->dtype;
-    CHECK_EQ(lhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
-    CHECK_EQ(rhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
-    if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
-      size_t s = GetDataSize(*lhs.operator->());
-      return memcmp(lhs->data, rhs->data, s) == 0;
-    } else {
-      return false;
-    }
-  }
-}
-
-struct TypeAlphaEq : TypeVisitor<const Type&> {
-  tvm::Map<TypeVar, TypeVar> eq_map;
-  bool equal;
-
-  TypeAlphaEq() : eq_map(), equal(true) {}
-
-  void DataTypeEqual(const DataType& dt1, const DataType& dt2) {
-    if (dt1 != dt2) {
-      equal = false;
-    }
-  }
-
-  void ShapeEqual(const Array<IndexExpr>& s1, const Array<IndexExpr>& s2) {
-    if (s1.size() != s2.size()) {
-      equal = false;
-      return;
-    }
-    for (size_t i = 0; i < s1.size(); ++i) {
-      if (!tvm::ir::Equal(s1[i], s2[i])) {
-        equal = false;
-        return;
-      }
-    }
-  }
-
-  void VisitType_(const TensorTypeNode* tt1, const Type& t2) final {
-    if (const TensorTypeNode* tt2 = t2.as<TensorTypeNode>()) {
-      DataTypeEqual(tt1->dtype, tt2->dtype);
-      ShapeEqual(tt1->shape, tt2->shape);
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitType_(const IncompleteTypeNode* bt1, const Type& t2) final {
-    if (const IncompleteTypeNode* bt2 = t2.as<IncompleteTypeNode>()) {
-      equal = equal && bt1 == bt2;
-      return;
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitType_(const TypeVarNode* ti1, const Type& t2) final {
-    if (const TypeVarNode* ti2 = t2.as<TypeVarNode>()) {
-      auto tid1 = GetRef<TypeVar>(ti1);
-      auto tid2 = GetRef<TypeVar>(ti2);
-
-      // We handle open terms with this rule assuming variables are identical.
-      //
-      // Not sure if we should do this.
-      if (tid1 == tid2) {
-        return;
-      }
-
-      // Check that they are same kind
-      if (tid1->kind != tid2->kind) {
-        equal = false;
-        return;
-      }
-
-      // Next we see if there is mapping for local1 into the rhs term.
-      // If there is we check to see if those are equal.
-      if (eq_map.find(tid1) != eq_map.end()) {
-        equal = equal && eq_map[tid1] == tid2;
-      } else {
-        equal = false;
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitType_(const FuncTypeNode* op, const Type& t2) final {
-    if (const FuncTypeNode* ta2 = t2.as<FuncTypeNode>()) {
-      if (op->arg_types.size() != ta2->arg_types.size()
-          || op->type_params.size() != ta2->type_params.size()
-          || op->type_constraints.size() != ta2->type_constraints.size()) {
-        equal = false;
-        return;
-      }
-
-      // must visit params first so they are appropriate entered
-      // into equality map
-      for (size_t i = 0; i < op->type_params.size(); i++) {
-        eq_map.Set(op->type_params[i], ta2->type_params[i]);
-        this->VisitType(op->type_params[i], ta2->type_params[i]);
-        if (!equal) {
-          return;
-        }
-      }
-
-      for (size_t i = 0; i < op->arg_types.size(); i++) {
-        this->VisitType(op->arg_types[i], ta2->arg_types[i]);
-        if (!equal) {
-          return;
-        }
-      }
-
-      this->VisitType(op->ret_type, ta2->ret_type);
-      if (!equal) {
-        return;
-      }
-
-      for (size_t i = 0; i < op->type_constraints.size(); i++) {
-        this->VisitType(op->type_constraints[i], ta2->type_constraints[i]);
-        if (!equal) {
-          return;
-        }
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitType_(const TypeRelationNode* tr1, const Type& t2) final {
-    if (const TypeRelationNode* tr2 = t2.as<TypeRelationNode>()) {
-      if (tr1->func != tr2->func
-          || tr1->num_inputs != tr2->num_inputs
-          || tr1->attrs != tr2->attrs) {
-        equal = false;
-        return;
-      }
-
-      if (tr1->args.size() != tr2->args.size()) {
-        equal = false;
-        return;
-      }
-
-      for (size_t i = 0; i < tr1->args.size(); i++) {
-        this->VisitType(tr1->args[i], tr2->args[i]);
-        if (!equal) {
-          return;
-        }
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitType_(const TupleTypeNode* op, const Type& t2) final {
-    if (const TupleTypeNode* pt = t2.as<TupleTypeNode>()) {
-      if (op->fields.size() != pt->fields.size()) {
-        equal = false;
-        return;
-      }
-
-      for (size_t i = 0U; i < op->fields.size(); i++) {
-        if (!equal) {
-          return;
-        }
-        this->VisitType(op->fields[i], pt->fields[i]);
-      }
-    } else {
-      equal = false;
-    }
-  }
-};
-
-bool AlphaEqual(const Type& t1, const Type& t2) {
-  if (t1.defined() != t2.defined()) {
-    return false;
-  }
-
-  if (!t1.defined()) {
-    return true;
-  }
-
-  TypeAlphaEq aeq;
-  aeq.VisitType(t1, t2);
-  return aeq.equal;
-}
-
-struct AlphaEq : ExprFunctor<void(const Expr&, const Expr&)> {
- public:
-  tvm::Map<Var, Var> eq_map;
-
-  bool equal;
-  AlphaEq() : eq_map(), equal(true) {}
-
-  void VisitExpr_(const VarNode* e1, const Expr& e2) final {
-    if (const VarNode* id2 = e2.as<VarNode>()) {
-      auto local1 = GetRef<Var>(e1);
-      auto local2 = GetRef<Var>(id2);
-      // We handle open terms with this rule assuming variables are identical.
-      if (local1 == local2) {
-        equal = true;
-        return;
-      }
-
-      // Next we see if there is mapping for local1 into the rhs term.
-      // If there is we check to see if those are equal.
-      if (eq_map.find(local1) != eq_map.end()) {
-        equal = equal && eq_map[local1] == local2;
-      } else {
-        equal = false;
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const GlobalVarNode* g1, const Expr& e2) final {
-    if (const GlobalVarNode* g2 = e2.as<GlobalVarNode>()) {
-      equal = equal && g1 == g2;
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const TupleNode* pl1, const Expr& e2) final {
-    Tuple prod1 = GetRef<Tuple>(pl1);
-    if (const TupleNode* pl2 = e2.as<TupleNode>()) {
-      Tuple prod2 = GetRef<Tuple>(pl2);
-      if (prod1->fields.size() != prod2->fields.size()) {
-        equal = false;
-        return;
-      }
-
-      for (size_t i = 0U; i < prod1->fields.size(); i++) {
-        this->VisitExpr(prod1->fields[i], prod2->fields[i]);
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const FunctionNode* func1, const Expr& e2) final {
-    if (const FunctionNode* func2 = e2.as<FunctionNode>()) {
-      if (func1->params.size() != func2->params.size()) {
-        equal = false;
-        return;
-      }
-
-      if (func1->type_params.size() != func2->type_params.size()) {
-        equal = false;
-        return;
-      }
-
-      for (size_t i = 0; i < func1->params.size(); ++i) {
-        MergeVarDecl(func1->params[i], func2->params[i]);
-      }
-
-      if (!equal) {
-        return;
-      }
-
-      for (size_t i = 0U; i < func1->type_params.size(); i++) {
-        equal = equal && AlphaEqual(func1->type_params[i], func2->type_params[i]);
-        if (!equal) {
-          return;
-        }
-      }
-
-      equal = equal && AlphaEqual(func1->ret_type, func2->ret_type);
-      if (!equal) {
-        return;
-      }
-
-      this->VisitExpr(func1->body, func2->body);
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const CallNode* op, const Expr& e2) final {
-    if (const CallNode* call = e2.as<CallNode>()) {
-      this->VisitExpr(op->op, call->op);
-
-      if (op->args.size() != call->args.size()) {
-        equal = false;
-        return;
-      }
-
-      if (op->type_args.size() != call->type_args.size()) {
-        equal = false;
-        return;
-      }
-
-      // checking attrs by pointer equality for now
-      equal = equal && (op->attrs == call->attrs);
-      if (!equal) {
-        return;
-      }
-
-      for (size_t i = 0U; i < op->args.size(); i++) {
-        this->VisitExpr(op->args[i], call->args[i]);
-      }
-
-      for (size_t i = 0U; i < op->type_args.size(); i++) {
-        equal = equal && AlphaEqual(op->type_args[i], call->type_args[i]);
-        if (!equal) {
-          return;
-        }
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const LetNode* op, const Expr& e2) final {
-    if (const LetNode* let = e2.as<LetNode>()) {
-      MergeVarDecl(op->var, let->var);
-      this->VisitExpr(op->value, let->value);
-      this->VisitExpr(op->body, let->body);
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const IfNode* op, const Expr& e2) final {
-    if (const IfNode* i = e2.as<IfNode>()) {
-      VisitExpr(op->cond, i->cond);
-      VisitExpr(op->true_branch, i->true_branch);
-      VisitExpr(op->false_branch, i->false_branch);
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const OpNode* op, const Expr& e2) final {
-    if (const OpNode* o = e2.as<OpNode>()) {
-      equal = equal && op->name == o->name;
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const ConstantNode* op, const Expr& e2) final {
-    if (const ConstantNode* c = e2.as<ConstantNode>()) {
-      if (AlphaEqual(op->tensor_type(), c->tensor_type())) {
-        equal = equal && SameNDArray(op->data, c->data);
-      } else {
-        equal = false;
-      }
-    } else {
-      equal = false;
-    }
-  }
-
-  void VisitExpr_(const TupleGetItemNode* op, const Expr& e2) final {
-    if (const TupleGetItemNode* proj = e2.as<TupleGetItemNode>()) {
-      this->VisitExpr(op->tuple, proj->tuple);
-      equal = equal && (op->index == proj->index);
-    } else {
-      equal = false;
-    }
-  }
-
- private:
-  void MergeVarDecl(const Var& var1, const Var& var2) {
-    equal = equal && AlphaEqual(var1->type_annotation, var2->type_annotation);
-    if (!equal) {
-      return;
-    }
-
-    eq_map.Set(var1, var2);
-  }
-};
-
-bool AlphaEqual(const Expr& e1, const Expr& e2) {
-  AlphaEq eq;
-  eq.VisitExpr(e1, e2);
-  return eq.equal;
-}
-
-// TODO(@jroesch): move to correct namespace?
-TVM_REGISTER_API("relay._make._alpha_equal")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Expr e1 = args[0];
-      Expr e2 = args[1];
-      *ret = AlphaEqual(e1, e2);
-    });
-
-TVM_REGISTER_API("relay._make._type_alpha_equal")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Type t1 = args[0];
-      Type t2 = args[1];
-      *ret = AlphaEqual(t1, t2);
-    });
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 8fd77a71ec4b..c3d16c2976bf 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -14,7 +14,7 @@
  * contains a data type such as `int`, `float`, `uint`.
  */
 #include <tvm/relay/pass.h>
-#include "./type_visitor.h"
+#include "../ir/type_functor.h"
 
 namespace tvm {
 namespace relay {
@@ -105,13 +105,13 @@ bool KindCheck(const Type& t, const Environment& env) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.check_kind")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      if (args.size() == 1) {
-        *ret = KindCheck(args[0], EnvironmentNode::make({}));
-      } else {
-        *ret = KindCheck(args[0], args[1]);
-      }
-    });
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    if (args.size() == 1) {
+      *ret = KindCheck(args[0], EnvironmentNode::make({}));
+    } else {
+      *ret = KindCheck(args[0], args[1]);
+    }
+  });
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/type_functor.h b/src/relay/pass/type_functor.h
deleted file mode 100644
index b8eaa85a73d2..000000000000
--- a/src/relay/pass/type_functor.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file type_functor.h
- * \brief A way to defined arbitrary function signature with dispatch on types.
- */
-#ifndef TVM_RELAY_PASS_TYPE_FUNCTOR_H_
-#define TVM_RELAY_PASS_TYPE_FUNCTOR_H_
-
-#include <tvm/node/ir_functor.h>
-#include <tvm/relay/expr.h>
-#include <string>
-
-namespace tvm {
-namespace relay {
-
-template <typename FType>
-class TypeFunctor;
-
-// functions to be overriden.
-#define TYPE_FUNCTOR_DEFAULT \
-  { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
-
-
-#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                                   \
-  vtable.template set_dispatch<OP>(                                       \
-      [](const NodeRef& n, TSelf* self, Args... args) {                   \
-        return self->VisitType_(static_cast<const OP*>(n.node_.get()),    \
-                                std::forward<Args>(args)...);             \
-      });
-
-template <typename R, typename... Args>
-class TypeFunctor<R(const Type& n, Args...)> {
- private:
-  using TSelf = TypeFunctor<R(const Type& n, Args...)>;
-  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
-
- public:
-  /*! \brief the result type of this functor */
-  using result_type = R;
-  /*! \brief virtual destructor */
-  virtual ~TypeFunctor() {}
-  /*!
-   * \brief Same as call.
-   * \param n The expression node.
-   * \param args Additional arguments.
-   * \return The result of the call
-   */
-  R operator()(const Type& n, Args... args) {
-    return VisitType(n, std::forward<Args>(args)...);
-  }
-  /*!
-   * \brief The functor call.
-   * \param n The expression node.
-   * \param args Additional arguments.
-   * \return The result of the call
-   */
-  virtual R VisitType(const Type& n, Args... args) {
-    static FType vtable = InitVTable();
-    return vtable(n, this, std::forward<Args>(args)...);
-  }
-  // Functions that can be overriden by subclass
-  virtual R VisitType_(const TensorTypeNode* op,
-                       Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const TypeVarNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const TypeConstraintNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const TypeRelationNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const TupleTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-  virtual R VisitType_(const IncompleteTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
-
-  virtual R VisitTypeDefault_(const Node* op, Args...) {
-    LOG(FATAL) << "Do not have a default for " << op->type_key();
-    throw;  // unreachable, written to stop compiler warning
-  }
-
- private:
-  // initialize the vtable.
-  static FType InitVTable() {
-    FType vtable;
-    // Set dispatch
-    RELAY_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(TypeVarNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(TypeConstraintNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(TypeRelationNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(TupleTypeNode);
-    RELAY_TYPE_FUNCTOR_DISPATCH(IncompleteTypeNode);
-    return vtable;
-  }
-};
-
-}  // namespace relay
-}  // namespace tvm
-#endif  // TVM_RELAY_PASS_TYPE_FUNCTOR_H_
diff --git a/src/relay/pass/type_subst.cc b/src/relay/pass/type_subst.cc
index bffd779d1af2..76507058f059 100644
--- a/src/relay/pass/type_subst.cc
+++ b/src/relay/pass/type_subst.cc
@@ -4,7 +4,7 @@
  * \brief Function for substituting a concrete type in place of a type ID
  */
 #include "./type_subst.h"
-#include "./type_visitor.h"
+#include "../ir/type_functor.h"
 
 namespace tvm {
 namespace relay {
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index ff4bb55b7b3c..d69f1bce70d4 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -7,7 +7,7 @@
  */
 #include <tvm/relay/pass.h>
 #include <tvm/relay/expr_functor.h>
-#include "./type_visitor.h"
+#include "../ir/type_functor.h"
 
 namespace tvm {
 namespace relay {
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index de4df7c84b9f..d16c2df53435 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -139,7 +139,8 @@ def test_type_relation_alpha_equal():
 
     # attrs are also compared only by pointer equality
     attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
-    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr1_same = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4,4))
 
     tr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
     same = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
@@ -147,6 +148,7 @@ def test_type_relation_alpha_equal():
     diff_order = relay.TypeRelation(broadcast, tvm.convert([t2, t1]), 1, attr1)
     diff_args = relay.TypeRelation(broadcast, tvm.convert([t2, t3]), 1, attr1)
     diff_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr2)
+    same_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1_same)
 
     bigger = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 2, attr1)
     diff_num_inputs = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 1, attr2)
@@ -157,6 +159,7 @@ def test_type_relation_alpha_equal():
     assert tr != diff_order
     assert tr != diff_args
     assert tr != diff_attr
+    assert tr == same_attr
     assert tr != bigger
 
     assert bigger != diff_num_inputs
@@ -216,22 +219,26 @@ def test_global_var_alpha_equal():
 
 
 def test_tuple_alpha_equal():
+    v0 = relay.Var("v0")
     v1 = relay.Var("v1")
     v2 = relay.Var("v2")
 
     # unit value is a valid tuple
     assert alpha_equal(relay.Tuple([]), relay.Tuple([]))
 
-    tup = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
-    same = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
+    tup = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
+    same = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
 
     assert alpha_equal(tup, same)
 
     # use the eq_map
+
+
     let_tup = relay.Let(v1, tup, v1)
-    let_mapped = relay.Let(v2, relay.Tuple([v2, relay.const(2), relay.const(3),
+    let_mapped = relay.Let(v2, relay.Tuple([v0, relay.const(2), relay.const(3),
                                             relay.Tuple([relay.const(4)])]),
                            v2)
+
     assert alpha_equal(let_tup, let_mapped)
 
     more_fields = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)]), v2])
@@ -340,7 +347,8 @@ def test_call_alpha_equal():
 
     # attrs are compared only by pointer equality
     attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
-    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr1_same = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4,4))
 
     tt1 = relay.TensorType((1, 2, 3), "float32")
     tt2 = relay.TensorType((), "int8")
@@ -375,6 +383,9 @@ def test_call_alpha_equal():
     different_attrs = relay.Call(v1, basic_args, attr2, [tt1])
     assert not alpha_equal(call, different_attrs)
 
+    same_attrs = relay.Call(v1, basic_args, attr1_same, [tt1])
+    assert alpha_equal(call, same_attrs)
+
     no_type_args = relay.Call(v1, basic_args, attr1)
     assert not alpha_equal(call, no_type_args)
 
@@ -445,6 +456,27 @@ def test_op_alpha_equal():
     assert not alpha_equal(op1, op3)
 
 
+def test_graph_equal():
+    x = relay.var("x")
+
+    y0 = relay.add(x, x)
+    z0 = relay.add(y0, y0)
+
+    y1 = relay.add(x, x)
+    z1 = relay.add(y1, y1)
+
+    z3 = relay.add(relay.add(x, x), relay.add(x, x))
+
+    assert alpha_equal(z0, z1)
+
+    # z3's dataflow format is different from z0
+    # z0 is computed from a common y0 node
+    # Relay view them as different programs
+    # Check the difference in the text format.
+    assert not alpha_equal(z0, z3)
+
+
+
 if __name__ == "__main__":
     test_tensor_type_alpha_equal()
     test_incomplete_type_alpha_equal()
@@ -462,3 +494,4 @@ def test_op_alpha_equal():
     test_if_alpha_equal()
     test_op_alpha_equal()
     test_var_alpha_equal()
+    test_graph_equal()
diff --git a/tests/python/unittest/test_pass_attrs_hash_equal.py b/tests/python/unittest/test_pass_attrs_hash_equal.py
index 23f0e6374064..2d6987aeb183 100644
--- a/tests/python/unittest/test_pass_attrs_hash_equal.py
+++ b/tests/python/unittest/test_pass_attrs_hash_equal.py
@@ -17,6 +17,12 @@ def test_attrs_equal():
     assert tvm.ir_pass.AttrsEqual({"x": [x, x]}, {"x": [y, x]})
     assert not tvm.ir_pass.AttrsEqual({"x": [x, 1]}, {"x": [y, 2]})
 
+    n = tvm.var("n")
+    assert tvm.ir_pass.AttrsEqual({"x": n+1}, {"x": n+1})
+
+
+
+
 
 def test_attrs_hash():
     fhash = tvm.ir_pass.AttrsHash

From 7292a1836597333a944b1560a8b3c95037a3f928 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Tue, 23 Oct 2018 10:55:24 -0700
Subject: [PATCH 268/529] [Relay] Parser CI dependencies and build rules
 (#1965)

---
 docker/Dockerfile.ci_gpu                        | 3 +++
 docker/install/ubuntu_install_antlr.sh          | 5 +++++
 docker/install/ubuntu_install_python_package.sh | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 docker/install/ubuntu_install_antlr.sh

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index bca16b59366b..7b97a54185f4 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -69,6 +69,9 @@ RUN bash /install/ubuntu_install_tensorflow.sh
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
 
+COPY install/ubuntu_install_antlr.sh /install/ubuntu_install_antlr.sh
+RUN bash /install/ubuntu_install_antlr.sh
+
 # Environment variables
 ENV PATH=/usr/local/nvidia/bin:${PATH}
 ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh
new file mode 100644
index 000000000000..f1066c4220d4
--- /dev/null
+++ b/docker/install/ubuntu_install_antlr.sh
@@ -0,0 +1,5 @@
+cd /usr/local/lib
+wget https://www.antlr.org/download/antlr-4.7.1-complete.jar
+cd -
+
+alias antlr4='java -jar /usr/local/lib/antlr-4.7.1-complete.jar'
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 6724116cb720..505a25d28e3d 100644
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -1,3 +1,3 @@
 # install libraries for python package on ubuntu
-pip2 install nose pylint numpy nose-timer cython decorator scipy tornado
+pip2 install nose pylint numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime
 pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime

From 846ca8a542c8f4a3f49be142bba16d3d546dc0ab Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Tue, 23 Oct 2018 18:38:41 -0700
Subject: [PATCH 269/529] [Relay] Serialization round-trip tests (#1968)

---
 python/tvm/relay/ir_pass.py         | 22 +++++++++++++++
 src/relay/ir/alpha_equal.cc         |  2 +-
 tests/python/relay/test_ir_nodes.py | 44 +++++++++++++++++++++++++----
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 549203d12c9f..22ee918039b5 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -141,3 +141,25 @@ def alpha_equal(lhs, rhs):
       True iff lhs is alpha equal to rhs.
     """
     return bool(_make._alpha_equal(lhs, rhs))
+
+
+def graph_equal(lhs, rhs):
+    """Compare two Relay expr for data-flow equivalence.
+    The difference between this and alpha-equality is that
+    variables are not expected to match between lhs and rhs;
+    they are treated as sources and are mapped between each other.
+
+    Parameters
+    ----------
+    lhs: tvm.relay.Expr
+      One of the input Expression.
+
+    rhs: tvm.relay.Expr
+      One of the input Expression.
+
+    Returns
+    -------
+    result: bool
+      True iff lhs is data-flow equivalent to rhs.
+    """
+    return bool(_make._graph_equal(lhs, rhs))
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index f227970fc09b..7aab9bb3223b 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -183,7 +183,7 @@ class AlphaEqualHandler:
 
   bool VisitType_(const TypeRelationNode* lhs, const Type& other) final {
     if (const TypeRelationNode* rhs = other.as<TypeRelationNode>()) {
-      if (!lhs->func.same_as(rhs->func)) return false;
+      if (lhs->func->name != rhs->func->name) return false;
       if (lhs->num_inputs != rhs->num_inputs) return false;
       if (!this->AttrEqual(lhs->attrs, rhs->attrs)) return false;
       if (lhs->args.size() != rhs->args.size()) return false;
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index fc9f30c9a61d..2159dd02de95 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -2,6 +2,14 @@
 import tvm
 from tvm import relay
 from tvm.expr import *
+from tvm.relay.ir_pass import graph_equal
+
+
+def check_json_roundtrip(node):
+    json_str = tvm.save_json(node)
+    back = tvm.load_json(json_str)
+    assert graph_equal(back, node)
+
 
 def test_bad_constructor():
     try:
@@ -21,6 +29,13 @@ def test_span():
     assert isinstance(span, relay.base.Span)
     str(span)
 
+    # span is not a node so we can't use graph_equal
+    # to test the round trip
+    back = tvm.load_json(tvm.save_json(span))
+    assert back.source == span.source
+    assert back.lineno == span.lineno
+    assert back.col_offset == span.col_offset
+
 # Types
 
 def test_tensor_type():
@@ -31,6 +46,7 @@ def test_tensor_type():
     assert tt.shape == shape
     assert tt.span == None
     str(tt)
+    check_json_roundtrip(tt)
 
 
 def test_type_param():
@@ -38,21 +54,23 @@ def test_type_param():
     assert tp.kind == relay.Kind.Type
     # assert tp.span  # TODO allow us to set span
     str(tp)
+    check_json_roundtrip(tp)
 
 
 def test_func_type():
     type_params = tvm.convert([])
     type_constraints = tvm.convert([])  # TODO: fill me in
     arg_types = tvm.convert([])
-    ret_type = None
+    ret_type = relay.TensorType((1, 2, 3), 'float32')
     tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
     assert tf.type_params == type_params
     assert tf.type_constraints == type_constraints
     assert tf.arg_types == arg_types
     assert tf.ret_type == ret_type
     assert tf.span == None
-    # TODO make sure we can set
+    # TODO make sure we can set span
     str(tf)
+    check_json_roundtrip(tf)
 
 
 def test_tuple_type():
@@ -63,13 +81,15 @@ def test_tuple_type():
 
     tup_ty = relay.TupleType(fields)
     assert tup_ty.fields == fields
+    str(tup_ty)
+    check_json_roundtrip(tup_ty)
 
 
 def test_type_relation():
     tp = relay.TypeVar('tp', relay.Kind.Type)
     tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
     tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
-    args = tvm.convert([tf, tt, tp])
+    args = tvm.convert([tp, tf, tt])
 
     num_inputs = 2
     func = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
@@ -78,6 +98,8 @@ def test_type_relation():
     tr = relay.TypeRelation(func, args, num_inputs, attrs)
     assert tr.args == args
     assert tr.num_inputs == num_inputs
+    str(tr)
+    check_json_roundtrip(tr)
 
 
 def test_constant():
@@ -86,6 +108,7 @@ def test_constant():
     assert const.data == arr
     assert const.span == None
     str(const)
+    check_json_roundtrip(const)
 
 
 def test_tuple():
@@ -94,6 +117,7 @@ def test_tuple():
     assert tup.fields == fields
     assert tup.span == None
     str(tup)
+    check_json_roundtrip(tup)
 
 
 def test_local_var():
@@ -103,6 +127,7 @@ def test_local_var():
     assert lv.type_annotation is None
     # assert lv.span == None todo(@jroesch): what do we do about spans
     str(lv)
+    check_json_roundtrip(lv)
 
     t1 = relay.ty.TensorType((), "float")
     lv = relay.Var(name_hint, t1)
@@ -116,20 +141,22 @@ def test_global_var():
     gv.name_hint == name_hint
     # assert lv.span == None todo(@jroesch): what do we do about spans
     str(gv)
+    check_json_roundtrip(gv)
 
 
 def test_function():
     param_names = ['a', 'b', 'c', 'd']
     params = tvm.convert([relay.Var(n) for n in param_names])
-    ret_type = None
-    body = None
+    ret_type = relay.TupleType(tvm.convert([]))
+    body = relay.Tuple(tvm.convert([]))
     type_params = tvm.convert([])
-    fn = relay.Function(params, ret_type, body, type_params)
+    fn = relay.Function(params, body, ret_type, type_params)
     assert fn.params == params
     assert fn.body == body
     assert fn.type_params == type_params
     assert fn.span == None
     str(fn)
+    check_json_roundtrip(fn)
 
 
 def test_call():
@@ -141,6 +168,7 @@ def test_call():
     assert call.args == args
     assert call.span == None
     str(call)
+    check_json_roundtrip(call)
 
 
 def test_let():
@@ -156,6 +184,7 @@ def test_let():
     assert let.body == lv
     assert let.span == None
     str(let)
+    check_json_roundtrip(let)
 
 
 def test_if():
@@ -168,6 +197,7 @@ def test_if():
     assert ife.false_branch == right
     assert ife.span == None
     str(ife)
+    check_json_roundtrip(ife)
 
 
 def test_tuple_get_item():
@@ -176,6 +206,8 @@ def test_tuple_get_item():
     assert get.tuple_value == tup
     assert get.index == 1
     str(get)
+    check_json_roundtrip(get)
+
 
 if __name__ == "__main__":
     test_bad_constructor()

From 23ed7f72f7ae72761903e3f4a376f3a958fa8d55 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Tue, 23 Oct 2018 19:00:32 -0700
Subject: [PATCH 270/529] [RELAY][TypeSystem] Add support for populating type
 args (#1962)

---
 include/tvm/relay/op.h                |  30 ++++++++
 src/relay/ir/text_printer.cc          |  21 +++++-
 src/relay/pass/type_infer.cc          | 104 +++++++++++++++++++-------
 tests/python/relay/test_type_infer.py |  17 +++++
 4 files changed, 142 insertions(+), 30 deletions(-)

diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index fe6d957e79ed..9f28fbebccfc 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -485,6 +485,36 @@ inline ValueType OpMap<ValueType>::get(const Op& op,
   return map_.get<ValueType>(op, def_value);
 }
 
+/*!
+ * \brief Check that an expression is a "primtive operator".
+ *
+ * Will return true if the expression is an operator which
+ * matches the form of primtive operators registered directly
+ * by the Relay codebase.
+ *
+ * That is the arguments are all type variables, and there is a single
+ * type relation applied to the input and output types.
+ */
+inline bool IsPrimitiveOp(const Expr& expr) {
+  const auto* op = expr.as<OpNode>();
+
+  if (!op) {
+    return false;
+  }
+
+  const auto& fn_ty = op->op_type;
+  if (fn_ty->type_constraints.size() != 1) return false;
+
+  const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
+  if (rel == nullptr) return false;
+  // validate if the type parameter matches up
+  for (size_t i = 0; i < fn_ty->type_params.size(); ++i) {
+    if (!fn_ty->type_params[i].same_as(rel->args[i])) return false;
+  }
+
+  return true;
+}
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_H_
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 0ebe111ab6b2..3cbe1e00b9ca 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -278,10 +278,7 @@ class TextPrinter :
   }
 
   TextValue VisitExpr_(const CallNode* op) final {
-    // TODO(tqchen, M.K.): support generic call
     // possibly through meta-data
-    CHECK_EQ(op->type_args.size(), 0U)
-        << "generic call not yet supported";
     TextValue call_op = GetValue(op->op);
     std::vector<TextValue> args;
     for (Expr arg : op->args) {
@@ -289,7 +286,23 @@ class TextPrinter :
     }
     TextValue id = this->AllocTempVar();
     this->PrintIndent();
-    stream_ << id << " = " << call_op << "(";
+
+    stream_ << id << " = " << call_op;
+
+    auto type_args = op->type_args;
+
+    if (!IsPrimitiveOp(op->op) && type_args.size() > 0U) {
+      stream_ << "<";
+      for (size_t i = 0; i < op->type_args.size(); ++i) {
+        this->PrintType(type_args[i], stream_);
+        if (i + 1 != type_args.size()) {
+          stream_ << ", ";
+        }
+      }
+      stream_ << ">";
+    }
+
+    stream_ << "(";
     for (size_t i = 0; i < args.size(); ++i) {
       stream_ << args[i];
       if (i + 1 != args.size()) {
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 0cbce833aed9..87fdb1c0ffba 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -61,6 +61,17 @@ TVM_REGISTER_API("tvm.relay.type_relation.TupleGetItem")
 .set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
     TupleGetItemRel);
 
+struct ResolvedTypeInfo {
+  explicit ResolvedTypeInfo(Type checked_type, Array<Type> type_args)
+      : checked_type(checked_type), type_args(type_args) {}
+  ResolvedTypeInfo() {}
+
+  Type checked_type;
+  // Only allocated when the expression is a call.
+
+  Array<Type> type_args = Array<Type>(NodePtr<Node>(nullptr));
+};
+
 //
 // The inference algorithm can roughly be devided into three stages:
 // - Populate the constraints by visiting the expression (TypeInferencer.GetType)
@@ -87,7 +98,8 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   Environment env_;
   // map from expression to checked type
   // type inferencer will populate it up
-  std::unordered_map<Expr, Type, NodeHash, NodeEqual> type_map_;
+  std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual> type_map_;
+
   // The solver used by the inferencer.
   TypeSolver solver_;
   // relation function
@@ -111,11 +123,12 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   // will call visit to deduce it if it is not in the type_map_
   Type GetType(const Expr &expr) {
     auto it = type_map_.find(expr);
-    if (it != type_map_.end()) {
-      return it->second;
+    if (it != type_map_.end() && it->second.checked_type.defined()) {
+      return it->second.checked_type;
     }
     Type ret = this->VisitExpr(expr);
-    type_map_[expr] = ret;
+    ResolvedTypeInfo& rti = type_map_[expr];
+    rti.checked_type = ret;
     return ret;
   }
 
@@ -176,7 +189,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     }
     CHECK(!type_map_.count(op->var));
     // NOTE: no scoping is necessary because var are unique in program
-    type_map_[op->var] = vtype;
+    type_map_[op->var].checked_type = vtype;
     return GetType(op->body);
   }
 
@@ -224,6 +237,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
       subst_map.Set(ty_param, fresh);
       ty_args->push_back(fresh);
     }
+
     Type ret_type = fn_ty->ret_type;
 
     // If the function type is incomplete, place a new IncompleteType
@@ -234,6 +248,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     if (!ret_type.defined()) {
       ret_type = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
     }
+
     Type inst_ty = FuncTypeNode::make(fn_ty->arg_types,
                                       ret_type, {},
                                       fn_ty->type_constraints);
@@ -241,49 +256,74 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     return Downcast<FuncType>(inst_ty);
   }
 
+  void AddTypeArgs(const Expr& expr, Array<Type> type_args) {
+    auto type_info = type_map_.find(expr);
+    if (type_info == type_map_.end()) {
+      type_map_.insert({expr, ResolvedTypeInfo(Type(), type_args)});
+    } else {
+      CHECK(!type_info->second.type_args.defined());
+      type_info->second.type_args = type_args;
+    }
+  }
+
   // Handle general call node.
-  Type GeneralCall(const CallNode* op, Array<Type> arg_types) {
-    Type ftype = GetType(op->op);
+  Type GeneralCall(const CallNode* call, Array<Type> arg_types) {
+    Type ftype = GetType(call->op);
     auto* fn_ty_node = ftype.as<FuncTypeNode>();
+
     CHECK(fn_ty_node != nullptr)
         << "only expressions with function types can be called, at "
-        << op->span;
+        << call->span;
 
     Array<Type> type_args;
     FuncType fn_ty = Instantiate(fn_ty_node, &type_args);
+
+    AddTypeArgs(GetRef<Call>(call), type_args);
+
     size_t type_arity = fn_ty->arg_types.size();
     size_t number_of_args = arg_types.size();
 
     if (type_arity != number_of_args) {
       if (type_arity < number_of_args) {
-        LOG(FATAL) << "the function is provided too many arguments " << op->span;
+        LOG(FATAL) << "the function is provided too many arguments " << call->span;
       } else {
-        LOG(FATAL) << "the function is provided too few arguments" << op->span;
+        LOG(FATAL) << "the function is provided too few arguments" << call->span;
       }
     }
+
     for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
-      this->Unify(fn_ty->arg_types[i], arg_types[i], op->args[i]->span);
+      this->Unify(fn_ty->arg_types[i], arg_types[i], call->args[i]->span);
     }
 
     for (auto cs : fn_ty->type_constraints) {
-      solver_.AddConstraint(cs);
+      if (auto tr = cs.as<TypeRelationNode>()) {
+        solver_.AddConstraint(
+          TypeRelationNode::make(tr->func, tr->args, tr->num_inputs, call->attrs));
+      } else {
+        solver_.AddConstraint(cs);
+      }
     }
+
     return fn_ty->ret_type;
   }
 
-  Type VisitExpr_(const CallNode* op) final {
-    // Fast path: well-formed primitive op
+  Type VisitExpr_(const CallNode* call) final {
     Array<Type> arg_types;
-    for (Expr arg : op->args) {
+    for (Expr arg : call->args) {
       arg_types.push_back(GetType(arg));
     }
-    if (const OpNode* opnode = op->op.as<OpNode>()) {
+
+    if (const OpNode* opnode = call->op.as<OpNode>()) {
       Type rtype = PrimitiveCall(opnode->op_type.as<FuncTypeNode>(),
                                  arg_types,
-                                 op->attrs);
-      if (rtype.defined()) return rtype;
+                                 call->attrs);
+      if (rtype.defined()) {
+        AddTypeArgs(GetRef<Call>(call), arg_types);
+        return rtype;
+      }
     }
-    return GeneralCall(op, arg_types);
+
+    return GeneralCall(call, arg_types);
   }
 
   Type VisitExpr_(const FunctionNode* f) final {
@@ -312,7 +352,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
 class TypeInferencer::Resolver : public ExprMutator {
  public:
-  Resolver(const std::unordered_map<Expr, Type, NodeHash, NodeEqual>& tmap,
+  Resolver(const std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual>& tmap,
            TypeSolver* solver)
       : tmap_(tmap), solver_(solver) {
   }
@@ -362,7 +402,7 @@ class TypeInferencer::Resolver : public ExprMutator {
   Expr AttachCheckedType(const T* op) {
     auto it = tmap_.find(GetRef<Expr>(op));
     CHECK(it != tmap_.end());
-    Type checked_type = solver_->Resolve(it->second);
+    Type checked_type = solver_->Resolve(it->second.checked_type);
     CHECK(checked_type.as<IncompleteTypeNode>() == nullptr)
         << "Cannot resolve type of " << GetRef<Expr>(op)
         << " at " << op->span;
@@ -376,25 +416,37 @@ class TypeInferencer::Resolver : public ExprMutator {
       }
       new_e->checked_type_ = checked_type;
     }
+
+    if (it->second.type_args.defined()) {
+      Call call = Downcast<Call>(new_e);
+      const CallNode* const_call_ref = call.operator->();
+      CallNode* call_ref = const_cast<CallNode*>(const_call_ref);
+      call_ref->type_args = it->second.type_args;
+
+      for (size_t i = 0; i < call->type_args.size(); i++) {
+        call_ref->type_args.Set(i, solver_->Resolve(call->type_args[i]));
+      }
+    }
+
     return new_e;
   }
 
-  Type VisitType(const Type& t) final {
+  Type VisitType(const Type &t) final {
     return solver_->Resolve(t);
   }
 
  private:
-  const std::unordered_map<Expr, Type, NodeHash, NodeEqual>& tmap_;
+  const std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual>& tmap_;
   TypeSolver* solver_;
 };
 
 
 Expr TypeInferencer::Infer(Expr expr) {
-  // step 0: populate the constraints
+  // Step 0: Populate the constraints.
   GetType(expr);
-  // step 1: solve the constraints
+  // Step 1: Solve the constraints.
   solver_.Solve();
-  // step 2: attach resolved types to checked_type field
+  // Step 2: Attach resolved types to checked_type field.
   return Resolver(type_map_, &solver_).VisitExpr(expr);
 }
 
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 2d8f98974639..e1d749e75863 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -91,6 +91,21 @@ def test_free_expr():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.scalar_type("float32")
 
+def test_type_args():
+    x = relay.var("x", shape=(10, 10))
+    y = relay.var("y", shape=(1, 10))
+    z = relay.add(x, y)
+    ty_z = relay.ir_pass.infer_type(z)
+    ty_args = ty_z.type_args
+    assert len(ty_args) == 2
+    assert ty_args[0].dtype == "float32"
+    assert ty_args[1].dtype == "float32"
+    sh1 = ty_args[0].shape
+    sh2 = ty_args[1].shape
+    assert sh1[0].value == 10
+    assert sh1[1].value == 10
+    assert sh2[0].value == 1
+    assert sh2[1].value == 10
 
 if __name__ == "__main__":
     test_free_expr()
@@ -100,3 +115,5 @@ def test_free_expr():
     test_decl()
     test_recursion()
     test_tuple()
+    test_free_expr()
+    test_type_args()

From f4dfcab2247688f1a6db28761ea0c2528da4d277 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 25 Oct 2018 00:37:13 +0800
Subject: [PATCH 271/529] Fix int8x4 broadcast value codegen in cuda (#1959)

---
 src/codegen/codegen_cuda.cc                | 10 ++++++++++
 tests/python/unittest/test_codegen_cuda.py | 23 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 2ed8d8e3ff78..0ab56a116eab 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -273,6 +273,16 @@ void CodeGenCUDA::VisitExpr_(const Ramp* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  if (op->type.is_int() && op->type.bits() == 8 && op->lanes == 4) {
+    // make_int8x4
+    const int64_t *p = as_const_int(op->value);
+    CHECK(p);
+    int64_t v = *p & 0xFF;
+    v = (v << 24) | (v << 16) | (v << 8) | v;
+    os << "(int)" << v;
+    return;
+  }
+
   std::string v = PrintExpr(op->value);
   os << "make_";
   PrintType(op->type, os);
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index a0b1cf445ba6..d3b770790bdb 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -87,7 +87,30 @@ def check_cuda(dtype, n, lanes):
     check_cuda("int8", 64, 8)
     check_cuda("int8", 64, 16)
 
+def test_cuda_make_int8x4():
+    def check_cuda(n, value):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        lanes = 4
+        dtype = 'int8'
+        ctx = tvm.gpu(0)
+        A = tvm.compute((n, lanes), lambda i,j: tvm.const(value, dtype=dtype))
+        s = tvm.create_schedule(A.op)
+        y, x = s[A].op.axis
+        s[A].vectorize(x)
+        s[A].bind(y, tvm.thread_axis("blockIdx.x"))
+        fun = tvm.build(s, [A], "cuda", name="make_int8x4")
+        np_a = np.full((n, lanes), value, dtype=dtype)
+        a = tvm.nd.empty(np_a.shape, dtype, ctx)
+        fun(a)
+        np.testing.assert_equal(a.asnumpy(), np_a)
+    check_cuda(64, 0xAB)
+    check_cuda(64, 0)
+    check_cuda(64, -3)
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
     test_cuda_vectorize_load()
+    test_cuda_make_int8x4()

From 2f0a8a0da46ecbbce1deefb4df2fc173bd8889f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Wed, 24 Oct 2018 11:44:18 -0700
Subject: [PATCH 272/529] fix pydoc format (#1975)

---
 python/tvm/relay/ty.py | 54 ++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 088f076abb75..0f7e0e82ad4d 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -31,11 +31,17 @@ class TensorType(Type):
 
     Parameters
     ----------
-    shape: List[tvm.Expr]
+    shape : List[tvm.Expr]
         The shape of the Tensor
 
-    dtype: str, optional
+    dtype : Optional[str]
         The content data type.
+        Default to "float32".
+
+    Returns
+    -------
+    tensor_type : tvm.relay.TensorType
+        The tensor type.
     """
     def __init__(self, shape, dtype="float32"):
         self.__init_handle_by_constructor__(
@@ -57,10 +63,10 @@ class Kind(IntEnum):
 
 @register_relay_node
 class TypeVar(Type):
-    """A type parameter used for generic types in Relay,
+    """A type variable used for generic types in Relay,
     see tvm/relay/type.h for more details.
 
-    A type parameter represents a type placeholder which will
+    A type variable represents a type placeholder which will
     be filled in later on. This allows the user to write
     functions which are generic over types.
     """
@@ -70,16 +76,17 @@ def __init__(self, var, kind=Kind.Type):
 
         Parameters
         ----------
-        var: tvm.expr.Var
+        var : tvm.expr.Var
             The tvm.Var which backs the type parameter.
 
-        kind: Kind, optional
+        kind : Optional[Kind]
             The kind of the type parameter.
+            Default to Kind.Type.
 
         Returns
         -------
-        type_param: TypeVar
-            The type parameter.
+        type_var : tvm.relay.TypeVar
+            The type variable.
         """
         self.__init_handle_by_constructor__(_make.TypeVar, var, kind)
 
@@ -102,11 +109,13 @@ def __init__(self, fields):
 
         Parameters
         ----------
-        fields: List[tvm.relay.Type]
+        fields : List[tvm.relay.Type]
+            The fields in the tuple
 
         Returns
         -------
-        tuple_type: the tuple type
+        tuple_type : tvm.relay.TupleType
+            the tuple type
         """
         self.__init_handle_by_constructor__(_make.TupleType, fields)
 
@@ -125,16 +134,16 @@ class FuncType(Type):
 
     Parameters
     ----------
-    arg_types: List[tvm.relay.Type]
+    arg_types : List[tvm.relay.Type]
         The argument types
 
-    ret_type: tvm.relay.Type
+    ret_type : tvm.relay.Type
         The return type.
 
-    type_params: List[tvm.relay.TypeVar]
+    type_params : Optional[List[tvm.relay.TypeVar]]
         The type parameters
 
-    type_constraints: List[tvm.relay.TypeConstraint]
+    type_constraints : Optional[List[tvm.relay.TypeConstraint]]
         The type constraints.
     """
     def __init__(self,
@@ -163,18 +172,23 @@ class TypeRelation(TypeConstraint):
 
     Parameters
     ----------
-    func: EnvFunc
+    func : EnvFunc
         User defined relation function.
 
-    args: list of types
+    args : [tvm.relay.Type]
         List of types to the func.
 
-    num_inputs: int
+    num_inputs : int
         Number of input arguments in args,
         this act as a hint for type inference.
 
-    attrs: Attrs
+    attrs : Attrs
         The attribute attached to the relation information
+
+    Returns
+    -------
+    type_relation : tvm.relay.TypeRelation
+        The type relation.
     """
     def __init__(self, func, args, num_inputs, attrs):
         self.__init_handle_by_constructor__(_make.TypeRelation,
@@ -188,12 +202,12 @@ def scalar_type(dtype):
 
     Parameters
     ----------
-    dtype: str
+    dtype : str
         The content data type.
 
     Returns
     -------
-    s_type: tvm.relay.TensorType
+    s_type : tvm.relay.TensorType
         The result type.
     """
     return TensorType((), dtype)

From 5fa98134a88661616ef66c0bccc20789e3c28275 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 24 Oct 2018 13:51:12 -0700
Subject: [PATCH 273/529] [RELAY] BiasAdd, MLP, Resnet testing (#1969)

* [RELAY] BiasAdd, MLP, Resnet testing

* fix review comments
---
 docs/langref/relay_op.rst                  |  13 +
 include/tvm/relay/attrs/nn.h               |  17 ++
 include/tvm/relay/attrs/transform.h        |  10 +
 include/tvm/relay/expr_functor.h           |  30 ++-
 include/tvm/relay/pass.h                   |  29 +--
 python/tvm/expr.py                         |   3 +
 python/tvm/relay/expr.py                   |  32 ++-
 python/tvm/relay/ir_pass.py                |  40 +--
 python/tvm/relay/op/nn/nn.py               | 161 +++++++-----
 python/tvm/relay/testing/__init__.py       |   5 +
 python/tvm/relay/testing/init.py           | 149 +++++++++++
 python/tvm/relay/testing/layers.py         | 114 +++++++++
 python/tvm/relay/testing/mlp.py            |  93 +++++++
 python/tvm/relay/testing/resnet.py         | 276 +++++++++++++++++++++
 python/tvm/relay/ty.py                     |  15 ++
 src/relay/ir/expr_functor.cc               |   9 +-
 src/relay/ir/text_printer.cc               |  17 +-
 src/relay/ir/type_functor.h                |  29 +--
 src/relay/op/nn/nn.cc                      | 112 ++++++---
 src/relay/op/tensor/transform.cc           |  58 ++++-
 src/relay/pass/kind_check.cc               |   2 +-
 src/relay/pass/type_infer.cc               |   1 -
 src/relay/pass/util.cc                     | 133 +++++-----
 src/relay/pass/well_formed.cc              |   1 -
 tests/python/relay/test_ir_text_printer.py |  11 +
 tests/python/relay/test_ir_well_formed.py  |   9 +-
 tests/python/relay/test_op_level1.py       |  18 +-
 tests/python/relay/test_op_level3.py       |  10 +
 28 files changed, 1160 insertions(+), 237 deletions(-)
 create mode 100644 python/tvm/relay/testing/__init__.py
 create mode 100644 python/tvm/relay/testing/init.py
 create mode 100644 python/tvm/relay/testing/layers.py
 create mode 100644 python/tvm/relay/testing/mlp.py
 create mode 100644 python/tvm/relay/testing/resnet.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 6eba6b25d9fd..42883f5f77da 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -40,6 +40,8 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.nn.relu
    tvm.relay.nn.dropout
    tvm.relay.nn.batch_norm
+   tvm.relay.nn.bias_add
+
 
 
 **Level 2: Convolutions**
@@ -85,8 +87,13 @@ This level enables additional math and transform operators.
    tvm.relay.abs
    tvm.relay.negative
    tvm.relay.take
+   tvm.relay.zeros
+   tvm.relay.zeros_like
+   tvm.relay.ones
+   tvm.relay.ones_like
    tvm.relay.full
    tvm.relay.full_like
+   tvm.relay.cast
 
 
 **Level 4: Broadcast and Reductions**
@@ -151,6 +158,9 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.nn.softmax
 .. autofunction:: tvm.relay.nn.log_softmax
 .. autofunction:: tvm.relay.nn.relu
+.. autofunction:: tvm.relay.nn.dropout
+.. autofunction:: tvm.relay.nn.batch_norm
+.. autofunction:: tvm.relay.nn.bias_add
 
 
 Level 2 Definitions
@@ -185,6 +195,9 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.zeros_like
 .. autofunction:: tvm.relay.ones
 .. autofunction:: tvm.relay.ones_like
+.. autofunction:: tvm.relay.full
+.. autofunction:: tvm.relay.full_like
+.. autofunction:: tvm.relay.cast
 
 
 Level 4 Definitions
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 6b522ef3bfd0..eb044ccb29fd 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -12,6 +12,23 @@
 namespace tvm {
 namespace relay {
 
+/*!
+ * \brief Add a 1D Tensor to an axis of a data.
+ *
+ * \note bias_add is a special add operator that is in nn
+ *   and enables automatic derivation of bias's shape.
+ *   You can directly use add for more generalized case.
+ */
+struct BiasAddAttrs : public tvm::AttrsNode<BiasAddAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(BiasAddAttrs, "relay.attrs.BiasAddAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis to add the bias")
+        .set_default(1);
+  }
+};
+
 /*! \brief Attributes used in convolution operators */
 struct Conv2DAttrs : public tvm::AttrsNode<Conv2DAttrs> {
   Array<IndexExpr> strides;
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 8e2b741091b3..1941e045ed8d 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -12,6 +12,16 @@
 namespace tvm {
 namespace relay {
 
+/*! \brief data type cast */
+struct CastAttrs : public tvm::AttrsNode<CastAttrs> {
+  DataType dtype;
+
+  TVM_DECLARE_ATTRS(CastAttrs, "relay.attrs.CastAttrs") {
+    TVM_ATTR_FIELD(dtype)
+        .describe("Target data type");
+  }
+};  // struct CastAttrs.
+
 /*! \brief Attributes used in expand_dims operators */
 struct ExpandDimsAttrs : public tvm::AttrsNode<ExpandDimsAttrs> {
   int axis;
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index c10933590f99..c0256cf3a1c3 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -112,15 +112,17 @@ class ExprFunctor<R(const Expr& n, Args...)> {
   }
 };
 
-/*! \brief A simple visitor wrapper around ExprFunctor.
+/*!
+ * \brief A simple visitor wrapper around ExprFunctor.
+ *  Recursively visit the content.
  *
- * Exposes two visitors with default traversal strategies, one
- * which doesn't compute a result but can mutate internal state,
- * and another which functionally builds a new Expr.
+ * ExprVisitor treats Expr as dataflow graph,
+ * and only visit each Expr node once.
  */
-
-class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
+class ExprVisitor
+    : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
  public:
+  void VisitExpr(const Expr& expr) override;
   void VisitExpr_(const VarNode* op) override;
   void VisitExpr_(const GlobalVarNode* op) override;
   void VisitExpr_(const ConstantNode* op) override;
@@ -132,13 +134,19 @@ class ExprVisitor : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
   void VisitExpr_(const OpNode* op) override;
   void VisitExpr_(const TupleGetItemNode* op) override;
   virtual void VisitType(const Type& t);
+
+ private:
+  // internal visited flag.
+  std::unordered_set<const Node*> visited_;
 };
 
-/*! \brief A wrapper around ExprFunctor which functionally updates the AST.
-*
-* ExprMutator uses memoization and self return in order to amortize
-* the cost of using functional updates.
-*/
+/*!
+ * \brief A wrapper around ExprFunctor which functionally updates the AST.
+ *
+ * ExprMutator treats Expr as dataflow graph, and only Mutate each Expr once.
+ * The mutated results are memoized in a map and reused so that
+ * local transformation on the dataflow preserves the graph structure.
+ */
 class ExprMutator
     : public ::tvm::relay::ExprFunctor<Expr(const Expr&)> {
  public:
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 9a3b75364167..1b3462659e18 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -102,35 +102,26 @@ bool AlphaEqual(const Type& t1, const Type& t2);
  */
 bool WellFormed(const Expr& e);
 
-/*! \brief Get free variables from expression e.
+/*! \brief Get free Vars from expr in PostDFS order.
  *
- * Free variables are variables that are not bound by a let or a function parameter in the context.
+ * Free variables are variables that are not bound by a
+ * let or a function parameter in the context.
  *
- * \param e the expression.
+ * \param expr the expression.
  *
- * \return the set of free variable.
+ * \return List of free vars, in the PostDFS order visited by expr.
  */
-tvm::Array<Var> FreeVariables(const Expr& e);
+tvm::Array<Var> FreeVars(const Expr& expr);
 
-/*! \brief Get free type parameters from expression e.
+/*! \brief Get free TypeVars from expression expr.
  *
  * Free type parameters are type parameters that are not bound by a function type in the context.
  *
- * \param e the expression.
+ * \param expr the expression.
  *
- * \return the set of free type variables.
+ * \return List of free vars, in the PostDFS order visited by expr.
  */
-tvm::Array<TypeVar> FreeTypeVariables(const Expr& e);
-
-/*! \brief Get free type parameters from type t.
- *
- * Free type parameters are type parameters that are not bound by a function type in the context.
- *
- * \param t the type.
- *
- * \return the set of free type variables.
- */
-tvm::Array<TypeVar> FreeTypeVariables(const Type& t);
+tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
 
 /*! \brief Remove expressions which does not effect the program result.
  *
diff --git a/python/tvm/expr.py b/python/tvm/expr.py
index 00a523416c85..bdb253d21582 100644
--- a/python/tvm/expr.py
+++ b/python/tvm/expr.py
@@ -299,6 +299,9 @@ def __init__(self, dtype, value):
         self.__init_handle_by_constructor__(
             _make.IntImm, dtype, value)
 
+    def __int__(self):
+        return self.value
+
 
 @register_node
 class UIntImm(ConstExpr):
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 36116d07d601..655379066c74 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -6,7 +6,7 @@
 from .base import RelayNode, register_relay_node
 from . import _make
 from . import ty as _ty
-from .._ffi import base as _base, node as _node
+from .._ffi import base as _base
 from .. import nd as _nd
 from .. import convert
 
@@ -28,6 +28,25 @@ def checked_type(self):
                              " the checked_type for this node")
         return ret
 
+    def astype(self, dtype):
+        """Cast the content type of the current data to dtype.
+
+        Parameters
+        ----------
+        dtype : str
+            The target data type.
+
+        Note
+        ----
+        This function only works for TensorType Exprs.
+
+        Returns
+        -------
+        result : tvm.relay.Expr
+            The result expression.
+        """
+        return _make.dtype_cast(self, dtype)
+
 
 @register_relay_node
 class Constant(Expr):
@@ -62,6 +81,9 @@ def __getitem__(self, index):
     def __len__(self):
         return len(self.fields)
 
+    def astype(self, _):
+        raise TypeError("astype cannot be used on tuple")
+
 
 @register_relay_node
 class Var(Expr):
@@ -238,7 +260,7 @@ def __init__(self, tuple_value, index):
             _make.TupleGetItem, tuple_value, index)
 
 
-class TupleWrapper(_node.NodeGeneric):
+class TupleWrapper(object):
     """TupleWrapper.
 
     This class is a Python wrapper for a Relay tuple of known size.
@@ -257,10 +279,9 @@ def __init__(self, tuple_value, size):
         self.tuple_value = tuple_value
         self.size = size
 
-    def asnode(self):
+    def astuple(self):
         """Returns the underlying Relay tuple if this wrapper is passed
         as an argument to an FFI function."""
-
         return self.tuple_value
 
     def __getitem__(self, index):
@@ -275,6 +296,9 @@ def __repr__(self):
         return ("TupleWrapper(" + self.tuple_value.__repr__() +
                 ", " + self.size + ")")
 
+    def astype(self, _):
+        raise TypeError("astype cannot be used on tuple")
+
 
 def var(name_hint,
         type_annotation=None,
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 22ee918039b5..c6d5aa7515bc 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -15,16 +15,16 @@ def infer_type(expr, env=None):
     Parameters
     ----------
     expr: tvm.relay.Expr
-      The input expression.
+        The input expression.
 
     env: Optional[tvm.relay.Environment]
-      The global environment.
+        The global environment.
 
 
     Returns
     -------
     checked_expr : tvm.relay.Expr
-      The checked expression.
+        The checked expression.
     """
     return _ir_pass.infer_type(expr, env)
 
@@ -35,12 +35,12 @@ def well_formed(expr):
     Parameters
     ----------
     expr: tvm.relay.Expr
-      The input expression
+        The input expression
 
     Returns
     -------
     well_form : bool
-      whether the input expression is well formed
+        Whether the input expression is well formed
     """
     return _ir_pass.well_formed(expr)
 
@@ -52,15 +52,15 @@ def check_kind(t, env=None):
     Parameters
     ----------
     t: tvm.relay.Type
-      The type to check
+        The type to check
 
     env: tvm.relay.Environment, optional
-      The global environment
+        The global environment
 
     Returns
     -------
     well_kinded : bool
-      whether the input type is well kinded.
+        whether the input type is well kinded.
 
     Examples
     --------
@@ -75,20 +75,26 @@ def check_kind(t, env=None):
         return _ir_pass.check_kind(t)
 
 
-def free_vars(e):
-    """Get free variables from expression e.
+def free_vars(expr):
+    """Get free Vars from expression expr in Post DFS order.
 
     Parameters
     ----------
-    e: tvm.relay.Expr
-      The input expression
+    expr: tvm.relay.Expr
+        The input expression
 
     Returns
     -------
     free : List[tvm.relay.Var]
-        The list of free variables
+        The list of free variables in post DFS order.
+
+    Note
+    ----
+    The fact that Vars are post-DFS ordred are useful in
+    neural networks: usually this means weights of previous
+    are ordered first.
     """
-    return _ir_pass.free_vars(e)
+    return _ir_pass.free_vars(expr)
 
 
 def free_type_vars(expr):
@@ -130,15 +136,15 @@ def alpha_equal(lhs, rhs):
     Parameters
     ----------
     lhs: tvm.relay.Expr
-      One of the input Expression.
+        One of the input Expression.
 
     rhs: tvm.relay.Expr
-      One of the input Expression.
+        One of the input Expression.
 
     Returns
     -------
     result: bool
-      True iff lhs is alpha equal to rhs.
+        True iff lhs is alpha equal to rhs.
     """
     return bool(_make._alpha_equal(lhs, rhs))
 
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 8a5357e4a2df..d0ccfcb44899 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -43,10 +43,10 @@ def conv2d(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
-    weight : relay.Expr
+    weight : tvm.relay.Expr
         The weight expressions.
 
     strides : tuple of int, optional
@@ -81,7 +81,7 @@ def conv2d(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.conv2d(data, weight, strides, padding, dilation,
@@ -105,10 +105,10 @@ def conv2d_transpose(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
-    weight : relay.Expr
+    weight : tvm.relay.Expr
         The weight expressions.
 
     strides : Tuple[int], optional
@@ -137,7 +137,7 @@ def conv2d_transpose(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.conv2d_transpose(data, weight, strides, padding, dilation,
@@ -155,7 +155,7 @@ def softmax(data, axis=1):
 
     Parameters
     ----------
-    data: relay.Expr
+    data: tvm.relay.Expr
         The input data to the operator.
 
     axis: int, optional
@@ -163,7 +163,7 @@ def softmax(data, axis=1):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.softmax(data, axis)
@@ -181,7 +181,7 @@ def log_softmax(data, axis):
 
     Parameters
     ----------
-    data: relay.Expr
+    data: tvm.relay.Expr
         The input data to the operator.
 
     axis: int
@@ -189,7 +189,7 @@ def log_softmax(data, axis):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.log_softmax(data, axis)
@@ -224,7 +224,7 @@ def max_pool2d(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     strides : tuple of int, optional
@@ -241,7 +241,7 @@ def max_pool2d(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.max_pool2d(data, pool_size, strides, padding,
@@ -278,7 +278,7 @@ def avg_pool2d(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     strides : tuple of int, optional
@@ -298,7 +298,7 @@ def avg_pool2d(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.avg_pool2d(data, pool_size, strides, padding,
@@ -325,7 +325,7 @@ def global_max_pool2d(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     layout : str, optional
@@ -333,7 +333,7 @@ def global_max_pool2d(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.global_max_pool2d(data, layout)
@@ -359,7 +359,7 @@ def global_avg_pool2d(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     layout : str, optional
@@ -367,7 +367,7 @@ def global_avg_pool2d(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.global_avg_pool2d(data, layout)
@@ -389,10 +389,10 @@ def upsampling(data,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
-    scale : relay.Expr
+    scale : tvm.relay.Expr
         The scale factor for upsampling.
 
     layout : str, optional
@@ -403,11 +403,12 @@ def upsampling(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.upsampling(data, scale, layout, method)
 
+
 def batch_flatten(data):
     """BatchFlatten.
 
@@ -420,17 +421,43 @@ def batch_flatten(data):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     Returns
     -------
-    result: relay.Expr
+    result : tvm.relay.Expr
         The Flattened result.
     """
     return _make.batch_flatten(data)
 
 
+def bias_add(data, bias, axis=1):
+    """add_bias operator.
+
+    Add 1D bias to the axis of data.
+    This function is a special case of add which allows
+    inference of shape of the bias from data.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    bias : tvm.relay.Expr
+        The bias to be added.
+
+    axis : int, optional
+        The axis to add the bias.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The final result.
+    """
+    return _make.bias_add(data, bias, axis)
+
+
 def dense(data, weight, units=None):
     """Dense operator.
     Applies a linear transformation
@@ -441,10 +468,10 @@ def dense(data, weight, units=None):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
-    weight : relay.Expr
+    weight : tvm.relay.Expr
         The weight expressions.
 
     units : int, optional
@@ -452,7 +479,7 @@ def dense(data, weight, units=None):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.dense(data, weight, units)
@@ -466,12 +493,12 @@ def relu(data):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.relu(data)
@@ -487,7 +514,7 @@ def leaky_relu(data, alpha):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     alpha : float
@@ -495,7 +522,7 @@ def leaky_relu(data, alpha):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.leaky_relu(data, alpha)
@@ -511,7 +538,7 @@ def pad(data,
 
     Parameters
     ----------
-    data: relay.Expr
+    data: tvm.relay.Expr
         The input data to the operator
     pad_width: tuple of <tuple of <int>>, required
         Number of values padded to the edges of each axis, in the format
@@ -521,7 +548,7 @@ def pad(data,
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.pad(data, pad_width, pad_value)
@@ -540,7 +567,7 @@ def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     size : int, optional
@@ -560,7 +587,7 @@ def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.lrn(data, size, axis, alpha, beta, bias)
@@ -574,7 +601,7 @@ def l2_normalize(data, eps, axis=None):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     eps : float
@@ -585,11 +612,12 @@ def l2_normalize(data, eps, axis=None):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The computed result.
     """
     return _make.l2_normalize(data, eps, axis)
 
+
 def dropout(data, rate=0.5):
     """Applies the dropout operation to the input array.
 
@@ -599,7 +627,7 @@ def dropout(data, rate=0.5):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
     rate : float, optional (default=0.5)
@@ -607,17 +635,22 @@ def dropout(data, rate=0.5):
 
     Returns
     -------
-    result : relay.Tuple([relay.Expr, relay.Expr])
-        The first member of the tuple is the result of dropping elements from ``data``
-        and rescaling. The second member is a "mask" tensor, which is of the same
-        shape and data type as ``data`` and, for each element in ``data``, is 1.0
-        if the element was not dropped and 0.0 if it was.
+    result : tvm.relay.Expr
+        The result of dropout
     """
     result = _make.dropout(data, rate)
-    return TupleWrapper(result, 2)
-
-def batch_norm(data, gamma, beta, moving_mean, moving_var,
-               axis=1, epsilon=1e-5, center=True, scale=True):
+    return TupleWrapper(result, 2)[0]
+
+
+def batch_norm(data,
+               gamma,
+               beta,
+               moving_mean,
+               moving_var,
+               axis=1,
+               epsilon=1e-5,
+               center=True,
+               scale=True):
     r"""
     Batch normalization layer (Ioffe and Szegedy, 2014).
     Normalizes the input at each batch, i.e. applies a transformation
@@ -658,34 +691,50 @@ def batch_norm(data, gamma, beta, moving_mean, moving_var,
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         Input to which batch_norm will be applied.
-    gamma : relay.Expr
+
+    gamma : tvm.relay.Expr
         The gamma scale factor.
-    beta : relay.Expr
+
+    beta : tvm.relay.Expr
         The beta offset factor.
-    moving_mean : relay.Expr
+
+    moving_mean : tvm.relay.Expr
         Running mean of input,
-    moving_var : relay.Expr
+
+    moving_var : tvm.relay.Expr
         Running variance of input.
+
     axis : int, optional, default=1
         Specify along which shape axis the channel is specified.
+
     epsilon : double, optional, default=1e-5
         Small float added to variance to avoid diving by zero.
+
     center : boolean, optional, default=True
         If True, add offset of beta to normalized tensor, If False,
         beta is ignored.
+
     scale : boolean, optional, default=True
         If true, multiply by gamma. If False, gamma is not used.
         When the next layer is piecewise linear (also e.g. nn.relu),
-        this can be disabled since the scalingwill be done by the next layer.
+        this can be disabled since the scaling will be done by the next layer.
 
     Returns
     -------
-    result : relay.Tuple([relay.Expr, relay.Expr, relay.Expr])
-        Tuple of normed data (same shape as input), new running mean (k-length vector),
+    result : relay.Tuple([tvm.relay.Expr, tvm.relay.Expr, tvm.relay.Expr])
+        Tuple of normed data (same shape as input),
+        new running mean (k-length vector),
         and new running variance (k-length vector)
     """
-    result = _make.batch_norm(data, gamma, beta, moving_mean, moving_var,
-                              axis, epsilon, center, scale)
+    result = _make.batch_norm(data,
+                              gamma,
+                              beta,
+                              moving_mean,
+                              moving_var,
+                              axis,
+                              epsilon,
+                              center,
+                              scale)
     return TupleWrapper(result, 3)
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
new file mode 100644
index 000000000000..9bfda3c7abc7
--- /dev/null
+++ b/python/tvm/relay/testing/__init__.py
@@ -0,0 +1,5 @@
+"""Utilities for testing and benchmarks"""
+from __future__ import absolute_import as _abs
+
+from . import mlp
+from . import resnet
diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py
new file mode 100644
index 000000000000..fdbde9d289d6
--- /dev/null
+++ b/python/tvm/relay/testing/init.py
@@ -0,0 +1,149 @@
+"""Initializer of parameters."""
+import tvm
+from tvm import relay
+import numpy as np
+
+class Initializer(object):
+    """The base class of an initializer."""
+    def __init__(self, **kwargs):
+        self._kwargs = kwargs
+
+    def __call__(self, desc, arr):
+        """Initialize an array
+
+        Parameters
+        ----------
+        desc : str
+            Initialization pattern descriptor.
+
+        arr : NDArray
+            The array to be initialized.
+        """
+        if desc.endswith('weight'):
+            self._init_weight(desc, arr)
+        elif desc.endswith('bias'):
+            self._init_bias(desc, arr)
+        elif desc.endswith('gamma'):
+            self._init_gamma(desc, arr)
+        elif desc.endswith('beta'):
+            self._init_beta(desc, arr)
+        elif desc.endswith('mean'):
+            self._init_mean(desc, arr)
+        elif desc.endswith('var'):
+            self._init_var(desc, arr)
+        else:
+            self._init_default(desc, arr)
+
+    def _init_bias(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_gamma(self, _, arr):
+        arr[:] = 1.0
+
+    def _init_beta(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_mean(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_var(self, _, arr):
+        arr[:] = 1.0
+
+    def _init_weight(self, name, arr):
+        """Abstract method to Initialize weight."""
+        raise NotImplementedError("Must override it")
+
+    def _init_default(self, name, _):
+        raise ValueError(
+            'Unknown initialization pattern for %s. ' \
+            'Default initialization is now limited to '\
+            '"weight", "bias", "gamma" (1.0), and "beta" (0.0).' \
+            'Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern' % name)
+
+
+class Xavier(Initializer):
+    """ "Xavier" initialization for weights
+
+    Parameters
+    ----------
+    rnd_type: str, optional
+        Random generator type, can be ``'gaussian'`` or ``'uniform'``.
+
+    factor_type: str, optional
+        Can be ``'avg'``, ``'in'``, or ``'out'``.
+
+    magnitude: float, optional
+        Scale of random number.
+    """
+    def __init__(self, rnd_type="uniform", factor_type="avg", magnitude=3):
+        super(Xavier, self).__init__(rnd_type=rnd_type,
+                                     factor_type=factor_type,
+                                     magnitude=magnitude)
+        self.rnd_type = rnd_type
+        self.factor_type = factor_type
+        self.magnitude = float(magnitude)
+
+    def _init_weight(self, name, arr):
+        shape = arr.shape
+        hw_scale = 1.
+        if len(shape) < 2:
+            raise ValueError('Xavier initializer cannot be applied to vector {0}. It requires at'
+                             ' least 2D.'.format(name))
+        if len(shape) > 2:
+            hw_scale = np.prod(shape[2:])
+        fan_in, fan_out = shape[1] * hw_scale, shape[0] * hw_scale
+        factor = 1.
+        if self.factor_type == "avg":
+            factor = (fan_in + fan_out) / 2.0
+        elif self.factor_type == "in":
+            factor = fan_in
+        elif self.factor_type == "out":
+            factor = fan_out
+        else:
+            raise ValueError("Incorrect factor type")
+        # Hack for mobilenet, because there is less connectivity
+        if "depthwise" in name:
+            factor = 3 * 3
+        scale = np.sqrt(self.magnitude / factor)
+        if self.rnd_type == "uniform":
+            arr[:] = np.random.uniform(-scale, scale, size=arr.shape)
+        else:
+            raise ValueError("Unknown random type")
+
+
+def create_workload(net, initializer=None, seed=0):
+    """Helper function to create benchmark image classification workload.
+
+    Parameters
+    ----------
+    net : tvm.relay.Function
+        The selected function of the network.
+
+    initializer : Initializer
+        The initializer used
+
+    seed : int
+        The seed used in initialization.
+
+    Returns
+    -------
+    net : tvm.relay.Function
+        The updated dataflow
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = relay.ir_pass.infer_type(net)
+    shape_dict = {
+        v.name_hint : v.checked_type for v in net.params}
+    net.astext()
+    np.random.seed(seed)
+    initializer = initializer if initializer else Xavier()
+    params = {}
+    for k, v in shape_dict.items():
+        if k == "data":
+            continue
+        init_value = np.zeros(v.concrete_shape).astype(v.dtype)
+        initializer(k, init_value)
+        params[k] = tvm.nd.array(init_value, ctx=tvm.cpu(0))
+    return net, params
diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py
new file mode 100644
index 000000000000..fc06ca229f77
--- /dev/null
+++ b/python/tvm/relay/testing/layers.py
@@ -0,0 +1,114 @@
+"""Simple Layer DSL wrapper to ease creation of neural nets."""
+from tvm import relay
+
+def batch_norm_infer(data,
+                     gamma=None,
+                     beta=None,
+                     moving_mean=None,
+                     moving_var=None,
+                     **kwargs):
+    """Wrapper of batch_norm.
+
+    This function automatically creates weights and return
+    the first output(normalized result).
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    gamma : relay.Expr
+        The gamma scale factor.
+
+    beta : relay.Expr
+        The beta offset factor.
+
+    moving_mean : relay.Expr
+        Running mean of input,
+
+    moving_var : relay.Expr
+        Running variance of input.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not gamma:
+        gamma = relay.var(name + "_gamma")
+    if not beta:
+        beta = relay.var(name + "_beta")
+    if not moving_mean:
+        moving_mean = relay.var(name + "_moving_mean")
+    if not moving_var:
+        moving_var = relay.var(name + "_moving_var")
+    return relay.nn.batch_norm(data,
+                               gamma=gamma,
+                               beta=beta,
+                               moving_mean=moving_mean,
+                               moving_var=moving_var,
+                               **kwargs)[0]
+
+
+def conv2d(data, weight=None, **kwargs):
+    """Wrapper of conv2d which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    return relay.nn.conv2d(data, weight, **kwargs)
+
+
+def dense_add_bias(data, weight=None, bias=None, **kwargs):
+    """Wrapper of dense which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d.
+
+    bias : relay.Expr
+        The bias.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    if not bias:
+        bias = relay.var(name + "_bias")
+    data = relay.nn.dense(data, weight, **kwargs)
+    data = relay.nn.bias_add(data, bias)
+    return data
diff --git a/python/tvm/relay/testing/mlp.py b/python/tvm/relay/testing/mlp.py
new file mode 100644
index 000000000000..67fa0d90c643
--- /dev/null
+++ b/python/tvm/relay/testing/mlp.py
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+a simple multilayer perceptron
+"""
+from tvm import relay
+from .init import create_workload
+
+def get_net(batch_size,
+            num_classes=10,
+            image_shape=(1, 28, 28),
+            dtype="float32"):
+    """Get network a simple multilayer perceptron.
+
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+    """
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data",
+                     shape=data_shape,
+                     dtype=dtype)
+    data = relay.nn.batch_flatten(data)
+    fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=128)
+    fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias"))
+    act1 = relay.nn.relu(fc1)
+    fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=64)
+    fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias"))
+    act2 = relay.nn.relu(fc2)
+    fc3 = relay.nn.dense(act2, relay.var("fc3_weight"), units=num_classes)
+    fc3 = relay.nn.bias_add(fc3, relay.var("fc3_bias"))
+    mlp = relay.nn.softmax(data=fc3)
+    args = relay.ir_pass.free_vars(mlp)
+    return relay.Function(args, mlp)
+
+
+def get_workload(batch_size,
+                 num_classes=10,
+                 image_shape=(1, 28, 28),
+                 dtype="float32"):
+    """Get benchmark workload for a simple multilayer perceptron.
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_classes, image_shape, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py
new file mode 100644
index 000000000000..cb489f6e2471
--- /dev/null
+++ b/python/tvm/relay/testing/resnet.py
@@ -0,0 +1,276 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+"""
+# pylint: disable=unused-argument
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+def residual_unit(data,
+                  num_filter,
+                  stride,
+                  dim_match,
+                  name,
+                  bottle_neck=True):
+    """Return ResNet Unit symbol for building ResNet
+
+    Parameters
+    ----------
+    data : str
+        Input data
+
+    num_filter : int
+        Number of output channels
+
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+
+    stride : tuple
+        Stride used in convolution
+
+    dim_match : bool
+        True means channel number between input and output is the same,
+        otherwise means differ
+
+    name : str
+        Base name of the operators
+    """
+    if bottle_neck:
+        bn1 = layers.batch_norm_infer(data=data,
+                                      epsilon=2e-5,
+                                      name=name + '_bn1')
+        act1 = relay.relu(data=bn1)
+        conv1 = layers.conv2d(
+            data=act1,
+            channels=int(num_filter*0.25),
+            kernel_size=(1, 1),
+            strides=stride,
+            padding=(0, 0),
+            name=name + '_conv1')
+        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2')
+        act2 = relay.relu(data=bn2)
+        conv2 = layers.conv2d(
+            data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name=name + '_conv2')
+        bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, name=name + '_bn3')
+        act3 = relay.relu(data=bn3)
+        conv3 = layers.conv2d(
+            data=act3, channels=num_filter, kernel_size=(1, 1),
+            strides=(1, 1), padding=(0, 0), name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = layers.conv2d(
+                data=act1, channels=num_filter, kernel_size=(1, 1),
+                strides=stride, name=name+'_sc')
+        return relay.add(conv3, shortcut)
+    else:
+        bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, name=name + '_bn1')
+        act1 = relay.nn.relu(data=bn1)
+        conv1 = layers.conv2d(
+            data=act1, channels=num_filter, kernel_size=(3, 3),
+            strides=stride, padding=(1, 1), name=name + '_conv1')
+        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2')
+        act2 = relay.nn.relu(data=bn2)
+        conv2 = layers.conv2d(
+            data=act2, channels=num_filter, kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = layers.conv2d(
+                data=act1, channels=num_filter, kernel_size=(1, 1),
+                strides=stride, name=name+'_sc')
+        return relay.add(conv2, shortcut)
+
+
+def resnet(units,
+           num_stages,
+           filter_list,
+           num_classes,
+           data_shape,
+           bottle_neck=True,
+           dtype="float32"):
+    """Return ResNet Program.
+
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+
+    num_stages : int
+        Number of stage
+
+    filter_list : list
+        Channel size of each stage
+
+    num_classes : int
+        Ouput size of symbol
+
+    data_shape : tuple of int.
+        The shape of input data.
+
+    bottle_neck : bool
+        Whether apply bottleneck transformation.
+
+    dtype : str
+        The global data type.
+    """
+    num_unit = len(units)
+    assert num_unit == num_stages
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    data = layers.batch_norm_infer(data=data, epsilon=2e-5, scale=False, name='bn_data')
+    (_, _, height, _) = data_shape
+    if height <= 32:            # such as cifar10
+        body = layers.conv2d(
+            data=data, channels=filter_list[0], kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name="conv0")
+    else:                       # often expected to be 224 such as imagenet
+        body = layers.conv2d(
+            data=data, channels=filter_list[0], kernel_size=(7, 7),
+            strides=(2, 2), padding=(3, 3), name="conv0")
+        body = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn0')
+        body = relay.nn.relu(data=body)
+        body = relay.nn.max_pool2d(data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
+
+    for i in range(num_stages):
+        body = residual_unit(
+            body, filter_list[i+1], (1 if i == 0 else 2, 1 if i == 0 else 2),
+            False, name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck)
+        for j in range(units[i]-1):
+            body = residual_unit(
+                body, filter_list[i+1], (1, 1), True,
+                name='stage%d_unit%d' % (i + 1, j + 2), bottle_neck=bottle_neck)
+    bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn1')
+    relu1 = relay.nn.relu(data=bn1)
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = relay.nn.global_avg_pool2d(data=relu1)
+    flat = relay.nn.batch_flatten(data=pool1)
+    fc1 = layers.dense_add_bias(data=flat, units=num_classes, name='fc1')
+    net = relay.nn.softmax(data=fc1)
+    return relay.Function(relay.ir_pass.free_vars(net), net)
+
+
+def get_net(batch_size,
+            num_classes,
+            num_layers=50,
+            image_shape=(3, 224, 224),
+            dtype="float32",
+            **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    (_, height, _) = image_shape
+    data_shape = (batch_size,) + image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}".format(num_layers))
+
+    return resnet(units=units,
+                  num_stages=num_stages,
+                  filter_list=filter_list,
+                  num_classes=num_classes,
+                  data_shape=data_shape,
+                  bottle_neck=bottle_neck,
+                  dtype=dtype)
+
+
+def get_workload(batch_size=1,
+                 num_classes=1000,
+                 num_layers=18,
+                 image_shape=(3, 224, 224),
+                 dtype="float32",
+                 **kwargs):
+    """Get benchmark workload for resnet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    num_layers : int, optional
+        Number of layers
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : relay.Function
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size=batch_size,
+                  num_classes=num_classes,
+                  num_layers=num_layers,
+                  image_shape=image_shape,
+                  dtype=dtype,
+                  **kwargs)
+    return create_workload(net)
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 0f7e0e82ad4d..7ea63e6200bf 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -47,6 +47,21 @@ def __init__(self, shape, dtype="float32"):
         self.__init_handle_by_constructor__(
             _make.TensorType, shape, dtype)
 
+    @property
+    def concrete_shape(self):
+        """Get shape of the type as concrete tuple of int.
+
+        Returns
+        -------
+        shape : List[int]
+            The concrete shape of the Type.
+
+        Raises
+        ------
+        TypeError : If the shape is symbolic
+        """
+        return tuple(int(x) for x in self.shape)
+
 
 class Kind(IntEnum):
     """The kind of a type parameter, represents a variable shape,
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index a7367c384cb3..557daa98e899 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -159,6 +159,13 @@ Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) {
 
 Type ExprMutator::VisitType(const Type& t) { return t; }
 
+void ExprVisitor::VisitExpr(const Expr& expr) {
+  if (visited_.count(expr.get())) return;
+  using TParent = ExprFunctor<void(const Expr&)>;
+  TParent::VisitExpr(expr);
+  visited_.insert(expr.get());
+}
+
 void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
   if (op->type_annotation.defined()) {
     this->VisitType(op->type_annotation);
@@ -197,8 +204,8 @@ void ExprVisitor::VisitExpr_(const CallNode* op) {
 }
 
 void ExprVisitor::VisitExpr_(const LetNode* op) {
-  this->VisitExpr(op->var);
   this->VisitExpr(op->value);
+  this->VisitExpr(op->var);
   this->VisitExpr(op->body);
 }
 
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 3cbe1e00b9ca..8056adc9a8b8 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -63,7 +63,7 @@ inline std::ostream& operator<<(std::ostream& os, const TextValue& val) {  // NO
  *
  * \code
  *
- * function(%x: Tensor[(meta.Variable(id=0),), float32]) {
+ * fn (%x: Tensor[(meta.Variable(id=0),), float32]) {
  *   %x
  * }
  * # Meta data section is a json-serialized string
@@ -154,7 +154,7 @@ class TextPrinter :
   }
 
   void PrintFunc(const Function& func) {
-    this->PrintFuncInternal("function", func);
+    this->PrintFuncInternal("fn ", func);
     stream_ << "\n";
   }
 
@@ -343,7 +343,7 @@ class TextPrinter :
     TextValue tuple = GetValue(op->tuple);
     TextValue id = this->AllocTempVar();
     this->PrintIndent();
-    stream_ << id << " = " << tuple << "[" << op->index << "]";
+    stream_ << id << " = " << tuple << "." << op->index << "";
     this->PrintEndInst("\n");
     return id;
   }
@@ -379,6 +379,17 @@ class TextPrinter :
     os << "), " << runtime::TVMType2String(Type2TVMType(node->dtype)) << "]";
   }
 
+  void VisitType_(const TupleTypeNode* node, std::ostream& os) final {  // NOLINT(*)
+    os << "Tuple[";
+    for (size_t i = 0; i < node->fields.size(); ++i) {
+      this->PrintType(node->fields[i], os);
+      if (i + 1 != node->fields.size()) {
+        os << ", ";
+      }
+    }
+    os << "]";
+  }
+
   void VisitTypeDefault_(const Node* node, std::ostream& os) final {  // NOLINT(*)
     // by default always print as meta-data
     os << meta_.GetMetaNode(GetRef<NodeRef>(node));
diff --git a/src/relay/ir/type_functor.h b/src/relay/ir/type_functor.h
index 03bb4db1f59e..f51c8c746eb9 100644
--- a/src/relay/ir/type_functor.h
+++ b/src/relay/ir/type_functor.h
@@ -96,40 +96,41 @@ class TypeFunctor<R(const Type& n, Args...)> {
  *
  * We recursively visit each type contained inside the visitor.
  */
-template <typename... Args>
-struct TypeVisitor : ::tvm::relay::TypeFunctor<void(const Type& n, Args...)> {
-  void VisitType_(const TypeVarNode* op, Args... args) override {}
+class TypeVisitor :
+    public ::tvm::relay::TypeFunctor<void(const Type& n)> {
+ public:
+  void VisitType_(const TypeVarNode* op) override {}
 
-  void VisitType_(const FuncTypeNode* op, Args... args) override {
+  void VisitType_(const FuncTypeNode* op) override {
     for (auto type_param : op->type_params) {
-      this->VisitType(type_param, std::forward<Args>(args)...);
+      this->VisitType(type_param);
     }
 
     for (auto type_cs : op->type_constraints) {
-      this->VisitType(type_cs, std::forward<Args>(args)...);
+      this->VisitType(type_cs);
     }
 
     for (auto arg_type : op->arg_types) {
-      this->VisitType(arg_type, std::forward<Args>(args)...);
+      this->VisitType(arg_type);
     }
-    this->VisitType(op->ret_type, std::forward<Args>(args)...);
+    this->VisitType(op->ret_type);
   }
 
-  void VisitType_(const TensorTypeNode* op, Args... args) override {}
+  void VisitType_(const TensorTypeNode* op) override {}
 
-  void VisitType_(const TupleTypeNode* op, Args... args) override {
+  void VisitType_(const TupleTypeNode* op) override {
     for (const Type& t : op->fields) {
-      this->VisitType(t, std::forward<Args>(args)...);
+      this->VisitType(t);
     }
   }
 
-  void VisitType_(const TypeRelationNode* op, Args... args) override {
+  void VisitType_(const TypeRelationNode* op) override {
     for (const Type& t : op->args) {
-      this->VisitType(t, std::forward<Args>(args)...);
+      this->VisitType(t);
     }
   }
 
-  void VisitType_(const IncompleteTypeNode* op, Args... args) override {}
+  void VisitType_(const IncompleteTypeNode* op) override {}
 };
 
 // A functional visitor for rebuilding an AST in place.
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 8a7cffd2cd27..8459a99cde23 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -15,6 +15,62 @@
 namespace tvm {
 namespace relay {
 
+TVM_REGISTER_NODE_TYPE(BiasAddAttrs);
+
+bool BiasAddRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const BiasAddAttrs* param = attrs.as<BiasAddAttrs>();
+  CHECK(param != nullptr);
+  int axis = param->axis;
+  if (axis < 0) {
+    axis = data->shape.size() + axis;
+  }
+  CHECK_LE(axis, static_cast<int>(data->shape.size()))
+      << "axis " << param->axis << " is out of range";
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(
+      {data->shape[axis]}, data->dtype));
+  reporter->Assign(types[2], types[0]);
+  return true;
+}
+
+
+// Positional relay function to create dense operator used by frontend FFI.
+Expr MakeBiasAdd(Expr data,
+                 Expr bias,
+                 int axis) {
+  auto attrs = make_node<BiasAddAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("nn.bias_add");
+  return CallNode::make(op, {data, bias}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.bias_add")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeBiasAdd, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.bias_add")
+.describe(R"code(Add bias to an axis of the input.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.BiasAddAttrs")
+.set_num_inputs(2)
+.add_argument("data", "nD Tensor", "Input data.")
+.add_argument("bias", "1D Tensor", "Bias.")
+.set_support_level(1)
+.add_type_rel("BiasAdd", BiasAddRel);
+
+
 TVM_REGISTER_NODE_TYPE(DenseAttrs);
 
 
@@ -82,7 +138,7 @@ RELAY_REGISTER_OP("nn.dense")
 .set_num_inputs(2)
 .add_argument("data", "nD Tensor", "Input data.")
 .add_argument("weight", "2D Tensor", "Weight matrix.")
-.set_support_level(2)
+.set_support_level(1)
 .add_type_rel("Dense", DenseRel);
 
 
@@ -235,13 +291,23 @@ Example::
 .set_support_level(2)
 .add_type_rel("BatchFlatten", BatchFlattenRel);
 
-RELAY_REGISTER_UNARY_OP("relay.op.nn._make.", "relu")
+
+// relu
+TVM_REGISTER_API("relay.op.nn._make.relu")
+.set_body_typed<Expr(Expr)>([](Expr data) {
+    static const Op& op = Op::Get("nn.relu");
+    return CallNode::make(op, {data}, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("nn.relu")
 .describe(R"code(Returns the relu input array, computed element-wise.
 
 .. math::
    max(x, 0)
 
 )code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
@@ -371,24 +437,6 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input
 // batch_norm
 TVM_REGISTER_NODE_TYPE(BatchNormAttrs);
 
-bool CheckVectorLength(int64_t dim, const DataType& dtype, Type vector, const char* name) {
-  const auto* candidate = vector.as<TensorTypeNode>();
-  CHECK(candidate != nullptr)
-    << name << " should be a vector but is not a tensor type,";
-  CHECK_EQ(dtype, candidate->dtype)
-    << name << " should be of the same data type as the original but it is not.";
-  CHECK_EQ(candidate->shape.size(), 1)
-    << name << " should be a vector but has a shape of "
-    << candidate->shape.size() << " dimensions instead of 1.";
-
-  const int64_t* length = as_const_int(candidate->shape[0]);
-  if (length == nullptr) return false;
-  CHECK(*length == dim)
-    << name << " should be as long as the channel but has length "
-    << *length << " instead of " << dim << ".";
-  return true;
-}
-
 bool BatchNormRel(const Array<Type>& types,
                   int num_inputs,
                   const Attrs& attrs,
@@ -396,33 +444,19 @@ bool BatchNormRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 6);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;
-  if (data->shape.size() == 0) return false;
 
   const BatchNormAttrs* param = attrs.as<BatchNormAttrs>();
 
   // axis of -1 means use the last dimension
   CHECK(param->axis >= -1 && param->axis < (int)data->shape.size());
   int axis = (param->axis != -1) ? param->axis : data->shape.size() - 1;
-
-  auto dim = as_const_int(data->shape[axis]);
-  if (dim == nullptr) return false;
+  auto axis_size = data->shape[axis];
 
   // if we are using beta and gamma, they need to be of shape (dim,)
-  if (param->scale && !CheckVectorLength(*dim, data->dtype, types[1], "The gamma scale factor")) {
-    return false;
-  }
-
-  if (param->center && !CheckVectorLength(*dim, data->dtype, types[2], "The beta offset factor")) {
-    return false;
-  }
-
-  // the two running averages must also be vectors of length dim
-  if (!CheckVectorLength(*dim, data->dtype, types[3], "The moving mean")) {
-    return false;
-  }
-  if (!CheckVectorLength(*dim, data->dtype, types[4], "The moving variance")) {
-    return false;
-  }
+  reporter->Assign(types[1], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[2], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[3], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[4], TensorTypeNode::make({axis_size}, data->dtype));
 
   // output is a tuple of the normed data (same shape as input), new running mean,
   // and new running average (the latter two are both vectors of length dim)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index e3c8bcef217e..bab875fd190e 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -13,8 +13,52 @@
 namespace tvm {
 namespace relay {
 
-/* relay.expand_dims */
+// relay.cast
+TVM_REGISTER_NODE_TYPE(CastAttrs);
 
+bool CastRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "cast: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<CastAttrs>();
+  reporter->Assign(types[1], TensorTypeNode::make(
+      data->shape, param->dtype));
+  return true;
+}
+
+Expr MakeCast(Expr data,
+              DataType dtype) {
+  auto attrs = make_node<CastAttrs>();
+  attrs->dtype = dtype;
+  static const Op& op = Op::Get("cast");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay._make.dtype_cast")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeCast, args, rv);
+});
+
+RELAY_REGISTER_OP("cast")
+.describe(R"code(Cast the data into a new data type.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.CastAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Cast", CastRel);
+
+
+// relay.expand_dims
 TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
 
 bool ExpandDimsRel(const Array<Type>& types,
@@ -25,6 +69,9 @@ bool ExpandDimsRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "expand_dims: expect input type to be TensorType but get "
+        << types[0];
     return false;
   }
   const auto* param = attrs.as<ExpandDimsAttrs>();
@@ -91,6 +138,9 @@ bool ConcatenateRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 2);
   const auto* tensor_tuple = types[0].as<TupleTypeNode>();
   if (tensor_tuple == nullptr) {
+    CHECK(types[0].as<TupleTypeNode>())
+        << "cast: expect input type to be TupleType but get "
+        << types[0];
     return false;
   }
   const auto* param = attrs.as<ConcatenateAttrs>();
@@ -161,6 +211,9 @@ bool TransposeRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "transpose: expect input type to be TensorType but get "
+        << types[0];
     return false;
   }
   const auto* param = attrs.as<TransposeAttrs>();
@@ -243,6 +296,9 @@ bool ReshapeRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "reshape: expect input type to be TensorType but get "
+        << types[0];
     return false;
   }
   const auto* param = attrs.as<ReshapeAttrs>();
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index c3d16c2976bf..81e72c6d7df8 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -22,7 +22,7 @@ namespace relay {
 using namespace tvm::runtime;
 using Kind = TypeVarNode::Kind;
 
-struct KindChecker : TypeVisitor<> {
+struct KindChecker : TypeVisitor {
   bool valid;
 
   KindChecker() : valid(true) {}
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 87fdb1c0ffba..7c8eeef92c5d 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -471,6 +471,5 @@ TVM_REGISTER_API("relay._ir_pass.infer_type")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     *ret = InferType(args[0], args[1]);
   });
-
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index d69f1bce70d4..c1f00c7b65e0 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -12,107 +12,120 @@
 namespace tvm {
 namespace relay {
 
-class FreeVar;
-class FreeTypeVar : private TypeVisitor<> {
-  std::unordered_set<TypeVar, NodeHash, NodeEqual>* free_vars;
-  std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars;
-  FreeTypeVar(std::unordered_set<TypeVar, NodeHash, NodeEqual>* free_vars,
-              std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars) :
-    free_vars(free_vars), bound_vars(bound_vars) { }
+// FreeTypeVar
+
+class FreeTypeVarTVisitor : public TypeVisitor {
+ public:
+  FreeTypeVarTVisitor(
+      Array<TypeVar>* free_vars,
+      std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars)
+      : free_vars_(free_vars), bound_vars_(bound_vars) { }
 
   void VisitType_(const TypeVarNode* tp) final {
-    auto var = GetRef<TypeVar>(tp);
-    if (bound_vars->count(var) == 0) {
-      free_vars->insert(var);
+    TypeVar var = GetRef<TypeVar>(tp);
+    if (bound_vars_->count(var) == 0) {
+      free_vars_->push_back(var);
     }
   }
 
   void VisitType_(const FuncTypeNode* f) final {
     for (auto type_param : f->type_params) {
-      bound_vars->insert(type_param);
+      bound_vars_->insert(type_param);
     }
+    TypeVisitor::VisitType_(f);
+  }
 
-    for (auto type_cs : f->type_constraints) {
-      this->VisitType(type_cs);
-    }
+ private:
+  Array<TypeVar>* free_vars_;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars_;
+};
 
-    for (auto arg_type : f->arg_types) {
-      this->VisitType(arg_type);
-    }
-    this->VisitType(f->ret_type);
+class FreeTypeVarEVisitor : private ExprVisitor {
+ public:
+  Array<TypeVar> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+    return free_vars_;
   }
-  friend FreeVar;
-};
 
-class FreeVar : public ExprVisitor {
-  void VisitExpr_(const VarNode* v) final {
-    auto var = GetRef<Var>(v);
-    if (bound_vars.count(var) == 0) {
-      free_vars.insert(var);
-    }
-    if (v->type_annotation.defined()) {
-      VisitType(v->type_annotation);
-    }
+  Array<TypeVar> Find(const Type& type) {
+    this->VisitType(type);
+    return free_vars_;
   }
 
   void VisitExpr_(const FunctionNode* f) final {
     for (const auto& tp : f->type_params) {
-      bound_types.insert(tp);
-    }
-    for (const auto& param : f->params) {
-      bound_vars.insert(param);
+      bound_vars_.insert(tp);
     }
-    VisitExpr(f->body);
-    VisitType(f->ret_type);
+    ExprVisitor::VisitExpr_(f);
   }
 
-  void VisitExpr_(const LetNode* l) final {
-    bound_vars.insert(l->var);
-    VisitExpr(l->value);
-    VisitExpr(l->body);
+  void VisitType(const Type& t) final {
+    FreeTypeVarTVisitor(&free_vars_, &bound_vars_)
+        .VisitType(t);
   }
 
+ private:
+  // The result list
+  Array<TypeVar> free_vars_;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> bound_vars_;
+};
+
+class FreeVarVisitor : protected ExprVisitor {
  public:
-  std::unordered_set<Var, NodeHash, NodeEqual> free_vars;
-  std::unordered_set<Var, NodeHash, NodeEqual> bound_vars;
-  std::unordered_set<TypeVar, NodeHash, NodeEqual> free_types;
-  std::unordered_set<TypeVar, NodeHash, NodeEqual> bound_types;
+  Array<Var> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+    return free_vars_;
+  }
 
-  void VisitType(const Type& t) final {
-    FreeTypeVar(&free_types, &bound_types)(t);
+  void VisitExpr_(const VarNode* var) final {
+    if (bound_vars_.count(var) == 0) {
+      free_vars_.push_back(GetRef<Var>(var));
+    }
   }
+
+  void VisitExpr_(const FunctionNode* op) final {
+    for (const auto& param : op->params) {
+      bound_vars_.insert(param.operator->());
+    }
+    VisitExpr(op->body);
+  }
+
+  void VisitExpr_(const LetNode* op) final {
+    bound_vars_.insert(op->var.operator->());
+    VisitExpr(op->value);
+    VisitExpr(op->body);
+  }
+
+ private:
+  // The result list
+  Array<Var> free_vars_;
+  std::unordered_set<const VarNode*> bound_vars_;
 };
 
-tvm::Array<Var> FreeVariables(const Expr& e) {
-  FreeVar fv;
-  fv.VisitExpr(e);
-  return tvm::Array<Var>(fv.free_vars.begin(), fv.free_vars.end());
+tvm::Array<TypeVar> FreeTypeVars(const Expr& expr) {
+  return FreeTypeVarEVisitor().Find(expr);
 }
 
-tvm::Array<TypeVar> FreeTypeVariables(const Expr& e) {
-  FreeVar fv;
-  fv.VisitExpr(e);
-  return tvm::Array<TypeVar>(fv.free_types.begin(), fv.free_types.end());
+tvm::Array<TypeVar> FreeTypeVars(const Type& type) {
+  return FreeTypeVarEVisitor().Find(type);
 }
 
-tvm::Array<TypeVar> FreeTypeVariables(const Type& t) {
-  FreeVar fv;
-  fv.VisitType(t);
-  return tvm::Array<TypeVar>(fv.free_types.begin(), fv.free_types.end());
+tvm::Array<Var> FreeVars(const Expr& expr) {
+  return FreeVarVisitor().Find(expr);
 }
 
 TVM_REGISTER_API("relay._ir_pass.free_vars")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = FreeVariables(args[0]);
+    *ret = FreeVars(args[0]);
   });
 
 TVM_REGISTER_API("relay._ir_pass.free_type_vars")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     NodeRef x = args[0];
     if (x.as<TypeNode>()) {
-      *ret = FreeTypeVariables(Downcast<Type>(x));
+      *ret = FreeTypeVars(Downcast<Type>(x));
     } else {
-      *ret = FreeTypeVariables(Downcast<Expr>(x));
+      *ret = FreeTypeVars(Downcast<Expr>(x));
     }
   });
 
diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc
index a37969f9e317..d9c6b617ca5f 100644
--- a/src/relay/pass/well_formed.cc
+++ b/src/relay/pass/well_formed.cc
@@ -10,7 +10,6 @@
 namespace tvm {
 namespace relay {
 
-struct NotWellFormed { };
 
 //! brief make sure each Var is bind at most once.
 class WellFormedChecker : private ExprVisitor {
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 29814ecc5eb7..69ba4797a1c7 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -1,7 +1,9 @@
 import tvm
+import tvm.relay.testing
 import numpy as np
 from tvm import relay
 
+
 do_print = [False]
 
 def show(text):
@@ -94,9 +96,18 @@ def test_variable_name():
     v1 = relay.var("1")
     assert "%v1" in v1.astext()
 
+def test_mlp():
+    net, params = tvm.relay.testing.mlp.get_workload(batch_size=1)
+    net.astext()
+
+def test_resnet():
+    net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
+    net.astext()
 
 if __name__ == "__main__":
     do_print[0] = True
+    test_resnet()
+    test_mlp()
     test_func()
     test_env()
     test_meta_data()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
index 7ccc96d271ac..725b2fbd3c3d 100644
--- a/tests/python/relay/test_ir_well_formed.py
+++ b/tests/python/relay/test_ir_well_formed.py
@@ -12,10 +12,9 @@ def test_well_formed():
     assert not well_formed(relay.Let(x, v, let))
     f = relay.Function([x], x, ty)
     assert well_formed(f)
-    # this test should pass in case of weak uniqueness (only test for shadowing)
-    # but we want all binder to be distinct from each other.
-    assert not well_formed(relay.Let(relay.Var("y"), f,
-                                     relay.Let(relay.Var("z"), f, v)))
+    assert well_formed(
+        relay.Let(relay.Var("y"), f,
+                  relay.Let(relay.Var("z"), f, v)))
 
 
 def test_tuple():
@@ -25,7 +24,7 @@ def test_tuple():
     let = relay.Let(x, v, x)
     assert well_formed(let)
     assert well_formed(relay.Tuple([v, v]))
-    assert not well_formed(relay.Tuple([let, let]))
+    assert not well_formed(relay.Tuple([let, relay.Let(x, v, x)]))
 
 
 def test_tuple_get_item():
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 5afae6e872d1..fd01dbdde012 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -42,6 +42,15 @@ def check_binary_op(opfunc):
         check_binary_op(opfunc)
 
 
+def test_bias_add():
+    x = relay.var("x", shape=(10, 2, 3, 4))
+    bias = relay.var("bias")
+    z = relay.nn.bias_add(x, bias)
+    zz = relay.ir_pass.infer_type(z)
+    assert "axis=" not in zz.astext()
+    assert zz.args[1].checked_type == relay.TensorType((2,))
+
+
 def test_expand_dims_infer_type():
     n, t, d = tvm.var("n"), tvm.var("t"), 100
     x = relay.var("x", shape=(n, t, d))
@@ -91,7 +100,7 @@ def test_dropout():
     n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
     input_ty = relay.TensorType((n, t, d), "float32")
     x = relay.var("x", input_ty)
-    y, _ = relay.nn.dropout(x, rate=0.75)
+    y = relay.nn.dropout(x, rate=0.75)
     assert "rate=" in y.astext()
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == input_ty
@@ -106,7 +115,7 @@ def test_batch_norm():
     moving_var = relay.var("moving_var", relay.TensorType((2,)))
     y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
                             center=False, scale=False)
-    yy = relay.ir_pass.infer_type(y)
+    yy = relay.ir_pass.infer_type(y.astuple())
     assert "center=" in yy.astext()
     assert yy.checked_type == relay.ty.TupleType(tvm.convert([
         relay.TensorType((3, 2, 1), "float32"),
@@ -121,7 +130,7 @@ def test_batch_norm():
 
     y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
                             axis=0, center=False, scale=False)
-    yy = relay.ir_pass.infer_type(y)
+    yy = relay.ir_pass.infer_type(y.astuple())
     assert yy.checked_type == relay.ty.TupleType(tvm.convert([
         relay.ty.TensorType((3, 2, 1), "float32"),
         relay.ty.TensorType((3,), "float32"),
@@ -136,7 +145,7 @@ def test_batch_norm():
     moving_var = relay.var("moving_var", relay.TensorType((3,)))
     y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
                             axis=-1, center=False, scale=False)
-    yy = relay.ir_pass.infer_type(y)
+    yy = relay.ir_pass.infer_type(y.astuple())
     assert yy.checked_type == relay.ty.TupleType(tvm.convert([
         relay.ty.TensorType((1, 2, 3), "float32"),
         relay.ty.TensorType((3,), "float32"),
@@ -145,6 +154,7 @@ def test_batch_norm():
 
 
 if __name__ == "__main__":
+    test_bias_add()
     test_unary_op()
     test_binary_op()
     test_expand_dims_infer_type()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index d1bff2940457..8ab3c41c079d 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -27,6 +27,14 @@ def test_unary_identity():
         assert yy.checked_type == relay.TensorType((8, 9, 4), "float32")
 
 
+def test_cast():
+    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+    y = x.astype("int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert "dtype=" in yy.astext()
+    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
+
+
 def test_clip_type():
     a = relay.var("a", relay.TensorType((10, 4), "float32"))
     y = relay.clip(a, 1., 4.)
@@ -139,7 +147,9 @@ def test_infer_type_leaky_relu():
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
+
 if __name__ == "__main__":
+    test_cast()
     test_zeros_ones()
     test_unary_identity()
     test_clip_type()

From b3ede8d753332c876e8c3bfd174bbd15602e0d34 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 24 Oct 2018 15:07:21 -0700
Subject: [PATCH 274/529] add Xiaoqiang Dan as reviewer (#1976)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9865a1ade6cf..057e0a18abd5 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -21,6 +21,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
+- [Xiaoqiang Dan](https://github.com/xqdan)
 - [Liangfu Chen](https://github.com/liangfu)
 - [Masahiro Masuda](https://github.com/masahi)
 - [Kazutaka Morita](https://github.com/kazum)

From e134b8b3d9939bd05d691a5eb934b2609ab9bf7e Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Thu, 25 Oct 2018 05:28:10 +0530
Subject: [PATCH 275/529] [Frontend][Darknet] L2 normalization support in
 darknet (#1916)

* l2 normalization

* retrigger CI
---
 nnvm/python/nnvm/frontend/darknet.py             | 12 ++++++++++++
 nnvm/python/nnvm/testing/darknet.py              |  1 +
 .../python/frontend/darknet/test_forward.py      | 16 +++++++++++++++-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index bf3a16cdb23e..4da2e90bca42 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -267,6 +267,13 @@ def _darknet_upsampling(inputs, attrs):
     new_attrs['scale'] = attrs.get('scale', 1)
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_l2normalize(inputs, attrs):
+    """Process the l2 normalization operation."""
+    op_name, new_attrs = 'l2_normalize', {}
+    new_attrs['eps'] = attrs.get('eps', 0)
+    new_attrs['axis'] = attrs.get('axis', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_softmax_output(inputs, attrs):
     """Process the softmax operation."""
     temperature = attrs.get('temperature', 1)
@@ -370,6 +377,7 @@ def _darknet_op_not_support(inputs, attrs):
     LAYERTYPE.REGION          : _darknet_region,
     LAYERTYPE.SHORTCUT        : _darknet_shortcut,
     LAYERTYPE.UPSAMPLE        : _darknet_upsampling,
+    LAYERTYPE.L2NORM          : _darknet_l2normalize,
     LAYERTYPE.YOLO            : _darknet_yolo,
     LAYERTYPE.DETECTION       : _darknet_op_not_support,
     LAYERTYPE.CROP            : _darknet_op_not_support,
@@ -630,6 +638,10 @@ def _get_darknet_attrs(self, layer, layer_num):
 
         elif LAYERTYPE.UPSAMPLE == layer.type:
             attr.update({'scale' : layer.stride})
+
+        elif LAYERTYPE.L2NORM == layer.type:
+            pass
+
         else:
             err = "Darknet layer type {} is not supported in nnvm.".format(layer.type)
             raise NotImplementedError(err)
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index d4d33a6949f3..328ad2ae6a10 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -512,6 +512,7 @@ class ACTIVATION(object):
 layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 layer make_upsample_layer(int batch, int w, int h, int c, int stride);
+layer make_l2norm_layer(int batch, int inputs);
 void free_network(network *net);
 """
                    )
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index f836ca477dda..1f5e89c6e4d5 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -107,7 +107,7 @@ def get_darknet_output(net, img):
                 out.insert(0, attributes)
                 out.insert(0, _read_memory_buffer((layer.total*2, ), layer.biases))
                 out.insert(0, _read_memory_buffer((layer.n, ), layer.mask, dtype='int32'))
-                layer_ou tshape = (layer.batch, layer.out_c,
+                layer_outshape = (layer.batch, layer.out_c,
                                   layer.out_h, layer.out_w)
                 out.insert(0, _read_memory_buffer(layer_outshape, layer.output))
             elif i == net.n-1:
@@ -361,6 +361,19 @@ def test_forward_upsample():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_l2normalize():
+    '''test l2 normalization layer'''
+    net = LIB.make_network(1)
+    layer = LIB.make_l2norm_layer(1, 224*224*3)
+    layer.c = layer.out_c = 3
+    layer.h = layer.out_h = 224
+    layer.w = layer.out_w = 224
+    net.layers[0] = layer
+    net.w = net.h = 224
+    LIB.resize_network(net, 224, 224)
+    test_forward(net)
+    LIB.free_network(net)
+
 def test_forward_elu():
     '''test elu activation layer'''
     net = LIB.make_network(1)
@@ -520,6 +533,7 @@ def test_forward_activation_logistic():
     test_forward_region()
     test_forward_yolo_op()
     test_forward_upsample()
+    test_forward_l2normalize()
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()

From a5adb8488632e5fc191f530b9ad20e236391b98a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Thu, 25 Oct 2018 09:28:39 -0700
Subject: [PATCH 276/529] [Relay] visit the span (#1990)

---
 include/tvm/relay/type.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 2bb9b3070270..5d8eca037013 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -161,6 +161,7 @@ class IncompleteTypeNode : public TypeNode {
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("kind", &kind);
+    v->Visit("span", &span);
   }
 
   TVM_DLL static IncompleteType make(TypeVarNode::Kind kind);
@@ -243,7 +244,10 @@ class TupleTypeNode : public TypeNode {
 
   TupleTypeNode() {}
 
-  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("fields", &fields);
+    v->Visit("span", &span);
+  }
 
   TVM_DLL static TupleType make(tvm::Array<Type> fields);
 
@@ -361,6 +365,7 @@ class TypeRelationNode : public TypeConstraintNode {
     v->Visit("args", &args);
     v->Visit("num_inputs", &num_inputs);
     v->Visit("attrs", &attrs);
+    v->Visit("span", &span);
   }
 
   TVM_DLL static TypeRelation make(TypeRelationFn func,

From e74aeb7d8828e8f1bced9dc2cf07301f68dd33dd Mon Sep 17 00:00:00 2001
From: Howave <myhouseng@gmail.com>
Date: Fri, 26 Oct 2018 00:29:00 +0800
Subject: [PATCH 277/529] [DOCS] Fix C++ example:graph_runtime.cc:151: Check
 failed: data->ndim == data_out->ndim (2 vs. 1) (#1987)

---
 docs/deploy/nnvm.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
index e4ce14528b53..1e0d17f8b195 100644
--- a/docs/deploy/nnvm.md
+++ b/docs/deploy/nnvm.md
@@ -96,8 +96,8 @@ int main()
     run();
 
     DLTensor* y;
-    int out_ndim = 1;
-    int64_t out_shape[1] = {1000, };
+    int out_ndim = 2;
+    int64_t out_shape[2] = {1, 1000, };
     TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
 
     // get the function from the module(get output data)

From be33a40a1f3ccb79cfb78e1506811e2b32dea5ef Mon Sep 17 00:00:00 2001
From: Zhennan Qin <zhennan.qin@intel.com>
Date: Fri, 26 Oct 2018 00:29:42 +0800
Subject: [PATCH 278/529] Fix load subgraph from json (#1980)

---
 nnvm/src/pass/saveload_json.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc
index f1acb972158d..485b1417a493 100644
--- a/nnvm/src/pass/saveload_json.cc
+++ b/nnvm/src/pass/saveload_json.cc
@@ -215,13 +215,6 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
     for (uint32_t nid : n.control_deps) {
       n.node->control_deps.push_back(jgraph.nodes[nid].node);
     }
-    // rebuild attribute parser
-    if (!no_parse && n.node->op() != nullptr && n.node->op()->attr_parser != nullptr) {
-      n.node->op()->attr_parser(&(n.node->attrs));
-    } else if (!no_parse && n.node->is_variable()) {
-      n.node->attrs.parsed =
-        Symbol::CreateVariable(n.node->attrs.name).outputs[0].node->attrs.parsed;
-    }
     for (const JSONGraph &subgraph : n.subgraphs) {
       // The "no_parse" option here, is to be compatible with
       // commit cfd3075e85807dcd8f9534c37e053583dee87524
@@ -230,6 +223,13 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
       // incubator-mxnet/src/nnvm/legacy_json_util.cc:UpgradeJSON_Parse
       n.node->attrs.subgraphs.push_back(JSONGraph2Symbol(subgraph, false));
     }
+    // rebuild attribute parser
+    if (!no_parse && n.node->op() != nullptr && n.node->op()->attr_parser != nullptr) {
+      n.node->op()->attr_parser(&(n.node->attrs));
+    } else if (!no_parse && n.node->is_variable()) {
+      n.node->attrs.parsed =
+        Symbol::CreateVariable(n.node->attrs.name).outputs[0].node->attrs.parsed;
+    }
   }
   // consistency check
   for (uint32_t nid : jgraph.arg_nodes) {

From cc7c89c04da6664087f2fa6fd00326e8a7fce0bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Thu, 25 Oct 2018 09:39:49 -0700
Subject: [PATCH 279/529] [Relay] fix small typekey issue (#1992)

It might cause TupleTypeNode to be printed incorrectly.
it doesnt show in http://ci.tvm.ai:8080/blue/organizations/jenkins/tvm/detail/PR-1989/1/pipeline/141, but if you run it on local machine you will see what get compared being NodeBase and TupleType.

Also as a side thought can we write a giant macro that make sure everything get did right (all field get visited, typekey match, declare_node_type_info match, etc?) I can do some macro metaprogramming, so I can take up the work.
---
 include/tvm/relay/type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 5d8eca037013..6ff2be5e69d7 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -251,7 +251,7 @@ class TupleTypeNode : public TypeNode {
 
   TVM_DLL static TupleType make(tvm::Array<Type> fields);
 
-  static constexpr const char* _type_key = "relay.TypeTuple";
+  static constexpr const char* _type_key = "relay.TupleType";
   TVM_DECLARE_NODE_TYPE_INFO(TupleTypeNode, TypeNode);
 };
 

From b20a6d43d477996d6ce7d4e198eeae25a1e64898 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Thu, 25 Oct 2018 09:49:42 -0700
Subject: [PATCH 280/529] Cleanup deadcode (#1991)

---
 include/tvm/relay/expr.h | 8 --------
 include/tvm/relay/type.h | 8 --------
 2 files changed, 16 deletions(-)

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 142982d48907..566acf96e2e9 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -388,14 +388,6 @@ class TupleGetItemNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(TupleGetItem, TupleGetItemNode, Expr);
 
-/*! \brief Print a debug representation of the expression to the stream.
- *  \param env The environment.
- *  \param e The expression
- *  \param os the stream
- *  \returns A reference to the stream.
- */
-std::ostream& DebugPrint(const Environment& env, const Expr& e, std::ostream& os);
-
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 6ff2be5e69d7..a2d15a05d454 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -379,14 +379,6 @@ class TypeRelationNode : public TypeConstraintNode {
 
 RELAY_DEFINE_NODE_REF(TypeRelation, TypeRelationNode, TypeConstraint);
 
-/*! \brief Print a debug representation of the type to the stream.
- *  \param env The environment.
- *  \param t The type
- *  \param os the stream
- *  \returns A reference to the stream.
- */
-std::ostream& DebugPrint(const Environment& env, const Type& t, std::ostream& os);
-
 // The following fields contains advanced typing
 // Only keep the class name and reserved for future usage.
 class GenericTensorType;

From a71b34d6b3f454ae1bfc6605bdd92eb8f4e906dc Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 25 Oct 2018 14:25:13 -0700
Subject: [PATCH 281/529] [ATTR] Introduce Integer container (#1994)

---
 include/tvm/attrs.h                 |  2 +-
 include/tvm/expr.h                  | 46 +++++++++++++++++++++++++++++
 include/tvm/packed_func_ext.h       | 17 +++++++++++
 include/tvm/relay/attrs/nn.h        |  4 +--
 include/tvm/relay/attrs/transform.h |  6 ++--
 include/tvm/runtime/packed_func.h   |  4 +++
 src/relay/op/nn/nn.cc               |  6 ++--
 src/relay/op/tensor/transform.cc    | 23 +++++++--------
 8 files changed, 87 insertions(+), 21 deletions(-)

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 33d84cecec6a..51d916ca488d 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -35,6 +35,7 @@
 #include <string>
 #include "ir.h"
 #include "base.h"
+#include "expr.h"
 #include "packed_func_ext.h"
 
 namespace tvm {
@@ -73,7 +74,6 @@ inline Type NullValue<Type>() {
   return Type(Type::Handle, 0, 0);
 }
 
-
 /*! \brief Error thrown during attribute checking. */
 struct AttrError : public dmlc::Error {
   /*!
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 7fdca7f6af8e..37b122ae5b03 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -29,6 +29,7 @@ using HalideIR::VarExpr;
 using HalideIR::IR::RangeNode;
 using HalideIR::IR::FunctionRef;
 using HalideIR::IR::FunctionBaseNode;
+using HalideIR::Internal::IntImm;
 using HalideIR::Internal::Stmt;
 using HalideIR::Internal::IRPrinter;
 using HalideIR::Internal::Variable;
@@ -83,6 +84,51 @@ class Var : public HalideIR::VarExpr {
 };
 
 
+/*!
+ * \brief Container of constant ineteger (IntImm).
+ *
+ * This is used to store and automate type check
+ * attributes that must be constant integer.
+ */
+class Integer : public Expr {
+ public:
+  Integer() : Expr() {}
+  /*!
+   * \brief constructor from node.
+   */
+  explicit Integer(NodePtr<Node> node) : Expr(node) {}
+  /*!
+   * \brief Construct integer from int value.
+   */
+  Integer(int value) : Expr(value) {}  // NOLINT(*)
+  /*!
+   * \brief Assign an expression to integer.
+   * \param other another expression.
+   */
+  Integer& operator=(const Integer& other) {
+    node_ = other.node_;
+    return *this;
+  }
+  /*!
+   * \brief Get pointer to the internal value.
+   * \return the content of the integer.
+   */
+  const IntImm* operator->() const {
+    return static_cast<const IntImm*>(node_.get());
+  }
+  /*!
+   * \brief convert to int64_t
+   */
+  operator int64_t() const {
+    CHECK(node_ != nullptr)
+        << " Trying get reference a null Integer";
+    return (*this)->value;
+  }
+  /*! \brief type indicate the container type */
+  using ContainerType = IntImm;
+};
+
+
 /*! \brief container class of iteration variable. */
 class IterVarNode;
 
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 0491f3057815..c5a83608c617 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -10,6 +10,7 @@
 #include <sstream>
 #include <string>
 #include <memory>
+#include <limits>
 #include <type_traits>
 
 #include "base.h"
@@ -126,6 +127,8 @@ inline TNodeRef TVMArgValue::AsNodeRef() const {
 inline TVMArgValue::operator HalideIR::Expr() const {
   if (type_code_ == kNull) return Expr();
   if (type_code_ == kDLInt) {
+    CHECK_LE(value_.v_int64, std::numeric_limits<int>::max());
+    CHECK_GE(value_.v_int64, std::numeric_limits<int>::min());
     return Expr(static_cast<int>(value_.v_int64));
   }
   if (type_code_ == kDLFloat) {
@@ -145,6 +148,20 @@ inline TVMArgValue::operator HalideIR::Expr() const {
   return Expr(sptr);
 }
 
+inline TVMArgValue::operator tvm::Integer() const {
+  if (type_code_ == kNull) return Integer();
+  if (type_code_ == kDLInt) {
+    CHECK_LE(value_.v_int64, std::numeric_limits<int>::max());
+    CHECK_GE(value_.v_int64, std::numeric_limits<int>::min());
+    return Integer(static_cast<int>(value_.v_int64));
+  }
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
+  CHECK(NodeTypeChecker<Integer>::Check(sptr.get()))
+      << "Expected type " << NodeTypeName<Expr>()
+      << " but get " << sptr->type_key();
+  return Integer(sptr);
+}
+
 inline NodePtr<Node>& TVMArgValue::node_sptr() {
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
   return *ptr<NodePtr<Node> >();
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index eb044ccb29fd..34bd5eb93312 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -317,7 +317,7 @@ struct BatchNormAttrs : public tvm::AttrsNode<BatchNormAttrs> {
 /*! \brief Attributes for LRN operator */
 struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
   IndexExpr size;
-  IndexExpr axis;
+  int axis;
   double bias;
   double alpha;
   double beta;
@@ -340,7 +340,7 @@ struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
 /*! \brief Attributes for L2Normalize operator */
 struct L2NormalizeAttrs : public tvm::AttrsNode<L2NormalizeAttrs> {
   double eps;
-  Array<IndexExpr> axis;
+  Array<Integer> axis;
 
   TVM_DECLARE_ATTRS(L2NormalizeAttrs, "relay.attrs.L2NormalizeAttrs") {
     TVM_ATTR_FIELD(eps)
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 1941e045ed8d..b0150c4ac3d9 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -53,7 +53,7 @@ struct ConcatenateAttrs : public tvm::AttrsNode<ConcatenateAttrs> {
 
 /*! \brief Attributes used in transpose operators */
 struct TransposeAttrs : public tvm::AttrsNode<TransposeAttrs> {
-  Array<IndexExpr> axes;
+  Array<Integer> axes;
   TVM_DECLARE_ATTRS(TransposeAttrs, "relay.attrs.TransposeAttrs") {
     TVM_ATTR_FIELD(axes)
         .describe("The target axes order, reverse order if not specified.");
@@ -70,10 +70,10 @@ struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
 };  // struct ReshapeAttrs
 
 struct TakeAttrs : public tvm::AttrsNode<TakeAttrs> {
-  IndexExpr axis;
+  Integer axis;
 
   TVM_DECLARE_ATTRS(TakeAttrs, "relay.attrs.TakeAttrs") {
-    TVM_ATTR_FIELD(axis).set_default(NullValue<IndexExpr>())
+    TVM_ATTR_FIELD(axis).set_default(NullValue<Integer>())
         .describe("The axis over which to select values.");
   }
 };
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index a8fa096e51c4..c306f8d15160 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -32,6 +32,9 @@ struct Expr;
 #endif
 
 namespace tvm {
+// forward declarations
+class Integer;
+
 namespace runtime {
 // forward declarations
 class TVMArgs;
@@ -559,6 +562,7 @@ class TVMArgValue : public TVMPODValue_ {
   inline bool IsNodeType() const;
   inline operator HalideIR::Type() const;
   inline operator HalideIR::Expr() const;
+  inline operator tvm::Integer() const;
   // get internal node ptr, if it is node
   inline NodePtr<Node>& node_sptr();
 };
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 8459a99cde23..d38c5a0ebe0d 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -317,7 +317,7 @@ TVM_REGISTER_NODE_TYPE(LRNAttrs);
 
 Expr MakeLRN(Expr data,
              IndexExpr size,
-             IndexExpr axis,
+             int axis,
              double alpha,
              double beta,
              double bias) {
@@ -337,7 +337,7 @@ TVM_REGISTER_API("relay.op.nn._make.lrn")
   });
 
 RELAY_REGISTER_OP("nn.lrn")
-    .describe(R"code(LRN layer.
+.describe(R"code(LRN layer.
 
 Normalize the input in a local region across or within feature maps.
 Each input value is divided by (1 + (\alpha/n) \sum_i x_i^2)^\beta,
@@ -362,7 +362,7 @@ TVM_REGISTER_NODE_TYPE(L2NormalizeAttrs);
 
 Expr MakeL2Normalize(Expr data,
                      double eps,
-                     Array<IndexExpr> axis) {
+                     Array<Integer> axis) {
   auto attrs = make_node<L2NormalizeAttrs>();
   attrs->eps = eps;
   attrs->axis = std::move(axis);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index bab875fd190e..29dff1e4ba27 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -218,24 +218,23 @@ bool TransposeRel(const Array<Type>& types,
   }
   const auto* param = attrs.as<TransposeAttrs>();
   const int ndim = data->shape.size();
-  const Array<IndexExpr>& axes = param->axes;
+  const Array<Integer>& axes = param->axes;
   // check dimension match
-  CHECK(axes.empty() || static_cast<int>(axes.size()) == ndim)
+  CHECK(!axes.defined() || static_cast<int>(axes.size()) == ndim)
     << "Dimension mismatch: axes has " << axes.size() << " elements"
     << ", but data.ndim = " << ndim;
   // construct int_axes
   std::vector<int> int_axes;
   int_axes.reserve(ndim);
-  if (axes.empty()) {
+  // used not defined to check if it is None.
+  if (!axes.defined()) {
     for (int i = ndim - 1; i >= 0; --i) {
       int_axes.push_back(i);
     }
   } else {
     std::vector<int> axis_used(ndim, 0);
-    for (const IndexExpr& e : axes) {
-      const int64_t *axis_ptr = as_const_int(e);
-      CHECK(axis_ptr != nullptr);
-      int axis = *axis_ptr;
+    for (const Integer& e : axes) {
+      int64_t axis = e;
       // sanity check for axis and ndim
       CHECK(-ndim <= axis && axis < ndim)
         << "transpose only allows each `axis` in `axes` in range [-data.ndim, data.ndim)"
@@ -245,7 +244,7 @@ bool TransposeRel(const Array<Type>& types,
       // sanity check for duplication
       CHECK(!axis_used[axis]) << "Duplicate axes in transpose: " << axis;
       axis_used[axis] = 1;
-      int_axes.push_back(axis);
+      int_axes.push_back(static_cast<int>(axis));
     }
   }
   std::vector<IndexExpr> oshape;
@@ -258,7 +257,7 @@ bool TransposeRel(const Array<Type>& types,
 }
 
 Expr MakeTranspose(Expr data,
-                   Array<IndexExpr> axes) {
+                   Array<Integer> axes) {
   auto attrs = make_node<TransposeAttrs>();
   attrs->axes = std::move(axes);
   static const Op& op = Op::Get("transpose");
@@ -401,7 +400,7 @@ bool TakeRel(const Array<Type>& types,
   std::vector<IndexExpr> oshape;
   const auto ndim_data = static_cast<int>(data->shape.size());
   const auto ndim_indices = static_cast<int>(indices->shape.size());
-  auto axis = (*as_const_int(param->axis));
+  int axis = static_cast<int>(param->axis->value);
   if (axis < 0) axis += ndim_data;
   CHECK_LE(axis, ndim_data)
     << "axis should be with in data shape"
@@ -424,9 +423,9 @@ bool TakeRel(const Array<Type>& types,
 
 Expr MakeTake(Expr data,
               Expr indices,
-              IndexExpr axis) {
+              Integer axis) {
   auto attrs = make_node<TakeAttrs>();
-  attrs->axis = axis;
+  attrs->axis = std::move(axis);
   static const Op& op = Op::Get("take");
   return CallNode::make(op, {data, indices}, Attrs(attrs), {});
 }

From d74b7bbab5c7e607e1eab1c291664776749cd458 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Thu, 25 Oct 2018 14:33:21 -0700
Subject: [PATCH 282/529] [RELAY] Add structural hashing for Relay (#1977)

---
 include/tvm/relay/pass.h                    |  21 ++
 python/tvm/relay/ir_pass.py                 |  28 +-
 src/relay/ir/hash.cc                        | 308 ++++++++++++++++++++
 tests/python/relay/test_pass_alpha_equal.py |   9 +-
 4 files changed, 362 insertions(+), 4 deletions(-)
 create mode 100644 src/relay/ir/hash.cc

diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 1b3462659e18..bf16c7ed8e33 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -136,6 +136,27 @@ tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
  */
 Expr DeadCodeElimination(const Expr& e);
 
+/*! \brief Hash a Relay type.
+ *
+ * Implements structural hashing of a Relay type.
+ *
+ *  \param type the type to hash.
+ *
+ *  \return the hash value.
+ */
+size_t StructuralHash(const Type& type);
+
+/*! \brief Hash a Relay expression.
+ *
+ * Implements structural hashing of a Relay expression.
+ *
+ * \param expr the expression to hash.
+ *
+ * \return the hash value.
+ */
+size_t StructuralHash(const Expr& expr);
+
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_H_
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index c6d5aa7515bc..f930751c41a7 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -1,4 +1,4 @@
-# pylint: disable=no-else-return,
+# pylint: disable=no-else-return
 # pylint: disable=unidiomatic-typecheck
 """The set of passes for Relay.
 
@@ -7,7 +7,8 @@
 """
 from . import _ir_pass
 from . import _make
-# pylint: disable=invalid-name
+from .expr import Expr
+from .ty import Type
 
 def infer_type(expr, env=None):
     """Infer the type of expr under the context of env.
@@ -148,7 +149,6 @@ def alpha_equal(lhs, rhs):
     """
     return bool(_make._alpha_equal(lhs, rhs))
 
-
 def graph_equal(lhs, rhs):
     """Compare two Relay expr for data-flow equivalence.
     The difference between this and alpha-equality is that
@@ -169,3 +169,25 @@ def graph_equal(lhs, rhs):
       True iff lhs is data-flow equivalent to rhs.
     """
     return bool(_make._graph_equal(lhs, rhs))
+
+def structural_hash(value):
+    """Hash a Relay expression structurally.
+
+    Parameters
+    ----------
+    expr: tvm.relay.Expr or tvm.relay.Type
+      The expression to hash.
+
+    Returns
+    -------
+    result: int
+      The hash value
+    """
+    if isinstance(value, Expr):
+        return int(_ir_pass._expr_hash(value))
+    elif isinstance(value, Type):
+        return int(_ir_pass._type_hash(value))
+    else:
+        msg = ("found value of type {0} expected" +
+               "relay.Expr or relay.Type").format(type(value))
+        raise TypeError(msg)
diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
new file mode 100644
index 000000000000..3aa567a4892e
--- /dev/null
+++ b/src/relay/ir/hash.cc
@@ -0,0 +1,308 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/ir/hash.cc
+ * \brief Hash functions for Relay types and expressions.
+ */
+#include <tvm/ir_pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/relay/pass.h>
+#include <tvm/attrs.h>
+#include "type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// Hash handler for Relay.
+class RelayHashHandler:
+      public AttrsHashHandler,
+      public TypeFunctor<size_t(const Type&)>,
+      public ExprFunctor<size_t(const Expr&)> {
+ public:
+  explicit RelayHashHandler() {}
+
+  /*!
+   * Compute hash of a node.
+   * \param ref The node to hash.
+   * \return the hash value.
+   */
+  size_t Hash(const NodeRef& ref) {
+    if (!ref.defined()) return ref.hash();
+
+    if (ref->derived_from<TypeNode>()) {
+      return TypeHash(Downcast<Type>(ref));
+    }
+    if (ref->derived_from<ExprNode>()) {
+      return ExprHash(Downcast<Expr>(ref));
+    }
+    return AttrHash(ref);
+  }
+
+  /*!
+   * Compute hash of the attributes.
+   * \param ref The attributes.
+   * \return the hash value
+   */
+  size_t AttrHash(const NodeRef& ref) {
+    if (!ref.defined()) { return ref.hash(); }
+    return AttrsHashHandler::Hash(ref);
+  }
+  /*!
+   * Compute hash of a Relay type.
+   * \param ref The type to hash.
+   * \param rhs The right hand operand.
+   * \return the hash value.
+   */
+  size_t TypeHash(const Type& type) {
+    if (!type.defined()) { return type.hash(); }
+    auto found = hash_map_.find(type);
+    if (found != hash_map_.end()) {
+      return found->second;
+    } else {
+      auto hash = this->VisitType(type);
+      hash_map_.insert({type, hash});
+      return hash;
+    }
+  }
+  /*!
+   * Compute the hash of an expression.
+   *
+   * \note We run graph structural equality checking when comparing two Exprs.
+   *   This means that AlphaEqualHandler can only be used once for each pair.
+   *   The equality checker checks data-flow equvalence of the Expr DAG.
+   *   This function also runs faster as it memomizes equal_map.
+   *
+   * \param expr The expression to hash.
+   * \return the hash value.
+   */
+  size_t ExprHash(const Expr& expr) {
+    if (!expr.defined()) return expr.hash();
+    auto found = hash_map_.find(expr);
+    if (found != hash_map_.end()) {
+      return found->second;
+    } else {
+      auto hash = this->VisitExpr(expr);
+      hash_map_.insert({expr, hash});
+      return hash;
+    }
+  }
+
+ protected:
+  /*!
+   * \brief Hash a DataType.
+   * \param dtype The dtype to hash.
+   * \return the hash value.
+   */
+  size_t DataTypeHash(const DataType& dtype) {
+    return ::tvm::AttrsHash()(dtype);
+  }
+
+  using AttrsHashHandler::VisitAttr_;
+  size_t VisitAttr_(const Variable* var) final {
+    auto it = hash_map_.find(GetRef<VarExpr>(var));
+    if (it != hash_map_.end()) {
+      return it->second;
+    }
+
+
+    size_t hash = std::hash<std::string>()(var->_type_key);
+    return Combine(hash, std::hash<std::string>()(var->name_hint));
+  }
+
+  // Type hashing
+  size_t VisitType_(const TensorTypeNode* tensor_type) final {
+    size_t hash = std::hash<std::string>()(tensor_type->_type_key);
+    hash = Combine(hash, DataTypeHash(tensor_type->dtype));
+    hash = Combine(hash, Hash(tensor_type->shape));
+    return hash;
+  }
+
+  size_t VisitType_(const IncompleteTypeNode* incomplete) final {
+    size_t hash = std::hash<std::string>()(incomplete->_type_key);
+    return Combine(hash, std::hash<int>()(incomplete->kind));
+  }
+
+  size_t VisitType_(const TypeVarNode* tyvar) final {
+    /*
+      TypeVar/Var/Variable have two locations where they are hashed:
+
+        The declaration site of a function, let, or function type.
+        The first occurence in the term.
+
+      We will only reach this code if the TypeVar itself is unbound, we assign
+      a free variable index to it, meaning this hashing function implements
+      structural equality for both open (i.e graph equality) and closed terms
+      (i.e alpha_equality).
+    */
+    return BindVar(GetRef<TypeVar>(tyvar));
+  }
+
+  size_t VisitType_(const FuncTypeNode* func_type) final {
+    size_t hash = std::hash<std::string>()(func_type->_type_key);
+
+    for (auto type_param : func_type->type_params) {
+      hash = Combine(hash, BindVar(type_param));
+    }
+
+    for (auto arg : func_type->arg_types) {
+      hash = Combine(hash, TypeHash(arg));
+    }
+
+    hash = Combine(hash, TypeHash(func_type->ret_type));
+    for (auto cs : func_type->type_constraints) {
+      hash = Combine(hash, TypeHash(cs));
+    }
+
+    return hash;
+  }
+
+  size_t VisitType_(const TypeRelationNode* type_rel) final {
+    size_t hash = std::hash<std::string>()(type_rel->_type_key);
+    hash = Combine(hash, std::hash<std::string>()(type_rel->func->name));
+    hash = Combine(hash, AttrHash(type_rel->attrs));
+
+    for (auto arg : type_rel->args) {
+      hash = Combine(hash, TypeHash(arg));
+    }
+
+    return hash;
+  }
+
+  size_t VisitType_(const TupleTypeNode* tuple_type) final {
+    size_t hash = std::hash<std::string>()(tuple_type->_type_key);
+    for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+      hash = Combine(hash, TypeHash(tuple_type->fields[i]));
+    }
+    return hash;
+  }
+
+  // Expr hashing.
+  size_t NDArrayHash(const runtime::NDArray& array) {
+    size_t hash = std::hash<uint8_t>()(array->dtype.code);
+    hash = Combine(hash, std::hash<uint8_t>()(array->dtype.bits));
+    hash = Combine(hash, std::hash<uint16_t>()(array->dtype.lanes));
+    CHECK_EQ(array->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    size_t data_size = runtime::GetDataSize(*array.operator->());
+    uint8_t * data = reinterpret_cast<uint8_t*>(array->data);
+    for (size_t i = 0; i < data_size; i++) {
+      hash = Combine(hash, std::hash<uint8_t>()(data[i]));
+    }
+    return hash;
+  }
+
+  size_t BindVar(const NodeRef& var) {
+    size_t hash = std::hash<int>()(var_counter++);
+    CHECK_EQ(hash_map_.count(var), 0);
+    hash_map_[var] = hash;
+
+    const auto* ty_param = var.as<TypeVarNode>();
+    if (ty_param && ty_param->kind == TypeVarNode::Kind::kShapeVar) {
+      hash_map_[ty_param->var] = hash;
+    }
+    return hash;
+  }
+
+  size_t VisitExpr_(const VarNode* var) final {
+    size_t name_hash = std::hash<std::string>()(var->name_hint);
+    return Combine(name_hash, TypeHash(var->type_annotation));
+  }
+
+  size_t VisitExpr_(const GlobalVarNode* global) final {
+    return std::hash<std::string>()(global->name_hint);
+  }
+
+  size_t VisitExpr_(const TupleNode* tuple) final {
+    size_t hash = std::hash<std::string>()(tuple->_type_key);
+    for (size_t i = 0; i < tuple->fields.size(); i++) {
+      hash = Combine(hash, ExprHash(tuple->fields[i]));
+    }
+    return hash;
+  }
+
+  size_t VisitExpr_(const FunctionNode* func) final {
+    size_t hash = std::hash<std::string>()(func->_type_key);
+    for (auto type_param : func->type_params) {
+      hash = Combine(hash, BindVar(type_param));
+    }
+
+    for (auto param : func->params) {
+      hash = Combine(hash, BindVar(param));
+    }
+
+    hash = Combine(hash, TypeHash(func->ret_type));
+    hash =  Combine(hash, ExprHash(func->body));
+
+    return hash;
+  }
+
+  size_t VisitExpr_(const CallNode* call) final {
+    size_t hash = std::hash<std::string>()(call->_type_key);
+    hash = Combine(hash, ExprHash(call->op));
+
+    for (auto arg : call->args) {
+      hash = Combine(hash, ExprHash(arg));
+    }
+
+    hash = Combine(hash, AttrHash(call->attrs));
+
+    return hash;
+  }
+
+  size_t VisitExpr_(const LetNode* let) final {
+    size_t hash = std::hash<std::string>()(let->_type_key);
+    hash = Combine(hash, BindVar(let->var));
+    hash = Combine(hash, ExprHash(let->value));
+    hash = Combine(hash, ExprHash(let->body));
+    return hash;
+  }
+
+  size_t VisitExpr_(const IfNode* ite) final {
+    size_t hash = std::hash<std::string>()(ite->_type_key);
+    hash = Combine(hash, ExprHash(ite->cond));
+    hash = Combine(hash, ExprHash(ite->true_branch));
+    hash = Combine(hash, ExprHash(ite->false_branch));
+    return hash;
+  }
+
+  size_t VisitExpr_(const OpNode* op) final {
+    return GetRef<Op>(op).hash();
+  }
+
+  size_t VisitExpr_(const ConstantNode* rconst) final {
+    return NDArrayHash(rconst->data);
+  }
+
+  size_t VisitExpr_(const TupleGetItemNode* get_item) final {
+    size_t hash = std::hash<std::string>()(get_item->_type_key);
+    hash = Combine(hash, ExprHash(get_item->tuple));
+    hash = Combine(hash, std::hash<int>()(get_item->index));
+    return hash;
+  }
+
+ private:
+  // renaming of NodeRef to indicate two nodes equals to each other
+  std::unordered_map<NodeRef, size_t, NodeHash, NodeEqual> hash_map_;
+  int var_counter = 0;
+};
+
+size_t StructuralHash(const Type& type) {
+  return RelayHashHandler().TypeHash(type);
+}
+
+size_t StructuralHash(const Expr& expr) {
+  return RelayHashHandler().ExprHash(expr);
+}
+
+TVM_REGISTER_API("relay._ir_pass._expr_hash")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = static_cast<int64_t>(RelayHashHandler().Hash(args[0]));
+  });
+
+TVM_REGISTER_API("relay._ir_pass._type_hash")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = static_cast<int64_t>(RelayHashHandler().TypeHash(args[0]));
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index d16c2df53435..5158d5c7cc9c 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -1,7 +1,14 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay.ir_pass import alpha_equal
+from tvm.relay import ir_pass
+
+def alpha_equal(x, y):
+    """
+    Wrapper around alpha equality which ensures that
+    the hash function respects equality.
+    """
+    return ir_pass.alpha_equal(x, y) and ir_pass.structural_hash(x) == ir_pass.structural_hash(y)
 
 def test_tensor_type_alpha_equal():
     t1 = relay.TensorType((3, 4), "float32")

From 824db6fb60665a744e52e15221870c75afa14131 Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Thu, 25 Oct 2018 18:30:23 -0700
Subject: [PATCH 283/529] fix typo in resnet definition (#1995)

---
 python/tvm/relay/testing/resnet.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py
index cb489f6e2471..9ba57ae09ef5 100644
--- a/python/tvm/relay/testing/resnet.py
+++ b/python/tvm/relay/testing/resnet.py
@@ -60,7 +60,7 @@ def residual_unit(data,
         bn1 = layers.batch_norm_infer(data=data,
                                       epsilon=2e-5,
                                       name=name + '_bn1')
-        act1 = relay.relu(data=bn1)
+        act1 = relay.nn.relu(data=bn1)
         conv1 = layers.conv2d(
             data=act1,
             channels=int(num_filter*0.25),
@@ -69,12 +69,12 @@ def residual_unit(data,
             padding=(0, 0),
             name=name + '_conv1')
         bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2')
-        act2 = relay.relu(data=bn2)
+        act2 = relay.nn.relu(data=bn2)
         conv2 = layers.conv2d(
             data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3),
             strides=(1, 1), padding=(1, 1), name=name + '_conv2')
         bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, name=name + '_bn3')
-        act3 = relay.relu(data=bn3)
+        act3 = relay.nn.relu(data=bn3)
         conv3 = layers.conv2d(
             data=act3, channels=num_filter, kernel_size=(1, 1),
             strides=(1, 1), padding=(0, 0), name=name + '_conv3')
@@ -120,7 +120,7 @@ def resnet(units,
         Number of units in each stage
 
     num_stages : int
-        Number of stage
+        Number of stages
 
     filter_list : list
         Channel size of each stage

From 096fa4863d42a86bb15862b02a0924b94bdb15d1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 25 Oct 2018 22:54:55 -0700
Subject: [PATCH 284/529] [RELAY] Fix compilation under clang-4.0 (#1998)

---
 src/relay/ir/hash.cc | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
index 3aa567a4892e..ce2049f269df 100644
--- a/src/relay/ir/hash.cc
+++ b/src/relay/ir/hash.cc
@@ -100,26 +100,24 @@ class RelayHashHandler:
 
   using AttrsHashHandler::VisitAttr_;
   size_t VisitAttr_(const Variable* var) final {
+    size_t hash = std::hash<std::string>()(Variable::_type_key);
     auto it = hash_map_.find(GetRef<VarExpr>(var));
     if (it != hash_map_.end()) {
       return it->second;
     }
-
-
-    size_t hash = std::hash<std::string>()(var->_type_key);
     return Combine(hash, std::hash<std::string>()(var->name_hint));
   }
 
   // Type hashing
   size_t VisitType_(const TensorTypeNode* tensor_type) final {
-    size_t hash = std::hash<std::string>()(tensor_type->_type_key);
+    size_t hash = std::hash<std::string>()(TensorTypeNode::_type_key);
     hash = Combine(hash, DataTypeHash(tensor_type->dtype));
     hash = Combine(hash, Hash(tensor_type->shape));
     return hash;
   }
 
   size_t VisitType_(const IncompleteTypeNode* incomplete) final {
-    size_t hash = std::hash<std::string>()(incomplete->_type_key);
+    size_t hash = std::hash<std::string>()(IncompleteTypeNode::_type_key);
     return Combine(hash, std::hash<int>()(incomplete->kind));
   }
 
@@ -139,7 +137,7 @@ class RelayHashHandler:
   }
 
   size_t VisitType_(const FuncTypeNode* func_type) final {
-    size_t hash = std::hash<std::string>()(func_type->_type_key);
+    size_t hash = std::hash<std::string>()(FuncTypeNode::_type_key);
 
     for (auto type_param : func_type->type_params) {
       hash = Combine(hash, BindVar(type_param));
@@ -158,7 +156,7 @@ class RelayHashHandler:
   }
 
   size_t VisitType_(const TypeRelationNode* type_rel) final {
-    size_t hash = std::hash<std::string>()(type_rel->_type_key);
+    size_t hash = std::hash<std::string>()(TypeRelationNode::_type_key);
     hash = Combine(hash, std::hash<std::string>()(type_rel->func->name));
     hash = Combine(hash, AttrHash(type_rel->attrs));
 
@@ -170,7 +168,7 @@ class RelayHashHandler:
   }
 
   size_t VisitType_(const TupleTypeNode* tuple_type) final {
-    size_t hash = std::hash<std::string>()(tuple_type->_type_key);
+    size_t hash = std::hash<std::string>()(TupleTypeNode::_type_key);
     for (size_t i = 0; i < tuple_type->fields.size(); i++) {
       hash = Combine(hash, TypeHash(tuple_type->fields[i]));
     }
@@ -213,7 +211,7 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const TupleNode* tuple) final {
-    size_t hash = std::hash<std::string>()(tuple->_type_key);
+    size_t hash = std::hash<std::string>()(TupleNode::_type_key);
     for (size_t i = 0; i < tuple->fields.size(); i++) {
       hash = Combine(hash, ExprHash(tuple->fields[i]));
     }
@@ -221,7 +219,7 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const FunctionNode* func) final {
-    size_t hash = std::hash<std::string>()(func->_type_key);
+    size_t hash = std::hash<std::string>()(FunctionNode::_type_key);
     for (auto type_param : func->type_params) {
       hash = Combine(hash, BindVar(type_param));
     }
@@ -237,7 +235,7 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const CallNode* call) final {
-    size_t hash = std::hash<std::string>()(call->_type_key);
+    size_t hash = std::hash<std::string>()(CallNode::_type_key);
     hash = Combine(hash, ExprHash(call->op));
 
     for (auto arg : call->args) {
@@ -250,7 +248,7 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const LetNode* let) final {
-    size_t hash = std::hash<std::string>()(let->_type_key);
+    size_t hash = std::hash<std::string>()(LetNode::_type_key);
     hash = Combine(hash, BindVar(let->var));
     hash = Combine(hash, ExprHash(let->value));
     hash = Combine(hash, ExprHash(let->body));
@@ -258,7 +256,8 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const IfNode* ite) final {
-    size_t hash = std::hash<std::string>()(ite->_type_key);
+    size_t key = std::hash<std::string>()(IfNode::_type_key);
+    size_t hash = key;
     hash = Combine(hash, ExprHash(ite->cond));
     hash = Combine(hash, ExprHash(ite->true_branch));
     hash = Combine(hash, ExprHash(ite->false_branch));
@@ -274,7 +273,7 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const TupleGetItemNode* get_item) final {
-    size_t hash = std::hash<std::string>()(get_item->_type_key);
+    size_t hash = std::hash<std::string>()(TupleGetItemNode::_type_key);
     hash = Combine(hash, ExprHash(get_item->tuple));
     hash = Combine(hash, std::hash<int>()(get_item->index));
     return hash;

From f7b9f3b1797642c35b7f598fb1457bfce8fa1275 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sat, 27 Oct 2018 00:45:16 +0530
Subject: [PATCH 285/529] [RELAY][OP] Split (#1876)

---
 docs/langref/relay_op.rst            |  2 +
 include/tvm/relay/attrs/transform.h  | 16 +++++
 nnvm/src/top/tensor/transform.cc     |  2 +-
 python/tvm/relay/expr.py             | 11 ++++
 python/tvm/relay/op/transform.py     | 35 +++++++++-
 src/lang/attr_functor.h              |  4 ++
 src/lang/attrs.cc                    |  2 +
 src/relay/op/tensor/transform.cc     | 97 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 33 ++++++++++
 9 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 42883f5f77da..11fb282abac5 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -94,6 +94,7 @@ This level enables additional math and transform operators.
    tvm.relay.full
    tvm.relay.full_like
    tvm.relay.cast
+   tvm.relay.split
 
 
 **Level 4: Broadcast and Reductions**
@@ -198,6 +199,7 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.full
 .. autofunction:: tvm.relay.full_like
 .. autofunction:: tvm.relay.cast
+.. autofunction:: tvm.relay.split
 
 
 Level 4 Definitions
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index b0150c4ac3d9..dfad1013701f 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -106,6 +106,22 @@ struct SqueezeAttrs : public tvm::AttrsNode<SqueezeAttrs> {
   }
 };  // struct SqueezeAttrs
 
+struct SplitAttrs : public tvm::AttrsNode<SplitAttrs> {
+  NodeRef indices_or_sections;
+  int axis;
+
+  TVM_DECLARE_ATTRS(SplitAttrs, "relay.attrs.SplitAttrs") {
+    TVM_ATTR_FIELD(indices_or_sections)
+        .describe("Indices or sections to split into. Accepts an int or a tuple"
+                  "If indices_or_sections is an integer, the input will be divided equally"
+                  "along given axis. If such a split is not possible, an error is raised."
+                  "If indices_or_sections is a tuple of sorted integers,"
+                  "the entries indicate where along axis the array is split.");
+    TVM_ATTR_FIELD(axis).set_default(0)
+        .describe("the axis to be splitted.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index a8159b539410..8e35039a8085 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -427,7 +427,7 @@ along which to split the array.
       return Array<Tensor>{ topi::split(inputs[0], indices, param.axis) };
     }
 })
-.set_support_level(1);
+.set_support_level(3);
 
 // cast
 DMLC_REGISTER_PARAMETER(CastParam);
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 655379066c74..0650a493d9a6 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -5,6 +5,7 @@
 import numpy as _np
 from .base import RelayNode, register_relay_node
 from . import _make
+from . import _expr
 from . import ty as _ty
 from .._ffi import base as _base
 from .. import nd as _nd
@@ -284,6 +285,16 @@ def astuple(self):
         as an argument to an FFI function."""
         return self.tuple_value
 
+    def astext(self):
+        """Get the text format of the tuple expression.
+
+        Returns
+        -------
+        text : str
+            The text format of the tuple expression.
+        """
+        return _expr._text_print(self.tuple_value)
+
     def __getitem__(self, index):
         if index >= len(self):
             raise IndexError("Tuple index out of range")
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 84e2398f0a9e..3cf139c7dd86 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1,6 +1,7 @@
 """Transform operators."""
 
 from . import _make
+from ..expr import TupleWrapper
 
 
 def expand_dims(data, axis, num_newaxis=1):
@@ -146,7 +147,7 @@ def take(data, indices, axis=None):
 
     Parameters
     ----------
-    a : relay.Expr
+    data : relay.Expr
         The source array.
 
     indices : rely.Expr
@@ -280,3 +281,35 @@ def collapse_sum_like(data, collapse_type):
         The resulting tensor.
     """
     return _make.collapse_sum_like(data, collapse_type)
+
+
+def split(data, indices_or_sections, axis=0):
+    """Split input tensor along axis by sections or indices.
+
+    If indices_or_sections is an integer, the input will be divided equally
+    along given axis. If such a split is not possible, an error is raised.
+
+    If indices_or_sections is a tuple of sorted integers,
+    the entries indicate where along axis the array is split.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source array.
+
+    indices_or_sections : int or tuple of int
+        Indices or sections to split into. Accepts an int or a tuple
+
+    axis : int, optional
+        The axis over which to split.
+
+    Returns
+    -------
+    ret : relay.Tuple([relay.Expr, relay.Expr])
+        The computed result.
+    """
+    if isinstance(indices_or_sections, int):
+        ret_size = indices_or_sections
+    else:
+        ret_size = len(indices_or_sections) + 1
+    return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size)
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index ef1d061015c3..9257ad3b5490 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -64,6 +64,7 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
   virtual R VisitAttr_(const ir::Add* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::Sub* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::Mul* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Div* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::Mod* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::Min* op, Args... args) ATTR_FUNCTOR_DEFAULT;
   virtual R VisitAttr_(const ir::Max* op, Args... args) ATTR_FUNCTOR_DEFAULT;
@@ -96,6 +97,7 @@ class AttrFunctor<R(const NodeRef& n, Args...)> {
     ATTR_FUNCTOR_DISPATCH(Add);
     ATTR_FUNCTOR_DISPATCH(Sub);
     ATTR_FUNCTOR_DISPATCH(Mul);
+    ATTR_FUNCTOR_DISPATCH(Div);
     ATTR_FUNCTOR_DISPATCH(Min);
     ATTR_FUNCTOR_DISPATCH(Max);
     ATTR_FUNCTOR_DISPATCH(GE);
@@ -135,6 +137,7 @@ class AttrsEqualHandler :
   bool VisitAttr_(const ir::Add* lhs, const NodeRef& other) final;
   bool VisitAttr_(const ir::Sub* lhs, const NodeRef& other) final;
   bool VisitAttr_(const ir::Mul* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Div* lhs, const NodeRef& other) final;
   bool VisitAttr_(const ir::Mod* lhs, const NodeRef& other) final;
   bool VisitAttr_(const ir::Min* lhs, const NodeRef& other) final;
   bool VisitAttr_(const ir::Max* lhs, const NodeRef& other) final;
@@ -174,6 +177,7 @@ class AttrsHashHandler :
   size_t VisitAttr_(const ir::Add* op) final;
   size_t VisitAttr_(const ir::Sub* op) final;
   size_t VisitAttr_(const ir::Mul* op) final;
+  size_t VisitAttr_(const ir::Div* op) final;
   size_t VisitAttr_(const ir::Mod* op) final;
   size_t VisitAttr_(const ir::Min* op) final;
   size_t VisitAttr_(const ir::Max* op) final;
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 9aa067c09679..3b273f4939ef 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -132,6 +132,7 @@ bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const NodeRef& other)
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Add);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Sub);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Mul);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Div);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Mod);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Max);
 TVM_DEFINE_ATTRS_BINOP_EQUAL(Min);
@@ -243,6 +244,7 @@ size_t AttrsHashHandler::VisitAttr_(const StrMapNode* lhs) {
 TVM_DEFINE_ATTRS_BINOP_HASH(Add);
 TVM_DEFINE_ATTRS_BINOP_HASH(Sub);
 TVM_DEFINE_ATTRS_BINOP_HASH(Mul);
+TVM_DEFINE_ATTRS_BINOP_HASH(Div);
 TVM_DEFINE_ATTRS_BINOP_HASH(Mod);
 TVM_DEFINE_ATTRS_BINOP_HASH(Max);
 TVM_DEFINE_ATTRS_BINOP_HASH(Min);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 29dff1e4ba27..d7b4980f80b2 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -6,12 +6,14 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/transform.h>
 #include <tvm/ir_operator.h>
+#include <tvm/ir.h>
 #include <vector>
 #include "../op_common.h"
 
 
 namespace tvm {
 namespace relay {
+using ir::IntImm;
 
 // relay.cast
 TVM_REGISTER_NODE_TYPE(CastAttrs);
@@ -834,5 +836,100 @@ RELAY_REGISTER_OP("broadcast_to_like")
 .set_support_level(10)
 .add_type_rel("BroadCastToLike", BroadCastToLikeRel);
 
+// Split
+TVM_REGISTER_NODE_TYPE(SplitAttrs);
+
+bool SplitRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  // `types` contains: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  CHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty";
+  const auto param = attrs.as<SplitAttrs>();
+  CHECK(param != nullptr);
+  auto axis = param->axis;
+  if (axis < 0) {
+    axis += data->shape.size();
+  }
+  CHECK_LT(axis, data->shape.size())
+    << "axis should be within the input dimension range.";
+  CHECK_GT(axis, 0)
+    << "axis should be within the input dimension range.";
+
+  if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
+    CHECK(reporter->Assert(data->shape[axis] %
+                           sections->value == make_zero(Int(64))))
+        << "indices_or_sections need to be able to divide input.shape[axis]";
+    std::vector<Type> fields;
+    for (int i = 0; i < sections->value; ++i) {
+        std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+        oshape[axis] /= int32_t(sections->value);
+        auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+        fields.push_back(vec_type);
+    }
+    reporter->Assign(types[1], TupleTypeNode::make(Array<Type>(fields)));
+  } else {
+    auto indices = param->indices_or_sections.as<ArrayNode>()->data;
+    auto begin = IndexExpr(make_zero(Int(32)));
+    std::vector<Type> fields;
+    for (uint i = 0; i < indices.size(); ++i) {
+      CHECK(reporter->Assert(IndexExpr(indices[i]) > begin))
+          << "indices_or_sections need to be a sorted ascending list";
+      std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+      oshape[axis] = IndexExpr(indices[i]) - begin;
+      begin = IndexExpr(indices[i]);
+      auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+      fields.push_back(vec_type);
+    }
+    CHECK(reporter->Assert(begin < data->shape[axis]))
+        << "The sum of sections must match the input.shape[axis]";
+    std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+    oshape[axis] = data->shape[axis] - begin;
+    auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+    fields.push_back(vec_type);
+    reporter->Assign(types[1], TupleTypeNode::make(Array<Type>(fields)));
+  }
+  return true;
+}
+
+Expr MakeSplit(Expr data,
+               NodeRef indices_or_sections,
+               int axis) {
+  auto attrs = make_node<SplitAttrs>();
+  attrs->axis = axis;
+  attrs->indices_or_sections = std::move(indices_or_sections);
+  static const Op& op = Op::Get("split");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.split")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    if (args.type_codes[1] == kDLInt) {
+      *rv = MakeSplit(args[0], make_const(Int(64), int64_t(args[1])), args[2]);
+    } else {
+      *rv = MakeSplit(args[0], args[1], args[2]);
+    }
+});
+
+RELAY_REGISTER_OP("split")
+.describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
+
+Indices or sections to split into. Accepts an int or a tuple
+If indices_or_sections is an integer, the input will be divided equally
+along given axis. If such a split is not possible, an error is raised.
+
+If indices_or_sections is a tuple of sorted integers,
+the entries indicate where along axis the array is split.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SplitAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Split", SplitRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 8ab3c41c079d..804d3c46ca36 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -107,6 +107,38 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
     verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1)
     verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2)
 
+def test_split_infer_type():
+    def verify_split(dshape, indices_or_sections, ret_type, axis=None):
+        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
+        y = relay.split(x, indices_or_sections, axis=axis)
+        y.astext()
+        yy = relay.ir_pass.infer_type(y.astuple())
+        assert yy.checked_type == ret_type
+
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    axis = tvm.var("axis")
+    verify_split((5, 5, 2, 2), 5,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32")])),
+                  axis=1)
+    verify_split((d1, d2, d3, d4), 4,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32")])),
+                  axis=2)
+    verify_split((d1, d2, d3, d4), (2, 4, 7),
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, 3, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, (d2-7), d3, d4), "float32")])),
+                  axis=1)
 
 def test_full():
     # default settings: match input dtype
@@ -161,3 +193,4 @@ def test_infer_type_leaky_relu():
     test_infer_type_leaky_relu()
     test_squeeze_infer_type()
     test_squeeze_bad_axes_infer_type()
+    test_split_infer_type()

From 27d30f299c1aa4984d604cf1f8c0e1ea16eab38b Mon Sep 17 00:00:00 2001
From: Yang Chen <40417152+yangchen-MS@users.noreply.github.com>
Date: Fri, 26 Oct 2018 16:38:32 -0700
Subject: [PATCH 286/529] initialize base class in copy constructors (#2006)

GCC issues warnings with -Wextra if we don't explicitly initialize
base class in copy constructors. This commit fixed the issue.
---
 include/tvm/runtime/packed_func.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index c306f8d15160..c2098636f687 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -603,7 +603,7 @@ class TVMRetValue : public TVMPODValue_ {
   using TVMPODValue_::operator TVMContext;
   using TVMPODValue_::operator NDArray;
   // Disable copy and assign from another value, but allow move.
-  TVMRetValue(const TVMRetValue& other) {
+  TVMRetValue(const TVMRetValue& other) : TVMPODValue_() {
     this->Assign(other);
   }
   // conversion operators

From cf39ff176834c30209bd57486c2b686bec2f2177 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Fri, 26 Oct 2018 20:11:31 -0700
Subject: [PATCH 287/529] [RELAY] Add occurs check before unification (#2012)

---
 src/relay/pass/type_solver.cc           |  5 +++++
 tests/cpp/relay_pass_type_infer_test.cc | 22 ++++++++++++++++++++++
 tests/python/relay/test_type_infer.py   | 17 +++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 tests/cpp/relay_pass_type_infer_test.cc

diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index 67378c5d14a6..3ca161d23f72 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -61,6 +61,11 @@ Type TypeSolver::Unify(const Type& dst, const Type& src) {
   // - handle shape pattern matching
   TypeNode* lhs = GetTypeNode(dst);
   TypeNode* rhs = GetTypeNode(src);
+
+  // do occur check so we don't create self-referencing structure
+  if (lhs->FindRoot() == rhs->FindRoot()) {
+    return lhs->resolved_type;
+  }
   if (lhs->resolved_type.as<IncompleteTypeNode>()) {
     MergeFromTo(lhs, rhs);
     return rhs->resolved_type;
diff --git a/tests/cpp/relay_pass_type_infer_test.cc b/tests/cpp/relay_pass_type_infer_test.cc
new file mode 100644
index 000000000000..e1a81d3c0535
--- /dev/null
+++ b/tests/cpp/relay_pass_type_infer_test.cc
@@ -0,0 +1,22 @@
+#include <gtest/gtest.h>
+#include <tvm/tvm.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/type.h>
+#include <tvm/relay/pass.h>
+
+TEST(Relay, SelfReference) {
+  using namespace tvm;
+  auto type_a = relay::TypeVarNode::make("a", relay::TypeVarNode::kType);
+  auto type_b = relay::TypeVarNode::make("b", relay::TypeVarNode::kType);
+  auto x = relay::VarNode::make("x", type_a);
+  auto f = relay::FunctionNode::make(tvm::Array<relay::Var>{ x }, x, type_b, Array<relay::TypeVar>{});
+  auto fx = relay::CallNode::make(f, Array<relay::Expr>{ x });
+  auto type_fx = relay::InferType(fx, relay::EnvironmentNode::make(Map<relay::GlobalVar, relay::Function>{}));
+  CHECK_EQ(type_fx->checked_type(), type_a);
+}
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index e1d749e75863..8f92fc0f5192 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -107,6 +107,22 @@ def test_type_args():
     assert sh2[0].value == 1
     assert sh2[1].value == 10
 
+def test_self_reference():
+    """
+    Program:
+       def f(x) {
+           return x;
+       }
+    """
+    a = relay.TypeVar("a")
+    x = relay.var("x", a)
+    sb = relay.ScopeBuilder()
+    f = relay.Function([x], x)
+    fx = relay.Call(f, [x])
+    assert relay.ir_pass.infer_type(x).checked_type == a
+    assert relay.ir_pass.infer_type(f).checked_type == relay.FuncType([a], a)
+    assert relay.ir_pass.infer_type(fx).checked_type == a
+
 if __name__ == "__main__":
     test_free_expr()
     test_dual_op()
@@ -117,3 +133,4 @@ def test_type_args():
     test_tuple()
     test_free_expr()
     test_type_args()
+    test_self_reference()

From 2563f36add868c0ae7b08d25733c0eaa1d9174c3 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sat, 27 Oct 2018 08:44:15 +0530
Subject: [PATCH 288/529] [RELAY]reshape_like (#1950)

---
 docs/langref/relay_op.rst            |  2 +
 include/tvm/relay/type.h             |  5 +++
 python/tvm/relay/op/transform.py     | 23 ++++++++++++
 src/relay/ir/type.cc                 | 12 ++++++
 src/relay/op/tensor/transform.cc     | 56 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 17 +++++++++
 6 files changed, 115 insertions(+)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 11fb282abac5..d1549cd8326e 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -78,6 +78,7 @@ This level enables additional math and transform operators.
    tvm.relay.ones
    tvm.relay.ones_like
    tvm.relay.reshape
+   tvm.relay.reshape_like
    tvm.relay.copy
    tvm.relay.transpose
    tvm.relay.floor
@@ -189,6 +190,7 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.abs
 .. autofunction:: tvm.relay.negative
 .. autofunction:: tvm.relay.reshape
+.. autofunction:: tvm.relay.reshape_like
 .. autofunction:: tvm.relay.copy
 .. autofunction:: tvm.relay.transpose
 .. autofunction:: tvm.relay.take
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index a2d15a05d454..0b61004f9a66 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -82,6 +82,11 @@ class TensorTypeNode : public BaseTensorTypeNode {
     v->Visit("span", &span);
   }
 
+  /*! \brief Return product of elements in the shape.
+   *  \return (d1 * d_2 ... * d_n) if shape is (d_1, d_2, ..., d_n) and 1 if shape size is zero.
+   */
+  TVM_DLL IndexExpr Size() const;
+
   TVM_DLL static TensorType make(Array<IndexExpr> shape, DataType dtype);
 
   /*! \brief Construct an scalar containing elements of dtype.  */
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 3cf139c7dd86..9d14463a530c 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -142,6 +142,29 @@ def reshape(data, newshape):
     return _make.reshape(data, list(newshape))
 
 
+def reshape_like(data, shape_like):
+    """Reshapes the input array by the size of another array.
+    For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
+    the input array into an output array with the same shape as the second input array.
+    .. note::
+    Sizes for both array should be compatible.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    shape_like : tuple of int
+        The new shape. Should be compatible with the original shape.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    return _make.reshape_like(data, shape_like)
+
+
 def take(data, indices, axis=None):
     """Take elements from an array along an axis.
 
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index d6fc2e85b2d8..bbe6472609df 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -22,6 +22,18 @@ TensorType TensorTypeNode::Scalar(DataType dtype) {
   return TensorTypeNode::make({}, dtype);
 }
 
+IndexExpr TensorTypeNode::Size() const {
+  if (shape.size() == 0) {
+    return make_const(Int(64), 1);
+  }
+
+  IndexExpr size = shape[0];
+  for (size_t i = 1; i < shape.size(); ++i) {
+    size *= shape[i];
+  }
+  return size;
+}
+
 TVM_REGISTER_NODE_TYPE(TensorTypeNode);
 
 TVM_REGISTER_API("relay._make.TensorType")
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index d7b4980f80b2..5faa0805426a 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -377,6 +377,62 @@ Example::
 .set_support_level(3)
 .add_type_rel("Reshape", ReshapeRel);
 
+
+/*!
+* \brief ReshapeLikeRel User defined type constraint function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return False if the relation has not been resolved, it might be resolved later.
+*  True if this relation has been resolved.
+*/
+bool ReshapeLikeRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* reshape_like = types[1].as<TensorTypeNode>();
+  if (reshape_like == nullptr) {
+    return false;
+  }
+  CHECK(reporter->AssertEQ(data->Size(), reshape_like->Size()))
+    << "Reshape inputs size should be compatible.";
+  reporter->Assign(types[2], TensorTypeNode::make(reshape_like->shape, data->dtype));
+  return true;
+}
+
+
+Expr MakeReshapeLike(Expr data,
+                     Expr shape_like) {
+  static const Op& op = Op::Get("reshape_like");
+  return CallNode::make(op, {data, shape_like}, Attrs(), {});
+}
+
+
+TVM_REGISTER_API("relay.op._make.reshape_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeReshapeLike, args, rv);
+});
+
+
+RELAY_REGISTER_OP("reshape_like")
+.describe(R"code(Reshapes the input array by the size of another array.
+For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
+the input array into an output array with the same shape as the second input array.
+.. note::
+    Sizes for both array should be compatible.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("shape_like", "Tensor", "Shape tensor.")
+.set_support_level(3)
+.add_type_rel("ReshapeLike", ReshapeLikeRel);
+
+
 // Take
 TVM_REGISTER_NODE_TYPE(TakeAttrs);
 
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 804d3c46ca36..2ee6f758f100 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -88,6 +88,22 @@ def test_reshape_infer_type():
         (n, t, 2000), "float32")
 
 
+def test_reshape_like():
+    # concrete shape
+    x = relay.var("x", relay.TensorType((1, 2, 3), "float32"))
+    y = relay.var("y", relay.TensorType((1,6), "float32"))
+    z = relay.reshape_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((1, 6), "float32")
+
+    # symbolic shape
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.var("y", relay.TensorType((1, 8, 8), "float32"))
+    z = relay.reshape_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((1, 8, 8), "float32")
+
 
 def test_take_infer_type():
     def verify_take(dshape, indices_shape, oshape, axis=None):
@@ -187,6 +203,7 @@ def test_infer_type_leaky_relu():
     test_clip_type()
     test_transpose_infer_type()
     test_reshape_infer_type()
+    test_reshape_like()
     test_take_infer_type()
     test_full()
     test_full_like()

From f4b038398b475f36bb245d33fc137beba375d4ac Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Sat, 27 Oct 2018 11:18:24 +0800
Subject: [PATCH 289/529] [TOPI][CUDA] batched int8 conv2d (#1961)

---
 topi/python/topi/cuda/conv2d_int8.py       | 56 ++++++++++++++--------
 topi/tests/python/test_topi_conv2d_int8.py |  4 ++
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index 053c9bc6bd31..9d3757c35fbb 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -9,7 +9,7 @@
 from ..nn.conv2d import conv2d_NCHWc_int8_prepacked
 from ..nn.pad import pad
 from ..nn.util import get_pad_tuple
-from ..util import get_const_tuple, get_const_int, traverse_inline
+from ..util import get_const_tuple, traverse_inline
 
 
 def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype):
@@ -183,7 +183,7 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
             _schedule_injective(packed_data.op, s)
             _schedule_injective(packed_kernel.op, s)
     else:
-        kernel = packed_data
+        kernel = packed_kernel
 
     if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
         s[kernel].compute_inline()
@@ -191,7 +191,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
     if pad_data != packed_data:
         s[pad_data].compute_inline()
 
-    batch = get_const_int(packed_data.shape[0])
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
@@ -210,33 +209,50 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
 
     # tile and bind spatial axes
     n, f, y, x, c = s[output].op.axis
+    cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
     cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
     cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
     cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
 
+    # this is the scope to attach global config inside this kernel
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
     bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
     by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
     bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
 
-    # this is the scope to attach global config inside this kernel
-    kernel_scope, n = s[output].split(n, nparts=1)
-
-    max_block_z = 128
-    if batch > max_block_z:
-        _, n = s[output].split(n, factor=max_block_z)
-    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-    fused_byx = s[output].fuse(by, bx)
-    s[output].bind(n, tvm.thread_axis("blockIdx.z"))
+    s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
+    s[output].bind(bn, tvm.thread_axis("blockIdx.z"))
     s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
-    s[output].bind(fused_byx, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vn, tvm.thread_axis("vthread"))
     s[output].bind(vf, tvm.thread_axis("vthread"))
     s[output].bind(vy, tvm.thread_axis("vthread"))
     s[output].bind(vx, tvm.thread_axis("vthread"))
-    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
-    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
-    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
 
-    s[conv].compute_at(s[output], tx)
+    cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf
+    if cfg["fuse_yx"].val:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(tf, tvm.thread_axis("threadIdx.y"))
+        tyx = s[output].fuse(ty, tx)
+        s[output].bind(tyx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tyx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2]
+        n_ty = cfg["tile_f"].size[2]
+        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
+    else:
+        s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
+        n_ty = cfg["tile_y"].size[2]
+        n_tx = cfg["tile_x"].size[2]
 
     # tile and bind reduction axes
     n, f, y, x, c = s[conv].op.axis
@@ -272,9 +288,9 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
             fused = s[load].fuse(n, f, y, x, oc_chunk)
             s[load].vectorize(c)
 
-        fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2])
-        fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2])
-        fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2])
+        fused, tx = s[load].split(fused, factor=n_tx)
+        fused, ty = s[load].split(fused, factor=n_ty)
+        fused, tz = s[load].split(fused, factor=n_tz)
         s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
         s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
         s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
index af2d9e2046c4..93a0587c64ff 100644
--- a/topi/tests/python/test_topi_conv2d_int8.py
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -172,6 +172,10 @@ def test_conv2d_nchw():
         verify_conv2d_NCHWc_int8(1, 2048,   8, 192, 1, 1, 0)
         verify_conv2d_NCHWc_int8(1, 1024,  19,  84, 3, 1, 1)
 
+        # batch > 1
+        verify_conv2d_NCHWc_int8(7,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(8,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(32,  32, 149,  32, 3, 1, 0)
 
 if __name__ == "__main__":
     test_conv2d_nchw()

From 21dc6a4c15887764d86036f295eff1789af7c01e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 27 Oct 2018 17:22:19 -0700
Subject: [PATCH 290/529] [RELAY][OP] Fix conv2d NHWC type inference. (#2019)

---
 include/tvm/packed_func_ext.h        |  8 +++++---
 src/relay/ir/op.cc                   |  5 +++++
 src/relay/op/nn/convolution.cc       | 20 ++++++++++++--------
 src/relay/op/nn/layout.h             |  4 +---
 tests/python/relay/test_op_level2.py | 21 ++++++++++++++++++---
 5 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index c5a83608c617..45366f3ad55a 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -35,6 +35,8 @@ struct NodeTypeChecker {
     // It can be turned off, but will make non strict checking.
     // TODO(tqchen) possibly find alternative to turn of RTTI
     using ContainerType = typename T::ContainerType;
+    // always allow nullptr.
+    if (sptr == nullptr) return true;
     return sptr->derived_from<ContainerType>();
   }
   static inline void PrintName(std::ostringstream& os) { // NOLINT(*)
@@ -46,7 +48,7 @@ struct NodeTypeChecker {
 template<typename T>
 struct NodeTypeChecker<Array<T> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<ArrayNode>()) return false;
     ArrayNode* n = static_cast<ArrayNode*>(sptr);
     for (const auto& p : n->data) {
@@ -64,7 +66,7 @@ struct NodeTypeChecker<Array<T> > {
 template<typename V>
 struct NodeTypeChecker<Map<std::string, V> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<StrMapNode>()) return false;
     StrMapNode* n = static_cast<StrMapNode*>(sptr);
     for (const auto& kv : n->data) {
@@ -83,7 +85,7 @@ struct NodeTypeChecker<Map<std::string, V> > {
 template<typename K, typename V>
 struct NodeTypeChecker<Map<K, V> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<MapNode>()) return false;
     MapNode* n = static_cast<MapNode*>(sptr);
     for (const auto& kv : n->data) {
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
index 4826aed54ba5..96e805b5af2f 100644
--- a/src/relay/ir/op.cc
+++ b/src/relay/ir/op.cc
@@ -150,5 +150,10 @@ TVM_REGISTER_NODE_TYPE(OpNode)
     return static_cast<const OpNode*>(n)->name;
   });
 
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<OpNode>([](const OpNode* node, tvm::IRPrinter* p) {
+    p->stream << "Op(" << node->name << ")";
+  });
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index b573a2981c39..8e1d9db50e7e 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -21,7 +21,6 @@ bool Conv2DRel(const Array<Type>& types,
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* weight = types[1].as<TensorTypeNode>();
   if (data == nullptr) return false;
-
   static const Layout kNCHW("NCHW");
   static const Layout kOIHW("OIHW");
 
@@ -42,14 +41,17 @@ bool Conv2DRel(const Array<Type>& types,
       << "Conv only support output layouts that are convertible from NCHW."
       << " But got " << out_layout;
 
+  std::vector<IndexExpr> dshape_nchw = ConvertLayout(
+      data->shape, in_layout, kNCHW);
+
   IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
   // infer weight if the kernel_size and channels are defined
   if (param->kernel_size.defined() && param->channels.defined()) {
     CHECK_EQ(param->kernel_size.size(), 2);
     CHECK_EQ(param->dilation.size(), 2);
     std::vector<IndexExpr> wshape(
-        {param->channels / param->groups,
-         data->shape[1] / param->groups,
+       {param->channels / param->groups,
+         dshape_nchw[1] / param->groups,
          param->kernel_size[0],
          param->kernel_size[1]});
     wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
@@ -78,16 +80,16 @@ bool Conv2DRel(const Array<Type>& types,
           << " channels=" << param->channels
           << " wshape=" << Array<IndexExpr>(wshape);
     }
-    CHECK(reporter->AssertEQ(data->shape[1] / param->groups, wshape[1]));
+    CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[1]));
     channels = wshape[0];
     dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0];
     dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1];
   }
   // dilation
-  std::vector<IndexExpr> oshape({data->shape[0], channels, 0, 0});
+  std::vector<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0});
 
-  oshape[2] = (data->shape[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1;
-  oshape[3] = (data->shape[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1;
+  oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1;
+  oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1;
   DataType out_dtype = param->out_dtype;
   if (out_dtype.bits() == 0) {
     out_dtype = data->dtype;
@@ -183,7 +185,9 @@ bool Conv2DTransposeRel(const Array<Type>& types,
     << " But got "<< kernel_layout;
 
   IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
-  const auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW);
+
+  auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW);
+
   // infer weight if the kernel_size and channels are defined
   if (param->kernel_size.defined() && param->channels.defined()) {
     CHECK_EQ(param->kernel_size.size(), 2);
diff --git a/src/relay/op/nn/layout.h b/src/relay/op/nn/layout.h
index b1dc4a71af1c..d9eb59d6e31c 100644
--- a/src/relay/op/nn/layout.h
+++ b/src/relay/op/nn/layout.h
@@ -495,9 +495,7 @@ inline std::vector<IndexExpr> ConvertLayout(
       IndexExpr src_dim_size = src[i];
 
       if (src_minor_pos >= 0) {
-        const int64_t* minor_size = as_const_int(src[src_minor_pos]);
-        CHECK(minor_size == nullptr &&
-              src_factor == minor_size[0])
+        CHECK(is_const_int(src[src_minor_pos], src_factor))
             << "src shape " << Array<IndexExpr>(src)
             << " does not agree with layout "
             << src_layout;
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 2f32b316924a..9dd2491289f2 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -32,9 +32,9 @@ def test_conv2d_infer_type():
 
     # Infer with a different layout
     n, c, h, w = 4, 32, 224, 224
-    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
-    w = relay.var("w")
-    y = relay.nn.conv2d(x, w,
+    x = relay.var("x", relay.TensorType((n//4, c//4, h, w, 4, 4), "int8"))
+    wt = relay.var("w")
+    y = relay.nn.conv2d(x, wt,
                         kernel_size=(3, 3),
                         padding=(1, 1),
                         channels=16,
@@ -47,6 +47,21 @@ def test_conv2d_infer_type():
     assert yy.args[1].checked_type == relay.TensorType(
         (4, 8, 3, 3, 4, 4), "int8")
 
+    # Infer with NHWC
+    n, c, h, w = 4, 32, 224, 224
+    x = relay.var("x", relay.TensorType((n, h, w, c), "int8"))
+    wt = relay.var("w")
+    y = relay.nn.conv2d(x, wt,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=16,
+                        data_layout="NHWC",
+                        out_dtype="int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
+        (n, h, w, 16), "int32")
+
+
 def test_conv2d_transpose_infer_type():
     # symbolic in batch dimension
     n, c, h, w = tvm.var("n"), 10, 10, 12

From 247ea6da260d1116dd0ffcb1e5ec85bc3109f2f8 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Sun, 28 Oct 2018 11:19:07 +0900
Subject: [PATCH 291/529] [OPENCL][RUNTIME] Fix race condition of modules
 (#2018)

---
 apps/benchmark/gpu_imagenet_bench.py    | 52 +++++++++++++++++--------
 src/runtime/opencl/opencl_device_api.cc |  2 +-
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index 80df08128995..17c1fbc435b6 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -2,6 +2,7 @@
 see README.md for the usage and results of this script.
 """
 import argparse
+import threading
 
 import numpy as np
 
@@ -14,6 +15,26 @@
 from util import get_network
 
 
+def benchmark(network, target):
+    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+    # create runtime
+    ctx = tvm.context(str(target), 0)
+    module = runtime.create(graph, lib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**params)
+
+    # evaluate
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
@@ -29,6 +50,7 @@
     parser.add_argument("--target", type=str,
                         choices=['cuda', 'opencl', 'rocm', 'nvptx', 'metal'], default='cuda',
                         help="The tvm compilation target")
+    parser.add_argument("--thread", type=int, default=1, help="The number of threads to be run.")
     args = parser.parse_args()
 
     dtype = 'float32'
@@ -44,20 +66,16 @@
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
-        net, params, input_shape, output_shape = get_network(network, batch_size=1)
-
-        with nnvm.compiler.build_config(opt_level=3):
-            graph, lib, params = nnvm.compiler.build(
-                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
-
-        # create runtime
-        ctx = tvm.context(str(target), 0)
-        module = runtime.create(graph, lib, ctx)
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**params)
-
-        # evaluate
-        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
-        prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-        print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+        if args.thread == 1:
+            benchmark(network, target)
+        else:
+            threads = list()
+            for n in range(args.thread):
+                thread = threading.Thread(target=benchmark, args=([network, target]), name="thread%d" % n)
+                threads.append(thread)
+
+            for thread in threads:
+                thread.start()
+
+            for thread in threads:
+                thread.join()
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 84c9918530f5..d5177fd9525a 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -232,7 +232,6 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
   if (initialized_) return;
   std::lock_guard<std::mutex> lock(this->mu);
   if (initialized_) return;
-  initialized_ = true;
   if (context != nullptr) return;
   // matched platforms
   std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
@@ -271,6 +270,7 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
         clCreateCommandQueue(this->context, did, 0, &err_code));
     OPENCL_CHECK_ERROR(err_code);
   }
+  initialized_ = true;
 }
 
 TVM_REGISTER_GLOBAL("device_api.opencl")

From d061fd4a2f20060bbc986392a7c0a1e1a59741ce Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 27 Oct 2018 21:03:08 -0700
Subject: [PATCH 292/529] [DOCKER] temporary revert cuda version to cuda8
 (#2021)

---
 docker/Dockerfile.ci_gpu | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 7b97a54185f4..708331d3d61a 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -1,5 +1,5 @@
 # CI docker GPU env
-FROM nvidia/cuda:9.0-cudnn7-devel
+FROM nvidia/cuda:8.0-cudnn7-devel
 
 # Base scripts
 RUN apt-get update --fix-missing
@@ -62,9 +62,6 @@ RUN pip3 install Pillow
 COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
 RUN bash /install/ubuntu_install_vulkan.sh
 
-COPY install/ubuntu_install_tensorflow.sh /install/ubuntu_install_tensorflow.sh
-RUN bash /install/ubuntu_install_tensorflow.sh
-
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh

From af960777f98b8b1c5c1f61c427f08df9b94df54d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Sat, 27 Oct 2018 21:10:59 -0700
Subject: [PATCH 293/529] save (#2015)

---
 include/tvm/relay/expr.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 566acf96e2e9..50e5dfa8d89b 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -134,6 +134,7 @@ class VarNode : public ExprNode {
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("name_hint", &name_hint);
     v->Visit("type_annotation", &type_annotation);
+    v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
   }
 
@@ -161,6 +162,7 @@ class GlobalVarNode : public ExprNode {
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("name_hint", &name_hint);
+    v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
   }
 
@@ -377,6 +379,7 @@ class TupleGetItemNode : public ExprNode {
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("tuple_value", &tuple);
     v->Visit("index", &index);
+    v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
   }
 

From d331f1f839a58994b5f10bdc6971037548cb19aa Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Sun, 28 Oct 2018 09:31:14 -0700
Subject: [PATCH 294/529] [TF] ignore Truncate in cast (#2022)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index ad7c4fc6796f..9cd07cca3cc6 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -306,7 +306,8 @@ def _cast():
     def _impl(inputs, attr, params):
         # Convert from tensorflow Dtype to str
         attr['DstT'] = attr['DstT'].name
-        return AttrCvt(op_name='cast', transforms={'DstT': 'dtype'}, ignores=['SrcT'])(inputs, attr)
+        return AttrCvt(op_name='cast', transforms={'DstT': 'dtype'},
+                       ignores=['SrcT', 'Truncate'])(inputs, attr)
     return _impl
 
 def _expand_dims():

From 9b0ec3440770eb46df7da9e2afdd21d84e3d9e36 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 28 Oct 2018 22:13:06 +0530
Subject: [PATCH 295/529] [DOCKER][GOLANG] fix golang version. (#2023)

---
 docker/README.md                        |  2 +-
 docker/install/ubuntu_install_golang.sh |  4 ++--
 tests/scripts/task_golang.sh            | 11 +++++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100755 tests/scripts/task_golang.sh

diff --git a/docker/README.md b/docker/README.md
index 213c84cc6e5e..df9ea42af68a 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -89,5 +89,5 @@ Here are some common use examples to perform CI tasks.
 - build golang test suite.
 
   ```bash
-  ./docker/build.sh ci_cpu make -C golang tests
+  ./docker/build.sh ci_cpu tests/scripts/task_golang.sh
   ```
diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh
index e15a456bc15a..2361ccfbd2e4 100644
--- a/docker/install/ubuntu_install_golang.sh
+++ b/docker/install/ubuntu_install_golang.sh
@@ -1,4 +1,4 @@
 #install the necessary dependancies for golang build
-apt-get update && apt-get install -y golang-0.10-go
-apt-get update && apt-get install -y godoc
+apt-get update && apt-get install -y golang-1.10-go
+apt-get update && apt-get install -y golang-1.10-doc
 apt-get update && apt-get install -y golint
diff --git a/tests/scripts/task_golang.sh b/tests/scripts/task_golang.sh
new file mode 100755
index 000000000000..363ee05bcbec
--- /dev/null
+++ b/tests/scripts/task_golang.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -e
+
+export LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH
+
+tvm_root="$(git rev-parse --show-toplevel)"
+export PYTHONPATH="$tvm_root/python":"$tvm_root/nnvm/python":"$tvm_root/topi/python"
+
+# Golang tests
+make -C golang tests

From 89497162e84cd8f40d0eb5bba9380e157919109c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 28 Oct 2018 15:59:26 -0700
Subject: [PATCH 296/529] [RELAY][PASS] FoldScaleAxis Forward (#2020)

* [RELAY][PASS] FoldScaleAxis Forward

* Introduce helper function type_as

* Update per review comment

* Fix according to comments
---
 include/tvm/relay/attrs/transform.h           |  13 +-
 include/tvm/relay/expr.h                      |  26 +
 include/tvm/relay/expr_functor.h              |  14 +-
 include/tvm/relay/op.h                        |  43 +-
 python/tvm/relay/ir_pass.py                   |  20 +
 python/tvm/relay/op/transform.py              |  18 +-
 src/relay/ir/alpha_equal.cc                   |  20 +-
 src/relay/ir/expr_functor.cc                  |   4 +-
 src/relay/op/tensor/transform.cc              |  14 +-
 src/relay/pass/fold_scale_axis.cc             | 554 ++++++++++++++++++
 src/relay/pass/pattern_util.h                 | 123 ++++
 src/relay/pass/type_infer.cc                  |  58 +-
 tests/python/relay/test_op_level3.py          |   8 +-
 .../python/relay/test_pass_fold_scale_axis.py | 153 +++++
 14 files changed, 1001 insertions(+), 67 deletions(-)
 create mode 100644 src/relay/pass/fold_scale_axis.cc
 create mode 100644 src/relay/pass/pattern_util.h
 create mode 100644 tests/python/relay/test_pass_fold_scale_axis.py

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index dfad1013701f..cb87d358e966 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -94,15 +94,16 @@ struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> {
 
 /*! \brief Attributes used in squeeze operators */
 struct SqueezeAttrs : public tvm::AttrsNode<SqueezeAttrs> {
-  Array<IndexExpr> axes;
+  // use axis to make the name numpy compatible.
+  Array<Integer> axis;
 
   TVM_DECLARE_ATTRS(SqueezeAttrs, "relay.attrs.SqueezeAttrs") {
-    TVM_ATTR_FIELD(axes)
-        .describe("The axes to squeeze in the input tensor."
-                  "If `axes = []`, all axis of dimension 1 get squeezed;"
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis to squeeze in the input tensor."
+                  "If `axis = None`, all axis of dimension 1 get squeezed;"
                   "Else, the dimension in axes get squeezed."
-                  "It is an error if an axes does not has dimension 1.")
-        .set_default(Array<IndexExpr>({}));
+                  "It is an error if an axis does not has dimension 1.")
+        .set_default(NullValue<Array<Integer> >());
   }
 };  // struct SqueezeAttrs
 
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 50e5dfa8d89b..2e3bbadb7841 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -40,6 +40,18 @@ class ExprNode : public RelayNode {
                                       "field for this node";
     return this->checked_type_;
   }
+  /*!
+   * \brief Check if the inferred(checked) type of the Expr
+   *  is backed by a TTypeNode and return it.
+   *
+   * \note This function will thrown an error if the node type
+   *       of this Expr is not TTypeNode.
+   *
+   * \return The corresponding TTypeNode pointer.
+   * \tparam The specific TypeNode we look for.
+   */
+  template<typename TTypeNode>
+  inline const TTypeNode* type_as() const;
 
   static constexpr const char* _type_key = "relay.Expr";
   TVM_DECLARE_BASE_NODE_INFO(ExprNode, RelayNode);
@@ -391,6 +403,20 @@ class TupleGetItemNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(TupleGetItem, TupleGetItemNode, Expr);
 
+// implementataions
+template<typename TTypeNode>
+inline const TTypeNode* ExprNode::type_as() const {
+  static_assert(std::is_base_of<TypeNode, TTypeNode>::value,
+                "TType must be a special case of type");
+  CHECK(checked_type_.defined())
+      << "Type inference for this Expr has not completed";
+  const TTypeNode* node = checked_type_.as<TTypeNode>();
+  CHECK(node != nullptr)
+      << "Expected type to be " << TTypeNode::_type_key
+      << ", but get " << checked_type_->type_key();
+  return node;
+}
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index c0256cf3a1c3..bf4025f79224 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -150,7 +150,14 @@ class ExprVisitor
 class ExprMutator
     : public ::tvm::relay::ExprFunctor<Expr(const Expr&)> {
  public:
-  Expr Mutate(const Expr& expr);
+  /*!
+   * \brief Mutate is alias for VisitExpr
+   * \return expr.
+   */
+  Expr Mutate(const Expr& expr) {
+    return this->VisitExpr(expr);
+  }
+  Expr VisitExpr(const Expr& expr) override;
   Expr VisitExpr_(const VarNode* op) override;
   Expr VisitExpr_(const ConstantNode* op) override;
   Expr VisitExpr_(const GlobalVarNode* op) override;
@@ -161,7 +168,8 @@ class ExprMutator
   Expr VisitExpr_(const LetNode* op) override;
   Expr VisitExpr_(const IfNode* op) override;
   Expr VisitExpr_(const TupleGetItemNode* op) override;
-  /*! \brief Used to visit the types inside of expressions.
+  /*!
+   * \brief Used to visit the types inside of expressions.
    *
    * Can be overloaded to transform the types in arbitrary
    * ways, one way would be to define a sub-class of type
@@ -169,7 +177,7 @@ class ExprMutator
    */
   virtual Type VisitType(const Type& t);
 
- private:
+ protected:
   /*! \brief Internal map used for memoization. */
   std::unordered_map<Expr, Expr, NodeHash, NodeEqual> memo_;
 };
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index 9f28fbebccfc..ad447ad13cee 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -74,6 +74,17 @@ class OpNode : public relay::ExprNode {
     v->Visit("support_level", &support_level);
   }
 
+  /*!
+   * \brief Check that if current op is a "primtive operator".
+   * That is the arguments are all type variables, and there is a single
+   * type relation applied to the input and output types.
+   */
+  bool IsPrimitiveOp() const {
+    if (is_primitive_ != -1) return is_primitive_ != 0;
+    is_primitive_ = this->IsPrimitiveOp_() ? 1 : 0;
+    return is_primitive_ != 0;
+  }
+
   static constexpr const char* _type_key = "relay.Op";
   TVM_DECLARE_NODE_TYPE_INFO(OpNode, ExprNode);
 
@@ -81,9 +92,24 @@ class OpNode : public relay::ExprNode {
   // friend class
   friend class GenericOpMap;
   friend class OpRegistry;
+  friend bool IsPrimitiveOp(const Expr&);
   // Program internal unique index of operator.
   // Used to help index the program.
   uint32_t index_{0};
+  // whether this is a primitive op. -1 means unknown.
+  mutable int is_primitive_{-1};
+  // Internal function to compute if it is primitive op
+  bool IsPrimitiveOp_() const {
+    const auto& fn_ty = this->op_type;
+    if (fn_ty->type_constraints.size() != 1) return false;
+    const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
+    if (rel == nullptr) return false;
+    // validate if the type parameter matches up
+    for (size_t i = 0; i < fn_ty->type_params.size(); ++i) {
+      if (!fn_ty->type_params[i].same_as(rel->args[i])) return false;
+    }
+    return true;
+  }
 };
 
 /*!
@@ -497,22 +523,7 @@ inline ValueType OpMap<ValueType>::get(const Op& op,
  */
 inline bool IsPrimitiveOp(const Expr& expr) {
   const auto* op = expr.as<OpNode>();
-
-  if (!op) {
-    return false;
-  }
-
-  const auto& fn_ty = op->op_type;
-  if (fn_ty->type_constraints.size() != 1) return false;
-
-  const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
-  if (rel == nullptr) return false;
-  // validate if the type parameter matches up
-  for (size_t i = 0; i < fn_ty->type_params.size(); ++i) {
-    if (!fn_ty->type_params[i].same_as(rel->args[i])) return false;
-  }
-
-  return true;
+  return op != nullptr && op->IsPrimitiveOp();
 }
 
 }  // namespace relay
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index f930751c41a7..6adfaacdc86d 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -10,6 +10,7 @@
 from .expr import Expr
 from .ty import Type
 
+
 def infer_type(expr, env=None):
     """Infer the type of expr under the context of env.
 
@@ -30,6 +31,23 @@ def infer_type(expr, env=None):
     return _ir_pass.infer_type(expr, env)
 
 
+def forward_fold_scale_axis(expr):
+    """Fold the scaling of axis into weights of conv2d/dense.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression, we expect that expr's types
+        should be fully inferred by infer_type.
+
+    Returns
+    -------
+    folded_expr : tvm.relay.Expr
+        The folded expression after transformation.
+    """
+    return _ir_pass.forward_fold_scale_axis(expr)
+
+
 def well_formed(expr):
     """Check that each Var is only bound once (well formed).
 
@@ -149,6 +167,7 @@ def alpha_equal(lhs, rhs):
     """
     return bool(_make._alpha_equal(lhs, rhs))
 
+
 def graph_equal(lhs, rhs):
     """Compare two Relay expr for data-flow equivalence.
     The difference between this and alpha-equality is that
@@ -170,6 +189,7 @@ def graph_equal(lhs, rhs):
     """
     return bool(_make._graph_equal(lhs, rhs))
 
+
 def structural_hash(value):
     """Hash a Relay expression structurally.
 
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 9d14463a530c..909b175f08ca 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -49,27 +49,25 @@ def transpose(data, axes=None):
     return _make.transpose(data, list(axes))
 
 
-def squeeze(data, axes=None):
+def squeeze(data, axis=None):
     """Squeeze axes in the array.
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.relay.Expr
         The input data to the operator.
 
-    axes : None or List[int]
-        Axes to remove.
-        If axes = [] or = None, remove all axis of dimensions 1.
-        Otherwise, remove all axis in axes.
-        If any axis in axes has dimension that does not equal 1, it is an error.
+    axis : None or List[int]
+        The set of axes to remove.
+        If axis = None, remove all axis of dimensions 1.
+        If any specified axis has dimension that does not equal 1, it is an error.
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.relay.Expr
         The squeezed result.
     """
-    axes = axes or []
-    return _make.squeeze(data, list(axes))
+    return _make.squeeze(data, axis)
 
 
 def reshape(data, newshape):
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index 7aab9bb3223b..8409581b53bf 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -296,13 +296,23 @@ class AlphaEqualHandler:
     if (const CallNode* rhs = other.as<CallNode>()) {
       if (!ExprEqual(lhs->op, rhs->op)) return false;
       if (lhs->args.size() != rhs->args.size()) return false;
-      if (lhs->type_args.size() != rhs->type_args.size()) return false;
-
+      // skip type_args check for primitive ops.
+      bool is_primitive = IsPrimitiveOp(lhs->op);
+      if (!is_primitive) {
+        if (lhs->type_args.size() != rhs->type_args.size()) {
+          return false;
+        }
+      }
       for (size_t i = 0; i < lhs->args.size(); ++i) {
-        if (!ExprEqual(lhs->args[i], rhs->args[i])) return false;
+        if (!ExprEqual(lhs->args[i], rhs->args[i])) {
+          return false;
+        }
       }
-      for (size_t i = 0; i < lhs->type_args.size(); ++i) {
-        if (!TypeEqual(lhs->type_args[i], rhs->type_args[i])) return false;
+
+      if (!is_primitive) {
+        for (size_t i = 0; i < lhs->type_args.size(); ++i) {
+          if (!TypeEqual(lhs->type_args[i], rhs->type_args[i])) return false;
+        }
       }
       return AttrEqual(lhs->attrs, rhs->attrs);
     } else {
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 557daa98e899..b7a752d43a5c 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -12,12 +12,12 @@
 namespace tvm {
 namespace relay {
 
-Expr ExprMutator::Mutate(const Expr& expr) {
+Expr ExprMutator::VisitExpr(const Expr& expr) {
   auto it = this->memo_.find(expr);
   if (it != this->memo_.end()) {
     return it->second;
   } else {
-    Expr new_expr = ExprMutator::VisitExpr(expr);
+    Expr new_expr = ExprFunctor::VisitExpr(expr);
     memo_[expr] = new_expr;
     return new_expr;
   }
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 5faa0805426a..635f04668f33 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -761,9 +761,9 @@ Examples::
 TVM_REGISTER_NODE_TYPE(SqueezeAttrs);
 
 Expr MakeSqueeze(Expr data,
-                 Array<IndexExpr> axes) {
+                 Array<Integer> axis) {
   auto attrs = make_node<SqueezeAttrs>();
-  attrs->axes = std::move(axes);
+  attrs->axis = std::move(axis);
   static const Op& op = Op::Get("squeeze");
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }
@@ -785,8 +785,8 @@ bool SqueezeRel(const Array<Type>& types,
   const auto* param = attrs.as<SqueezeAttrs>();
   CHECK(param != nullptr);
   std::vector<IndexExpr> result_shape;
-  // if axes is empty, squeeze all axes of dimension 1
-  if (param->axes.size() == 0) {
+  // if axes is None, squeeze all axes of dimension 1
+  if (!param->axis.defined()) {
     for (const auto& e : data->shape) {
       const int64_t* axis_ptr = as_const_int(e);
       CHECK(axis_ptr != nullptr) << "the axes attribute must be concrete";
@@ -800,10 +800,8 @@ bool SqueezeRel(const Array<Type>& types,
     for (const auto& e : data->shape) {
       original_shape.push_back(std::pair<IndexExpr, bool>(e, true));
     }
-    for (const auto& e : param->axes) {
-      const int64_t* axis_ptr = as_const_int(e);
-      CHECK(axis_ptr != nullptr);
-      original_shape.at(*axis_ptr).second = false;
+    for (const auto& e : param->axis) {
+      original_shape.at(e->value).second = false;
     }
     for (const auto p : original_shape) {
       if (p.second) {
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
new file mode 100644
index 000000000000..b1c767704372
--- /dev/null
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -0,0 +1,554 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file fold_scale_axis.cc
+ *
+ * \brief Fold axis scaling into weights of
+ *  conv/dense operators.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr_functor.h>
+#include "pattern_util.h"
+#include "../op/nn/layout.h"
+
+namespace tvm {
+namespace relay {
+/*!
+ * \brief namespace of fold scale axis
+ *
+ * Use namespace to reduce potential naming conflict.
+ */
+namespace fold_scale_axis {
+
+using runtime::TypedPackedFunc;
+
+
+// FoldScaleAxisFoward algorithm:
+//
+// The general idea is that we transform Expr to tuple of
+// (value, axes, scale), where the final result satiesfies:
+//
+// result = value
+// for i, k in enumerate(axes):
+//    k-ith dimension of result *= i-th dimension of scale
+//
+// Then we can propagate this signal along and fold the scale if necessary.
+// However, it is possible that certain scale may never be consumed
+// if there is no dense/conv2d that follows multiplication.
+//
+// In order to make sure all the scale we sent out can be consumed eventually,
+// we run a backward "preparation phase", which propagates the demand
+// of the potential axes scaling back to its input.
+//
+// The folding process is done in two steps:
+// - Prepare phase: backward propagation of demand.
+// - Transform phase: forward transformation,
+
+/*!
+ * \brief sorted array axis, can also be nullptr.
+ *
+ *  nullptr means no scaling request can be done.
+ */
+using AxesSet = Array<Integer>;
+
+/*!
+ * \brief Merge two axis set together by taking
+ *  intersection.
+ *
+ * \note The axes in a AxesSet should be sorted.
+ *
+ * \param lhs The left axis.
+ * \param rhs The right axis.
+ * \return The result of the inersection.
+ */
+AxesSet Intersect(const AxesSet& lhs, const AxesSet& rhs) {
+  if (!lhs.defined()) return lhs;
+  if (!rhs.defined()) return rhs;
+  // This code relies on axes in a AxesSet to be sorted.
+  AxesSet ret;
+  size_t i = 0, j = 0;
+  while (i < lhs.size() && j < rhs.size()) {
+    if (lhs[i]->value < rhs[j]->value) {
+      ++i;
+    } else if (lhs[i]->value > rhs[j]->value) {
+      ++j;
+    } else {
+      ret.push_back(lhs[i]);
+      ++i; ++j;
+    }
+  }
+  return ret;
+}
+
+/*!
+ * \param Get function from op_map.
+ * \param op_map The OpMap.
+ * \param op The operator being called.
+ * \tparam ValueType the content value type.
+ * \return The result value map.
+ */
+template<typename ValueType>
+ValueType GetFunc(const OpMap<ValueType>& op_map,
+                  const Expr& op) {
+  if (const OpNode* opnode = op.as<OpNode>()) {
+    return op_map.get(GetRef<Op>(opnode), ValueType());
+  } else {
+    return ValueType();
+  }
+}
+
+/*!
+ * \brief Preparation function for for pass scale forward.
+ * \param call The call node.
+ * \param out_scale_axes Possible scaling on axes of the output.
+ * \return The result scaling on axes of the input.
+ */
+using FForwardPrep = runtime::TypedPackedFunc<
+  Array<AxesSet> (const Call& call, const AxesSet& out_scale_axes)>;
+
+/*! \brief Axis scale tuple.  */
+class STupleNode : public Node {
+ public:
+  /*! \brief The value */
+  Expr value;
+  /*! \brief The axes to scale, can be nullptr(means no-scaling) */
+  AxesSet axes = NullValue<AxesSet>();
+  /*! \brief The scaling factor */
+  Expr scale = NullValue<Expr>();
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("value", &value);
+    v->Visit("axes", &axes);
+    v->Visit("scale", &scale);
+  }
+
+  static constexpr const char* _type_key = "relay.fold_scale_axis.STupleNode";
+  TVM_DECLARE_NODE_TYPE_INFO(STupleNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(STuple, STupleNode, NodeRef);
+
+/*!
+ * \brief The transform function, transform an old call to
+ *  a new one given the new args.
+ * \param ref_call Reference call node that represent the op and the types.
+ * \param expected_out_axes The scale axes allowed in the output.
+ * \param sargs The input arguments.
+ */
+using FForwardTransform = TypedPackedFunc<
+  STuple(const Call& ref_call,
+         const AxesSet& expected_out_axes,
+         const Array<STuple>& sargs)>;
+
+//----------------------------------------------
+// Generic Visitors for FScaleAxisForward
+//----------------------------------------------
+class FScaleAxisForwardPrep : private ExprVisitor {
+ public:
+  std::unordered_map<const Node*, AxesSet>
+  Prepare(const Expr& body) {
+    this->Update(body, NullValue<AxesSet>());
+    this->VisitExpr(body);
+    // flist is added in the Post-DFS order
+    // which is a special case of topological order.
+    // We reversely traverse the list to invoke the lazy functions.
+    // This act like a backprop of valid scale axis messages
+    for (auto it = flist_.rbegin(); it != flist_.rend(); ++it) {
+      (*it)();
+    }
+    // return the created message;
+    return std::move(message_);
+  }
+
+ private:
+  // The invoke list
+  std::vector<std::function<void()> > flist_;
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet> message_;
+  // Update the message stored at node.
+  void Update(const Expr& node, const AxesSet& axes) {
+    // We run intersection of messages:
+    //
+    // %y = multiply(%x, %scale)
+    // %z1 = conv2d(%y, %w)
+    // %z2 = exp(%y)
+    //
+    // Consider the above code example,
+    // because %z2 will propagate null to %y,
+    // the AxesSet on %y is also null,
+    // and the forward folding won't be triggered.
+    const Node* key = node.get();
+    if (message_.count(key)) {
+      message_[key] = Intersect(message_[key], axes);
+    } else {
+      message_[key] = axes;
+    }
+  }
+  // Visitor pattern override.
+  void VisitExpr_(const LetNode* call) {
+    LOG(FATAL) << "FoldScaleAxis only accept dataflow-form";
+  }
+
+  void VisitExpr_(const FunctionNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    auto flazy = [this, op] {
+      this->Update(op->body, NullValue<AxesSet>());
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const CallNode* call) {
+    ExprVisitor::VisitExpr_(call);
+    // function to be lazily invoked
+    auto flazy = [this, call]() {
+      static const auto& fprep =
+        Op::GetAttr<FForwardPrep>("FScaleAxisForwardPrep");
+      // find the message send to this node.
+      auto it = message_.find(call);
+      AxesSet out_axes;
+      if (it != message_.end()) {
+        out_axes = it->second;
+      } else {
+        out_axes = NullValue<AxesSet>();
+      }
+      // pass the message back to all the children it references.
+      auto f = GetFunc(fprep, call->op);
+      if (f != nullptr) {
+        Array<AxesSet> in_axes = f(GetRef<Call>(call), out_axes);
+        CHECK_EQ(in_axes.size(), call->args.size());
+        for (size_t i = 0; i < call->args.size(); ++i) {
+          this->Update(call->args[i], in_axes[i]);
+        }
+      } else {
+        for (size_t i = 0; i < call->args.size(); ++i) {
+          this->Update(call->args[i], NullValue<AxesSet>());
+        }
+      }
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const TupleNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    // do not support pass scale through tuple for now.
+    auto flazy = [this, op]() {
+      for (const Expr& field : op->fields) {
+        this->Update(field, NullValue<AxesSet>());
+      }
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const IfNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    // do pass through condition
+    // by assigning NullValue<AxesSet>
+    // it means fuse signal cannot pass
+    // through into these subexpressions.
+    auto flazy = [this, op]() {
+      this->Update(op->cond, NullValue<AxesSet>());
+      this->Update(op->true_branch, NullValue<AxesSet>());
+      this->Update(op->false_branch, NullValue<AxesSet>());
+    };
+    flist_.push_back(flazy);
+  }
+};
+
+class FScaleAxisForwardTransform : private ExprMutator {
+ public:
+  // Transform expression.
+  Expr Transform(Expr expr) {
+    expected_scale_axes_ =
+        FScaleAxisForwardPrep().Prepare(expr);
+    return this->Mutate(expr);
+  }
+
+ private:
+  // Valid axes on each node.
+  std::unordered_map<const Node*, AxesSet> expected_scale_axes_;
+  std::unordered_map<const Node*, STuple> scale_memo_;
+  // If user simply call mutate,
+  // then only Expr is returned and we cannot
+  // accept outstanding scales.
+  Expr VisitExpr(const Expr& expr) final {
+    Expr res = ExprMutator::VisitExpr(expr);
+    CHECK(!scale_memo_.count(expr.get()))
+        << "Outstanding scale";
+    return res;
+  }
+
+  STuple GetSTuple(const Expr& expr) {
+    Expr res = ExprMutator::VisitExpr(expr);
+    auto it = scale_memo_.find(expr.get());
+    if (it != scale_memo_.end()) {
+      CHECK(it->second->value.same_as(res));
+      return it->second;
+    } else {
+      auto node = make_node<STupleNode>();
+      node->value = res;
+      return STuple(node);
+    }
+  }
+
+  Expr VisitExpr_(const CallNode* call_node) final {
+    static const auto& ftransform =
+        Op::GetAttr<FForwardTransform>("FScaleAxisForwardTransform");
+    auto new_op = this->Mutate(call_node->op);
+    bool has_scale = false;
+    bool unchanged = call_node->op.same_as(new_op);
+
+    Array<STuple> call_sargs;
+    Array<Expr> call_args;
+    for (auto arg : call_node->args) {
+      STuple new_sarg = this->GetSTuple(arg);
+      unchanged &= new_sarg->value.same_as(arg);
+      if (new_sarg->axes.defined()) has_scale = true;
+      call_sargs.push_back(new_sarg);
+      call_args.push_back(new_sarg->value);
+    }
+
+    // get expected scale axes.
+    AxesSet expected_out_axes;
+    auto axis_it = expected_scale_axes_.find(call_node);
+    if (axis_it != expected_scale_axes_.end()) {
+      expected_out_axes = axis_it->second;
+    }
+    // propagation function
+    auto f = GetFunc(ftransform, call_node->op);
+    if (f != nullptr) {
+      STuple sret = f(GetRef<Call>(call_node), expected_out_axes, call_sargs);
+      if (sret.defined()) {
+        if (sret->axes.defined()) {
+          scale_memo_[call_node] = sret;
+        }
+        return sret->value;
+      }
+    }
+    // normal path
+    CHECK(!has_scale) << "Outstanding scale, on op=" << call_node->op;
+    if (unchanged) {
+      return GetRef<Expr>(call_node);
+    } else {
+      return CallNode::make(
+          new_op, call_args, call_node->attrs, call_node->type_args);
+    }
+  }
+};
+
+//----------------------------------------------
+// Per operator defs for FScaleAxisForward
+//----------------------------------------------
+
+// Intermediate operators
+Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
+  return {out};
+}
+
+STuple ReluForwardTransform(const Call& ref_call,
+                              const AxesSet& expected_axes,
+                              const Array<STuple>& sargs) {
+  if (!sargs[0]->axes.defined()) return STuple();
+  // return transformed conv2d
+  auto rnode = make_node<STupleNode>();
+  rnode->value = CallNode::make(
+      ref_call->op, {sargs[0]->value}, ref_call->attrs, {});
+  rnode->scale = sargs[0]->scale;
+  rnode->axes = sargs[0]->axes;
+  return STuple(rnode);
+}
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", ReluForwardTransform);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", ReluForwardTransform);
+
+// AddSub
+Array<AxesSet> AddSubForwardPrep(const Call& call, AxesSet out_axes) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+
+  auto none = NullValue<AxesSet>();
+  if (MatchBroadcastToLeftAxes(tlhs, trhs, out_axes)) {
+    return {out_axes, none};
+  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, out_axes)) {
+    return {none, out_axes};
+  } else {
+    return {none, none};
+  }
+}
+
+STuple AddSubForwardTransform(const Call& ref_call,
+                              const AxesSet& expected_out_axes,
+                              const Array<STuple>& sargs) {
+  if (!sargs[0]->axes.defined() && !sargs[1]->axes.defined()) {
+    return STuple();
+  }
+  const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
+
+  auto rnode = make_node<STupleNode>();
+  if (sargs[0]->axes.defined()) {
+    CHECK(!sargs[1]->axes.defined());
+    CHECK(MatchBroadcastToLeftAxes(tlhs, trhs, sargs[0]->axes));
+    Expr scale = ExpandBiasToMatchAxis(
+        sargs[0]->scale, tlhs->shape.size(), sargs[0]->axes);
+    Expr rhs = Divide(sargs[1]->value, scale);
+    rnode->value = CallNode::make(ref_call->op, {sargs[0]->value, rhs},
+                                  ref_call->attrs, ref_call->type_args);
+    rnode->scale = sargs[0]->scale;
+    rnode->axes = sargs[0]->axes;
+  } else {
+    CHECK(sargs[1]->axes.defined());
+    CHECK(sargs[0]->axes.defined());
+    CHECK(MatchBroadcastToLeftAxes(trhs, tlhs, sargs[1]->axes));
+    Expr scale = ExpandBiasToMatchAxis(
+        sargs[1]->scale, trhs->shape.size(), sargs[1]->axes);
+    Expr lhs = Divide(sargs[0]->value, scale);
+    rnode->value = CallNode::make(ref_call->op, {lhs, sargs[1]->value},
+                                  ref_call->attrs, ref_call->type_args);
+    rnode->scale = sargs[1]->scale;
+    rnode->axes = sargs[1]->axes;
+  }
+  return STuple(rnode);
+}
+
+RELAY_REGISTER_OP("add")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
+
+RELAY_REGISTER_OP("add")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", AddSubForwardTransform);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", AddSubForwardTransform);
+
+// Producer operators
+// Multiply produces the scale-axis pair.
+STuple MultiplyForwardTransform(const Call& ref_call,
+                                const AxesSet& expected_out_axes,
+                                const Array<STuple>& sargs) {
+  if (!expected_out_axes.defined()) return STuple();
+  // TODO(tvm-team) allow same axes accumulation
+  // not as important because it is less common in nn.
+  CHECK(!sargs[0]->axes.defined());
+  CHECK(!sargs[1]->axes.defined());
+  const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
+
+  Expr lhs = sargs[0]->value;
+  Expr rhs = sargs[1]->value;
+  auto rnode = make_node<STupleNode>();
+  if (MatchBroadcastToLeftAxes(tlhs, trhs, expected_out_axes, &rhs)) {
+    rnode->value = lhs;
+    rnode->scale = rhs;
+    rnode->axes = expected_out_axes;
+  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, expected_out_axes, &lhs)) {
+    rnode->value = rhs;
+    rnode->scale = lhs;
+    rnode->axes = expected_out_axes;
+  }
+  return STuple(rnode);
+}
+
+RELAY_REGISTER_OP("multiply")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", MultiplyForwardTransform);
+
+// Consumer operators
+// Conv2D send out requirement of axis folding.
+Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
+  // TODO(tvm-team) support general data layout
+  // by transforming weight
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout data_layout(param->data_layout);
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = data_layout.indexof('C');
+  int c_small_axis = data_layout.indexof('c');
+  const auto* tdata = call->args[0]->type_as<TensorTypeNode>();
+  CHECK(tdata) << "require checked type";
+
+  CHECK_GE(c_big_axis, 0);
+  AxesSet data_axes = NullValue<AxesSet>();
+  // For now, we only support simple pattern (no folded weight/data)
+  // More general layout can be supported under the current framework.
+  // By using a unified layout transformation.
+  // We only need to change the Prep and Mutate function.
+  //
+  // only handle depthwise or full conv2d.
+  // TODO(tvm-team) handle grouped conv by reshape + bcast
+  bool is_depthwise_conv2d =
+      is_const_int(tdata->shape[c_big_axis], param->groups);
+  if (weight_layout.indexof('i') < 0 &&
+      c_small_axis < 0 &&
+      (param->groups == 1 || is_depthwise_conv2d)) {
+    data_axes = {c_big_axis};
+  }
+  return {data_axes, NullValue<AxesSet>()};
+}
+
+// Conv2D consumes the scale axis during transformation.
+STuple Conv2DForwardTransform(const Call& ref_call,
+                              const AxesSet& expected_axes,
+                              const Array<STuple>& sargs) {
+  // if data do not have scale, normal transform path.
+  STuple sdata = sargs[0];
+  if (!sdata->scale.defined()) return STuple();
+  CHECK(sdata->axes.defined());
+  const auto* param = ref_call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout data_layout(param->data_layout);
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = data_layout.indexof('C');
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // TODO(tvm-team) support general data layout
+  CHECK_EQ(weight_layout.indexof('i'), -1);
+  CHECK(sdata->axes.size() == 1 &&
+        c_big_axis == sdata->axes[0]->value);
+  int big_ic_axis = weight_layout.indexof('I');
+
+  const auto* tdata = ref_call->args[0]->type_as<TensorTypeNode>();
+  // Check it must be depthwise or full conv2d.
+  bool is_depthwise_conv2d =
+      is_const_int(tdata->shape[c_big_axis], param->groups);
+  CHECK(param->groups == 1 || is_depthwise_conv2d);
+
+  // match the ic_axis
+  Expr scale = ExpandBiasToMatchAxis(
+      sdata->scale, weight_layout.ndim(), {big_ic_axis});
+  Expr weight = Multiply(sargs[1]->value, scale);
+  // return transformed conv2d
+  auto rnode = make_node<STupleNode>();
+  rnode->value = CallNode::make(
+      ref_call->op, {sdata->value, weight}, ref_call->attrs, ref_call->type_args);
+  return STuple(rnode);
+}
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", Conv2DForwardPrep);
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FForwardTransform>("FScaleAxisForwardTransform", Conv2DForwardTransform);
+
+
+Expr ForwardFoldScaleAxis(Expr data) {
+  return FScaleAxisForwardTransform().Transform(data);
+}
+
+// Expose the FoldScaleAxisFoward
+TVM_REGISTER_API("relay._ir_pass.forward_fold_scale_axis")
+.set_body_typed<Expr(Expr)>(ForwardFoldScaleAxis);
+
+}  // namespace fold_scale_axis
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
new file mode 100644
index 000000000000..a395e74cdf0b
--- /dev/null
+++ b/src/relay/pass/pattern_util.h
@@ -0,0 +1,123 @@
+/*!
+ *  Copyright (c) 2018 by Contributors.
+ *
+ * \file tvm/relay/pass/pattern_util.h
+ * \brief Header of internal operator functions
+ *  These can be used for writing passes.
+ */
+#ifndef TVM_RELAY_PASS_PATTERN_UTIL_H_
+#define TVM_RELAY_PASS_PATTERN_UTIL_H_
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/attrs/transform.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Try to match lhs and rhs via broadcasting rule, such that:
+ *
+ * rhs matches the dimension of lhs specified by lhs_axes
+ * rhs's value equals 1 on rest of dimensions.
+ *
+ * \param tlhs The type of left operand (data)
+ * \param trhs The type right operand (bias)
+ * \param lhs_axes The axes on lhs to match.
+ * \param rhs_value A squeezed version of rhs which only contains matched dimension.
+ * \return Whether match is successful.
+ */
+inline bool MatchBroadcastToLeftAxes(const TensorTypeNode* tlhs,
+                                     const TensorTypeNode* trhs,
+                                     const Array<Integer>& lhs_axes,
+                                     Expr* rhs_value = nullptr) {
+  if (tlhs->shape.size() < trhs->shape.size()) return false;
+  AttrsEqual equal;
+  size_t base = tlhs->shape.size() - trhs->shape.size();
+  size_t j = 0;
+
+  NodePtr<SqueezeAttrs> squeeze_attrs;
+  if (rhs_value != nullptr) {
+    squeeze_attrs = make_node<SqueezeAttrs>();
+  }
+
+  for (size_t i = 0; i < tlhs->shape.size(); ++i) {
+    if (j < lhs_axes.size() && i == static_cast<size_t>(lhs_axes[j]->value)) {
+      if (i < base || !equal(tlhs->shape[i], trhs->shape[i - base])) {
+        return false;
+      }
+      ++j;
+    } else if (i >= base) {
+      if (!is_const_int(trhs->shape[i - base], 1)) {
+        return false;
+      }
+      if (rhs_value != nullptr) {
+        squeeze_attrs->axis.push_back(static_cast<int>(i - base));
+      }
+    }
+  }
+  if (rhs_value != nullptr && squeeze_attrs->axis.size() != 0) {
+    static const Op& squeeze_op = Op::Get("squeeze");
+    *rhs_value = CallNode::make(squeeze_op, {rhs_value[0]}, Attrs(squeeze_attrs), {});
+  }
+  return true;
+}
+
+/*!
+ * \brief Expand 1D Tensor to match axis.
+ *
+ * The result bias can be used to add or multiply to
+ * the target Tensor on the specified axis via broadcasting rule.
+ *
+ * \param bias The bias.
+ * \param target_ndim target dimension.
+ * \param axes The axis on the output we want to match on.
+ */
+inline Expr ExpandBiasToMatchAxis(Expr bias,
+                                  int target_ndim,
+                                  const Array<Integer>& axes) {
+  static const Op& expand_dims = Op::Get("expand_dims");
+  for (size_t i = axes.size(); i != 0; --i) {
+    if (i == axes.size()) {
+      int64_t num_pad_axis = target_ndim - axes[i - 1]->value - 1;
+      if (num_pad_axis > 0) {
+        auto attrs = make_node<ExpandDimsAttrs>();
+        attrs->axis = i;
+        attrs->num_newaxis = static_cast<int>(num_pad_axis);
+        bias = CallNode::make(expand_dims, {bias}, Attrs(attrs), {});
+      }
+    } else {
+      int64_t diff = axes[i]->value - axes[i - 1]->value;
+      CHECK_GE(diff, 0L);
+      if (diff > 0) {
+        auto attrs = make_node<ExpandDimsAttrs>();
+        attrs->axis = i;
+        attrs->num_newaxis = static_cast<int>(diff);
+        bias = CallNode::make(expand_dims, {bias}, Attrs(attrs), {});
+      }
+    }
+  }
+  return bias;
+}
+
+inline Expr Multiply(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("multiply");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+inline Expr Divide(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("divide");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
+inline Expr ReshapeLike(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("reshape_like");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 7c8eeef92c5d..c1f6cdc63974 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -406,28 +406,57 @@ class TypeInferencer::Resolver : public ExprMutator {
     CHECK(checked_type.as<IncompleteTypeNode>() == nullptr)
         << "Cannot resolve type of " << GetRef<Expr>(op)
         << " at " << op->span;
+
     Expr new_e = ExprMutator::VisitExpr_(op);
-    if (!checked_type.same_as(new_e->checked_type_)) {
+    // new_call and new_var's code is only going to be valid for VarNode/CallNode.
+    // Compiler optimization will likely fold these away for other nodes.
+    CallNode* new_call =(
+        std::is_base_of<CallNode, T>::value ?
+        static_cast<CallNode*>(new_e.node_.get()) : nullptr);
+    VarNode* new_var =(
+        std::is_base_of<VarNode, T>::value ?
+        static_cast<VarNode*>(new_e.node_.get()) : nullptr);
+
+    // check if we need update the new_e
+    bool need_update_type = !checked_type.same_as(new_e->checked_type_);
+    bool need_update_call = (
+        std::is_base_of<CallNode, T>::value &&
+        it->second.type_args.defined() &&
+        !it->second.type_args.same_as(new_call->type_args));
+    bool need_update_var = (
+        std::is_base_of<VarNode, T>::value &&
+        update_missing_type_annotation_ &&
+        !new_var->type_annotation.defined());
+
+    if (!need_update_type && !need_update_var && !need_update_call) return new_e;
+
+    if (!new_e.node_.unique()) {
       // Copy on write optimization
       // If new_e is an old expression,
       // we make a copy mutating an existing reference.
-      if (!new_e.node_.unique()) {
-        new_e = Expr(make_node<T>(*new_e.as<T>()));
-      }
-      new_e->checked_type_ = checked_type;
+      new_e = Expr(make_node<T>(*new_e.as<T>()));
+      new_call = (
+          std::is_base_of<CallNode, T>::value ?
+          static_cast<CallNode*>(new_e.node_.get()) : nullptr);
+      new_var = (
+          std::is_base_of<VarNode, T>::value ?
+          static_cast<VarNode*>(new_e.node_.get()) : nullptr);
     }
 
-    if (it->second.type_args.defined()) {
-      Call call = Downcast<Call>(new_e);
-      const CallNode* const_call_ref = call.operator->();
-      CallNode* call_ref = const_cast<CallNode*>(const_call_ref);
-      call_ref->type_args = it->second.type_args;
+    // attach the information.
+    if (need_update_type) {
+      new_e->checked_type_ = checked_type;
+    }
 
-      for (size_t i = 0; i < call->type_args.size(); i++) {
-        call_ref->type_args.Set(i, solver_->Resolve(call->type_args[i]));
+    if (need_update_call) {
+      new_call->type_args = it->second.type_args;
+      for (size_t i = 0; i < new_call->type_args.size(); i++) {
+        new_call->type_args.Set(i, solver_->Resolve(new_call->type_args[i]));
       }
     }
-
+    if (need_update_var) {
+      new_var->type_annotation = checked_type;
+    }
     return new_e;
   }
 
@@ -438,6 +467,9 @@ class TypeInferencer::Resolver : public ExprMutator {
  private:
   const std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual>& tmap_;
   TypeSolver* solver_;
+  // whether attach the checked type as type_annotation
+  // if original type anntation is missing.
+  bool update_missing_type_annotation_{true};
 };
 
 
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 2ee6f758f100..427ac562fbc7 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -55,8 +55,8 @@ def test_transpose_infer_type():
 def test_squeeze_infer_type():
     n, t, d = 1, 4, 1
     x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.squeeze(x, axes=(2,))
-    assert "axes=" in y.astext()
+    y = relay.squeeze(x, axis=(2,))
+    assert "axis=" in y.astext()
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType(
         (1, 4), "float32")
@@ -64,7 +64,7 @@ def test_squeeze_infer_type():
     n, t, d = 1, 4, 1
     x = relay.var("x", relay.TensorType((n, t, d), "float32"))
     y = relay.squeeze(x)
-    assert "axes=" not in y.astext()
+    assert "axis=" not in y.astext()
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType(
         (4,), "float32")
@@ -74,7 +74,7 @@ def test_squeeze_infer_type():
 def test_squeeze_bad_axes_infer_type():
     n, t, d = 1, 4, 1
     x = relay.var("x", relay.TensorType((n, t, d), "float32"))
-    y = relay.squeeze(x, axes=(1,))
+    y = relay.squeeze(x, axis=(1,))
     yy = relay.ir_pass.infer_type(y)
 
 
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
new file mode 100644
index 000000000000..7ce3b35efe46
--- /dev/null
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -0,0 +1,153 @@
+from tvm import relay
+
+
+def test_fold_fwd_simple():
+    """Simple testcase."""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias, in_scale]
+        in_scale = relay.expand_dims(in_scale, axis=1, num_newaxis=2)
+        in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
+        x = relay.multiply(x, in_scale)
+        x = relay.nn.relu(x)
+        x = relay.add(x, in_bias)
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, in_bias, in_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, in_bias, in_scale]
+        in_scale = relay.expand_dims(in_scale, axis=1, num_newaxis=2)
+        in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(in_scale, axis=[1,2])
+        x = relay.nn.relu(x)
+        in_bias = relay.divide(in_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        x = relay.add(x, in_bias)
+        conv_weight = relay.multiply(
+            conv_weight , relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.var("in_scale", shape=(in_channels,))
+
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, in_bias, in_scale, channels)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 2)
+
+
+def test_fold_fwd_dual_path():
+    """scale axis being consumed by two consumers"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias, in_scale]
+        x = relay.multiply(in_scale, x)
+        x = relay.nn.relu(x)
+        x = relay.subtract(x, in_bias)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWOI",
+                             groups=channels,
+                             padding=(1, 1))
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWOI",
+                             groups=channels,
+                             padding=(1, 1))
+        z = relay.add(y1, y2)
+        return relay.Function(args, z)
+
+    def expected(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias, in_scale]
+        x = relay.nn.relu(x)
+        in_bias = relay.divide(in_bias, in_scale)
+        x = relay.subtract(x, in_bias)
+        y1 = relay.nn.conv2d(x,
+                             relay.multiply(conv_weight, in_scale),
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWOI",
+                             groups=channels,
+                             padding=(1, 1))
+        y2 = relay.nn.conv2d(x,
+                             relay.multiply(conv_weight, in_scale),
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWOI",
+                             groups=channels,
+                             padding=(1, 1))
+        z = relay.add(y1, y2)
+        return relay.Function(args, z)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.var("in_scale", shape=(in_channels,))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_expected = expected(x, weight, in_bias, in_scale, channels)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 3), 3)
+
+
+def test_fold_fwd_fail():
+    """testcase where we canont fold"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        x = relay.multiply(x, in_scale)
+        xx = relay.nn.leaky_relu(x, alpha=0.1)
+        y1 = relay.nn.conv2d(xx, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             padding=(1, 1))
+        z = relay.add(y1, x)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.var("in_scale", shape=(in_channels,))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    check((2, 11, 10, 4), 4)
+
+
+if __name__ == "__main__":
+    test_fold_fwd_simple()
+    test_fold_fwd_dual_path()
+    test_fold_fwd_fail()

From 8c352ab4486c0fa7688439031c3028b1fe58d34e Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Sun, 28 Oct 2018 18:27:59 -0700
Subject: [PATCH 297/529] Add attrs package (#2025)

---
 docker/install/ubuntu_install_python_package.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 505a25d28e3d..bd6e67cc1ed9 100644
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -1,3 +1,3 @@
 # install libraries for python package on ubuntu
-pip2 install nose pylint numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime
-pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime
+pip2 install nose pylint numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs
+pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs

From d79633aa9dbbc6afca4ace93565e1350e726429b Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Mon, 29 Oct 2018 09:37:05 +0800
Subject: [PATCH 298/529] [intrin]support fmod for cuda (#1964)

---
 python/tvm/intrin.py                   | 16 +++++++++++
 src/codegen/intrin_rule_cuda.cc        |  2 ++
 src/codegen/intrin_rule_metal.cc       |  3 ++
 src/codegen/intrin_rule_opencl.cc      |  3 ++
 src/lang/ir_operator.cc                |  6 ++++
 tests/python/integration/test_ewise.py | 40 ++++++++++++++++++++++++++
 6 files changed, 70 insertions(+)

diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py
index 30da873b5dcf..3207b6112b1d 100644
--- a/python/tvm/intrin.py
+++ b/python/tvm/intrin.py
@@ -376,6 +376,22 @@ def popcount(x):
     """
     return call_pure_intrin(x.dtype, "popcount", x)
 
+def fmod(x, y):
+    """Return the remainder of x divided by y with the same sign as x.
+
+    Parameters
+    ----------
+    x : Expr
+        Input argument.
+    y : Expr
+        Input argument.
+
+    Returns
+    -------
+    z : Expr
+        The result.
+    """
+    return call_pure_intrin(x.dtype, "fmod", x, y)
 
 # Intrinsic rule related code
 def register_intrin_rule(target, intrin, f=None, override=False):
diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc
index ee98a54329ab..a6867c7f201c 100644
--- a/src/codegen/intrin_rule_cuda.cc
+++ b/src/codegen/intrin_rule_cuda.cc
@@ -91,6 +91,8 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.popcount")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.tvm_warp_shuffle")
 .set_body(DispatchExtern<CUDAShuffle>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.fmod")
+.set_body(DispatchExtern<CUDAMath>);
 
 }  // namespace intrin
 }  // namespace codegen
diff --git a/src/codegen/intrin_rule_metal.cc b/src/codegen/intrin_rule_metal.cc
index 8b499fb9ea9b..2e65d5537dd2 100644
--- a/src/codegen/intrin_rule_metal.cc
+++ b/src/codegen/intrin_rule_metal.cc
@@ -42,6 +42,9 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.pow")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.popcount")
 .set_body(DispatchExtern<Direct>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.fmod")
+.set_body(DispatchExtern<Direct>);
+
 }  // namespace intrin
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc
index 1cb1aed01102..e4cf11bf6e64 100644
--- a/src/codegen/intrin_rule_opencl.cc
+++ b/src/codegen/intrin_rule_opencl.cc
@@ -42,6 +42,9 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.pow")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.popcount")
 .set_body(DispatchExtern<Direct>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.fmod")
+.set_body(DispatchExtern<Direct>);
+
 // There is no warp shuffle instruction in standard OpenCL
 // When shuffle is used, we assume it is intel's shuffle extension
 struct IntelShuffle {
diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index 275752644be9..9ae2912901be 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -450,4 +450,10 @@ Expr prod(Expr source, Array<IterVar> rdom) {
   return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
 }
 
+Expr fmod(Expr x, Expr y) {
+  BinaryOpMatchTypes(x, y);
+  CHECK(x.type().is_float()) << "fmod only applies to float";
+  return ir::Call::make(x.type(), "fmod", { x, y }, ir::Call::PureIntrinsic);
+}
+
 }  // namespace tvm
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index 0f58c2367576..b3f17b7c1bb1 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -38,6 +38,45 @@ def check_device(device, host="stackvm"):
     check_device("cuda", "llvm")
     check_device("vulkan")
 
+def test_fmod():
+    # graph
+    def run(dtype):
+        n = tvm.var('n')
+        A = tvm.placeholder((n,), name='A', dtype=dtype)
+        B = tvm.placeholder((n,), name='B', dtype=dtype)
+        C = tvm.compute(A.shape, lambda *i: tvm.fmod(A(*i), B(*i)), name='C')
+        s = tvm.create_schedule(C.op)
+        # create iter var and assign them tags.
+        num_thread = 8
+        bx, tx = s[C].split(C.op.axis[0], factor=num_thread)
+
+        def check_device(device):
+            ctx = tvm.context(device, 0)
+            if not ctx.exist:
+                print("skip because %s is not enabled.." % device)
+                return
+            target = tvm.target.create(device)
+            if "cpu" not in target.keys:
+                s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+                s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
+            fmod = tvm.build(s, [A, B, C], device, name="myfmod")
+
+            # launch the kernel.
+            n = 1024
+            a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx)
+            b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx)
+            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+            ftimer = fmod.time_evaluator(fmod.entry_name, ctx, number=1)
+            tcost = ftimer(a, b, c).mean
+            #fmod(a, b, c)
+            np.testing.assert_allclose(
+                c.asnumpy(), np.mod(a.asnumpy(), b.asnumpy()), rtol=1e-5)
+
+        check_device("cuda")
+        check_device("opencl -device=intel_graphics")
+        check_device("metal")
+
+    run("float32")
 
 def test_multiple_cache_write():
     # graph
@@ -245,3 +284,4 @@ def check_device(device):
     test_add()
     test_log_pow_llvm()
     test_popcount()
+    test_fmod()

From d915318c5e15be0b5b79b8e0010d37e9d6df8df2 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Sun, 28 Oct 2018 20:54:47 -0700
Subject: [PATCH 299/529] Do not mutate GlobalVar's checked_type field. (#2026)

---
 src/relay/pass/type_infer.cc          |  2 +-
 tests/python/relay/test_type_infer.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index c1f6cdc63974..c0f1db97b538 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -366,7 +366,7 @@ class TypeInferencer::Resolver : public ExprMutator {
   }
 
   Expr VisitExpr_(const GlobalVarNode* op) final {
-    return AttachCheckedType(op);
+    return GetRef<GlobalVar>(op);
   }
 
   Expr VisitExpr_(const OpNode* op) final {
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 8f92fc0f5192..b1823004022c 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -123,6 +123,16 @@ def f(x) {
     assert relay.ir_pass.infer_type(f).checked_type == relay.FuncType([a], a)
     assert relay.ir_pass.infer_type(fx).checked_type == a
 
+def test_global_var_cow_issue():
+    env = relay.env.Environment({})
+    gv = relay.GlobalVar("foo")
+    x = relay.var('x', shape=[])
+    func = relay.Function([x], relay.Call(gv, [x]), relay.TensorType([], 'float32'))
+    env[gv] = func
+    # They should both point to the same global variable if global variables are
+    # stable across type checking.
+    assert gv == func.body.op
+
 if __name__ == "__main__":
     test_free_expr()
     test_dual_op()
@@ -134,3 +144,4 @@ def f(x) {
     test_free_expr()
     test_type_args()
     test_self_reference()
+    test_global_var_cow_issue()
\ No newline at end of file

From fd87cad72214d088a82adb24f5314397be904867 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Sun, 28 Oct 2018 20:56:58 -0700
Subject: [PATCH 300/529] [Relay] DQN Port (#2009)

---
 python/tvm/relay/testing/__init__.py       |  1 +
 python/tvm/relay/testing/dqn.py            | 71 ++++++++++++++++++++++
 tests/python/relay/test_ir_text_printer.py |  5 ++
 3 files changed, 77 insertions(+)
 create mode 100644 python/tvm/relay/testing/dqn.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 9bfda3c7abc7..2a9b81b84230 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -3,3 +3,4 @@
 
 from . import mlp
 from . import resnet
+from . import dqn
diff --git a/python/tvm/relay/testing/dqn.py b/python/tvm/relay/testing/dqn.py
new file mode 100644
index 000000000000..736894612e19
--- /dev/null
+++ b/python/tvm/relay/testing/dqn.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Net of Nature DQN
+Reference:
+Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"):
+    """get symbol of nature dqn"""
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    conv1 = layers.conv2d(data, kernel_size=(8, 8), strides=(4, 4), padding=(0, 0),
+                          channels=32, name="conv1")
+    relu1 = relay.nn.relu(conv1)
+    conv2 = layers.conv2d(relu1, kernel_size=(4, 4), strides=(2, 2), padding=(0, 0),
+                          channels=64, name="conv2")
+    relu2 = relay.nn.relu(conv2)
+    conv3 = layers.conv2d(relu2, kernel_size=(3, 3), strides=(1, 1), padding=(0, 0),
+                          channels=64, name="conv3")
+    relu3 = relay.nn.relu(conv3)
+    bf1 = relay.nn.batch_flatten(relu3)
+    dense1 = layers.dense_add_bias(bf1, units=512, name="dense1")
+    relu4 = relay.nn.relu(dense1)
+    dense2 = layers.dense_add_bias(relu4, units=num_actions, name="dense2")
+
+    args = relay.ir_pass.free_vars(dense2)
+    return relay.Function(args, dense2)
+
+
+def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"):
+    """Get benchmark workload for a Deep Q Network
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+    num_actions : int, optional
+        Number of actions
+    image_shape : tuple, optional
+        The input image shape
+    dtype : str, optional
+        The data type
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype)
+    return create_workload(net)
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 69ba4797a1c7..7b2c343b0844 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -104,10 +104,15 @@ def test_resnet():
     net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
     net.astext()
 
+def test_dqn():
+    net, params = tvm.relay.testing.dqn.get_workload(batch_size=1)
+    show(net.astext())
+
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
     test_mlp()
+    test_dqn()
     test_func()
     test_env()
     test_meta_data()

From 0308989b7e41f4a9406c3a64c7f301107f90e8af Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Tue, 30 Oct 2018 02:19:37 +0800
Subject: [PATCH 301/529] [PASS]unroll loops with extent=1 (#2027)

---
 src/pass/unroll_loop.cc                   |  4 +++-
 tests/python/unittest/test_pass_unroll.py | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/pass/unroll_loop.cc b/src/pass/unroll_loop.cc
index 6c0ac517553f..d4481e86c0fc 100644
--- a/src/pass/unroll_loop.cc
+++ b/src/pass/unroll_loop.cc
@@ -76,7 +76,9 @@ class LoopUnroller : public IRMutator {
       normal_loop_depth_ += 1;
     }
 
-    if (auto_unroll && explicit_unroll_) {
+    if ((auto_unroll && explicit_unroll_) ||
+        // unroll loops with extent = 1, no matter how many steps in body
+        (value <= auto_max_extent_ && auto_max_extent_ == 1)) {
       return Unroll(op);
     } else {
       if (auto_unroll) {
diff --git a/tests/python/unittest/test_pass_unroll.py b/tests/python/unittest/test_pass_unroll.py
index dda3fdad166c..68467b0c04c6 100644
--- a/tests/python/unittest/test_pass_unroll.py
+++ b/tests/python/unittest/test_pass_unroll.py
@@ -35,6 +35,23 @@ def test_unroll_loop():
     assert isinstance(ret.rest, tvm.stmt.For)
     assert ret.rest.for_type != tvm.stmt.For.Unrolled
 
+def test_unroll_fake_loop():
+    ib = tvm.ir_builder.create()
+    dtype = 'int32'
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((n, ), dtype)
+    Aptr = ib.buffer_ptr(Ab)
+    # for i in 0 to n-1:
+    with ib.for_range(0, 1, name="i") as i:
+        Aptr[i*2] = 3
+        with ib.for_range(0, 10, name="j") as j:
+            Aptr[j + 1] = Aptr[i] + 1
+
+    stmt = ib.get()
+    ret = tvm.ir_pass.UnrollLoop(stmt, 8, 0, 1, True)
+    assert isinstance(ret.first, tvm.stmt.Store)
+
 
 if __name__ == "__main__":
     test_unroll_loop()
+    test_unroll_fake_loop()
\ No newline at end of file

From 12606711ad50f1098f9719d0836481a19dc80880 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Mon, 29 Oct 2018 14:10:06 -0700
Subject: [PATCH 302/529] [Relay] DCGAN port (#2010)

---
 python/tvm/relay/testing/__init__.py       |  1 +
 python/tvm/relay/testing/dcgan.py          | 96 ++++++++++++++++++++++
 python/tvm/relay/testing/dqn.py            | 10 +++
 python/tvm/relay/testing/layers.py         | 24 ++++++
 tests/python/relay/test_ir_text_printer.py |  7 +-
 5 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/relay/testing/dcgan.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 2a9b81b84230..547fff425595 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -4,3 +4,4 @@
 from . import mlp
 from . import resnet
 from . import dqn
+from . import dcgan
diff --git a/python/tvm/relay/testing/dcgan.py b/python/tvm/relay/testing/dcgan.py
new file mode 100644
index 000000000000..96cd871e4122
--- /dev/null
+++ b/python/tvm/relay/testing/dcgan.py
@@ -0,0 +1,96 @@
+# pylint: disable=unused-argument
+"""
+Net of the generator of DCGAN
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = layers.conv2d_transpose(data,
+                                  kernel_size=kshape,
+                                  strides=stride,
+                                  channels=oshape[0],
+                                  padding=(pad_y, pad_x),
+                                  output_padding=(adj_y, adj_x),
+                                  name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = layers.batch_norm_infer(net, epsilon=eps, name="batch_norm")
+    net = relay.nn.relu(net)
+    return net
+
+def get_net(batch_size, random_len=100, oshape=(3, 64, 64), ngf=128, code=None, dtype="float32"):
+    """get net of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = relay.var("data", dtype=dtype, shape=(batch_size, random_len)) if code is None else code
+    dense_weight = relay.var("dense_weight")
+    dense = relay.nn.dense(code, weight=dense_weight, units=4*4*ngf*8)
+    relu = relay.nn.relu(dense)
+    # 4 x 4
+    reshape = relay.reshape(relu, newshape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    dc8 = deconv2d_bn_relu(
+        reshape, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    dc16 = deconv2d_bn_relu(
+        dc8, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    dc32 = deconv2d_bn_relu(
+        dc16, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    dc64 = deconv2d(
+        dc32, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    tanh = relay.tanh(dc64)
+
+    args = relay.ir_pass.free_vars(tanh)
+    return relay.Function(args, tanh)
+
+
+def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
+    """Get benchmark workload for a DCGAN generator
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+    oshape : tuple, optional
+        The shape of output image, layout="CHW"
+    ngf: int, optional
+        The number of final feature maps in the generator
+    random_len : int, optional
+        The length of random input
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, random_len, oshape=oshape, ngf=ngf, dtype=dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/dqn.py b/python/tvm/relay/testing/dqn.py
index 736894612e19..034ac0a6c2e5 100644
--- a/python/tvm/relay/testing/dqn.py
+++ b/python/tvm/relay/testing/dqn.py
@@ -30,15 +30,25 @@ def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"
     """get symbol of nature dqn"""
     data_shape = (batch_size,) + image_shape
     data = relay.var("data", shape=data_shape, dtype=dtype)
+
+    conv1_bias = relay.var("conv1_bias")
     conv1 = layers.conv2d(data, kernel_size=(8, 8), strides=(4, 4), padding=(0, 0),
                           channels=32, name="conv1")
+    conv1 = relay.nn.bias_add(conv1, conv1_bias)
     relu1 = relay.nn.relu(conv1)
+
+    conv2_bias = relay.var("conv2_bias")
     conv2 = layers.conv2d(relu1, kernel_size=(4, 4), strides=(2, 2), padding=(0, 0),
                           channels=64, name="conv2")
+    conv2 = relay.nn.bias_add(conv2, conv2_bias)
     relu2 = relay.nn.relu(conv2)
+
+    conv3_bias = relay.var("conv3_bias")
     conv3 = layers.conv2d(relu2, kernel_size=(3, 3), strides=(1, 1), padding=(0, 0),
                           channels=64, name="conv3")
+    conv3 = relay.nn.bias_add(conv3, conv3_bias)
     relu3 = relay.nn.relu(conv3)
+
     bf1 = relay.nn.batch_flatten(relu3)
     dense1 = layers.dense_add_bias(bf1, units=512, name="dense1")
     relu4 = relay.nn.relu(dense1)
diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py
index fc06ca229f77..1b279d9e72af 100644
--- a/python/tvm/relay/testing/layers.py
+++ b/python/tvm/relay/testing/layers.py
@@ -80,6 +80,30 @@ def conv2d(data, weight=None, **kwargs):
         weight = relay.var(name + "_weight")
     return relay.nn.conv2d(data, weight, **kwargs)
 
+def conv2d_transpose(data, weight=None, **kwargs):
+    """Wrapper of conv2d_transpose which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d_transpose.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    return relay.nn.conv2d_transpose(data, weight, **kwargs)
 
 def dense_add_bias(data, weight=None, bias=None, **kwargs):
     """Wrapper of dense which automatically creates weights if not given.
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 7b2c343b0844..fd446f9b7f03 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -106,13 +106,18 @@ def test_resnet():
 
 def test_dqn():
     net, params = tvm.relay.testing.dqn.get_workload(batch_size=1)
-    show(net.astext())
+    net.astext()
+
+def test_dcgan():
+    net, params = tvm.relay.testing.dcgan.get_workload(batch_size=1)
+    net.astext()
 
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
     test_mlp()
     test_dqn()
+    test_dcgan()
     test_func()
     test_env()
     test_meta_data()

From bfc8c68748798de9b73e14adbd0113a69dfd236d Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 30 Oct 2018 02:45:37 +0530
Subject: [PATCH 303/529] [RELAY]prelu op support (#2016)

---
 docs/langref/relay_op.rst            |  2 +
 include/tvm/relay/attrs/nn.h         | 11 ++++++
 include/tvm/relay/type.h             |  1 +
 python/tvm/relay/op/nn/nn.py         | 27 ++++++++++++++
 src/relay/op/nn/nn.cc                | 56 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 39 ++++++++++++++++---
 6 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index d1549cd8326e..405f071e3283 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -74,6 +74,7 @@ This level enables additional math and transform operators.
 
    tvm.relay.zeros
    tvm.relay.nn.leaky_relu
+   tvm.relay.nn.prelu
    tvm.relay.zeros_like
    tvm.relay.ones
    tvm.relay.ones_like
@@ -183,6 +184,7 @@ Level 2 Definitions
 Level 3 Definitions
 -------------------
 .. autofunction:: tvm.relay.nn.leaky_relu
+.. autofunction:: tvm.relay.nn.prelu
 .. autofunction:: tvm.relay.floor
 .. autofunction:: tvm.relay.ceil
 .. autofunction:: tvm.relay.trunc
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 34bd5eb93312..5077c82412a6 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -278,6 +278,17 @@ struct LeakyReluAttrs : public tvm::AttrsNode<LeakyReluAttrs> {
 };
 
 
+/*! \brief Attributes for prelu operator */
+struct PReluAttrs : public tvm::AttrsNode<PReluAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(PReluAttrs, "relay.attrs.PReluAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(1)
+        .describe("Specify which shape axis the channel is specified.");
+  }
+};
+
+
 /*! \brief Attributes used in dropout operator */
 struct DropoutAttrs : public tvm::AttrsNode<DropoutAttrs> {
   double rate;
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index 0b61004f9a66..c8ccb6035ae9 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -280,6 +280,7 @@ class TypeReporterNode : public Node {
   TVM_DLL virtual void Assign(const Type& dst, const Type& src) = 0;
   /*!
    * \brief assert shape expression comparison.
+   * \note Use assert only if any of the condition input is symbolic.
    * \param cond The condition of operation.
    * \return false if assertation can be proven to have failed
    *      true if solver can still proceed.
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index d0ccfcb44899..61c930436167 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -528,6 +528,33 @@ def leaky_relu(data, alpha):
     return _make.leaky_relu(data, alpha)
 
 
+def prelu(data, alpha, axis=1):
+    """This operator takes data as input and does Leaky version
+    of a Rectified Linear Unit.
+
+    .. math::
+
+        `y = x > 0 ? x : alpha * x`
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    alpha : tvm.relay.Expr
+        Slope coefficient for the negative half axis.
+
+    axis : int, optional
+        Specify which shape axis the channel is specified.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.prelu(data, alpha, axis)
+
+
 def pad(data,
         pad_width,
         pad_value=0.0):
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index d38c5a0ebe0d..d141eec3bdd2 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -171,6 +171,62 @@ RELAY_REGISTER_OP("nn.leaky_relu")
 .add_type_rel("Identity", IdentityRel);
 
 
+TVM_REGISTER_NODE_TYPE(PReluAttrs);
+
+bool PReluRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const PReluAttrs* param = attrs.as<PReluAttrs>();
+  CHECK(param != nullptr);
+
+  CHECK(param->axis < static_cast<int>(data->shape.size()))
+    << "Wrong axis ("  << param->axis << ")value.";
+
+  // assign alpha type
+  Array<IndexExpr> alpha_shape({data->shape[param->axis]});
+  reporter->Assign(types[1], TensorTypeNode::make(alpha_shape, data->dtype));
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(data->shape, data->dtype));
+  return true;
+}
+
+// Positional relay function to create prelu operator used by frontend FFI.
+Expr MakePRelu(Expr data,
+               Expr alpha,
+               int axis) {
+  auto attrs = make_node<PReluAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("nn.prelu");
+  return CallNode::make(op, {data, alpha}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.prelu")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakePRelu, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.prelu")
+.describe(R"code(Parametric version of a Rectified Linear Unit.
+It accepts two arguments: an input ``x`` and a channelwise slope ``alpha``
+and computes the output as :math:`PReLU(x) y = x > 0 ? x : alpha * x`,
+where :math:`*` is an channelwise multiplication for each sample in the batch.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.PReluAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("alpha", "Tensor", "Input channelwise alpha.")
+.set_support_level(3)
+.add_type_rel("PRelu", PReluRel);
+
+
 TVM_REGISTER_API("relay.op.nn._make.softmax")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   auto make_func = [](Expr data, int axis) {
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 427ac562fbc7..6f06c8698e3f 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -188,13 +188,39 @@ def test_full_like():
     assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
 def test_infer_type_leaky_relu():
-   n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-   x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-   y = relay.nn.leaky_relu(x, alpha=0.1)
-   "alpha=0.1" in y.astext()
-   yy = relay.ir_pass.infer_type(y)
-   assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.leaky_relu(x, alpha=0.1)
+    "alpha=0.1" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
+def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
+    x = relay.var("data", relay.TensorType(data, dtype))
+    if alpha:
+        y = relay.var("alpha", relay.TensorType(alpha, dtype))
+    else:
+        y = relay.var("alpha", relay.IncompleteType())
+    z = relay.nn.prelu(x, y, axis=axis)
+    zz = relay.ir_pass.infer_type(z)
+    if axis != 1:
+        assert "axis" in z.astext()
+    assert zz.checked_type == relay.ty.TensorType(output, dtype)
+    if not alpha:
+        axis = axis if axis else 1
+        alpha_shape = (data[axis],)
+        assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32")
+
+def test_infer_type_prelu():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w))
+    verify_infer_type_prelu((n, h, w, c), (c,), 3, (n, h, w, c))
+    verify_infer_type_prelu((n, c, h, w), None, 1, (n, c, h, w))
+    verify_infer_type_prelu((n, h, w, c), None, 3, (n, h, w, c))
+    verify_infer_type_prelu((1, 3, 2, 2), (3,), 1, (1, 3, 2, 2))
+    verify_infer_type_prelu((1, 2, 2, 3), (3,), 3, (1, 2, 2, 3))
+    verify_infer_type_prelu((1, 3, 2, 2), None, 1, (1, 3, 2, 2))
+    verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3))
 
 if __name__ == "__main__":
     test_cast()
@@ -208,6 +234,7 @@ def test_infer_type_leaky_relu():
     test_full()
     test_full_like()
     test_infer_type_leaky_relu()
+    test_infer_type_prelu()
     test_squeeze_infer_type()
     test_squeeze_bad_axes_infer_type()
     test_split_infer_type()

From 5984fab9741e5ef137f0ec3dd105017a2066779a Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Mon, 29 Oct 2018 14:36:44 -0700
Subject: [PATCH 304/529] Refine porting x86 NCHWc conv to AutoTVM (#1993)

---
 nnvm/python/nnvm/top/nn.py                  |  17 +-
 python/tvm/autotvm/task/dispatcher.py       | 113 +++--
 python/tvm/autotvm/task/space.py            |  14 +-
 python/tvm/autotvm/task/task.py             |  15 +-
 python/tvm/autotvm/task/topi_integration.py |  17 +-
 topi/python/topi/generic/nn.py              |  22 +-
 topi/python/topi/hls/nn.py                  |  21 +-
 topi/python/topi/intel_graphics/conv2d.py   |   5 +-
 topi/python/topi/nn/conv2d.py               |  57 +--
 topi/python/topi/x86/conv2d.py              | 516 ++++----------------
 topi/python/topi/x86/conv2d_avx_1x1.py      | 128 +----
 topi/python/topi/x86/conv2d_avx_common.py   | 115 +----
 topi/recipe/conv/test_conv_int8_intel.py    |  27 +-
 topi/tests/python/test_topi_conv2d_NCHWc.py | 206 ++++++++
 tutorials/autotvm/tune_nnvm_x86.py          |  15 +-
 15 files changed, 487 insertions(+), 801 deletions(-)
 create mode 100644 topi/tests/python/test_topi_conv2d_NCHWc.py

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 49192cacd713..a4b36ea853d5 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -167,16 +167,16 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
-    kh, kw = attrs.get_int_tuple('kernel_size')
     groups = attrs.get_int("groups")
-    channels = attrs.get_int("channels")
     layout = attrs.get_string("layout")
     out_layout = attrs.get_string("out_layout")
+    out_dtype = attrs.get_string("out_dtype")
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], channels, (kh, kw),
-                                   strides, padding, layout, out_layout)
+        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+                                   layout, out_layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     else:
         raise ValueError("not support arbitrary group number > 1 for now")
@@ -190,16 +190,9 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
 def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
     """Schedule definition of conv2d NCHWc"""
     groups = attrs.get_int("groups")
-    kh, kw = attrs.get_int_tuple('kernel_size')
-    oc = attrs.get_int("channels")
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    layout = attrs.get_string("layout")
-    out_layout = attrs.get_string("out_layout")
     with tvm.target.create(target):
         if groups == 1:
-            return topi.generic.schedule_conv2d_NCHWc(oc, (kh, kw), strides, padding,
-                                                      layout, out_layout, outs)
+            return topi.generic.schedule_conv2d_NCHWc(outs)
         else:
             raise ValueError("not support group number > 1 for now")
 
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 164877e3b451..fd91d60e7982 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -60,6 +60,53 @@ def query(self, target, workload):
             ret = self._old_ctx.query(target, workload)
         return ret
 
+    def update(self, target, workload, cfg):
+        """
+        Update context with a specific config.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+        cfg : ConfigSpace
+            The specific configuration.
+
+        Note
+        ----
+        This interface is for cases when TVM decides to replace an operator in the graph.
+        For example, `AlterOpLayout` pass (enables when `opt_level = 3`) replaces `NCHW`
+        convolution with `NCHW[x]c` implementation on x86 CPUs.
+        Thus in TOPI, we first query schedule using original `NCHW` workload,
+        then update the dispatcher with the new `NCHW[x]c` workload.
+        So that later on, `NCHW[x]c` convolution can get schedule from the dispatcher using
+        its own workload directly.
+
+        .. code-block:: python
+
+            @conv2d_alter_layout.register("cpu")
+            def _alter_conv2d_layout(attrs, inputs, tinfo):
+                workload = get_conv2d_workload(...)
+                dispatch_ctx = autotvm.task.DispatchContext.current
+                target = tvm.target.current_target()
+                config = dispatch_ctx.query(target, workload)
+
+                # Get conv2d_NCHWc workload from config
+                # new_workload = ...
+                # new_inputs = ...
+                # new_attrs = ...
+
+                # Store altered operator's config
+                dispatch_ctx.update(target, new_workload, config)
+                return sym.contrib.conv2d_NCHWc(*new_inputs, **new_attrs)
+
+        We directly store `config` back because `conv2d_NCHW` and `conv2d_NCHWc`
+        share the same schedule parameters.
+        One can construct a new `ConfigEntity` if this is not the case.
+        """
+        raise NotImplementedError()
+
     def _query_inside(self, target, workload):
         """
         Query the context to get the specific config for a template.
@@ -179,6 +226,11 @@ def _query_inside(self, target, workload):
         self.workload = workload
         return self._config
 
+    def update(self, target, workload, cfg):
+        """Override update"""
+        self.workload = workload
+        self._config = cfg
+
 
 class ApplyHistoryBest(DispatchContext):
     """
@@ -197,6 +249,7 @@ def __init__(self, records):
 
         self.best_by_targetkey = {}
         self.best_by_model = {}
+        self._best_user_defined = {}
 
         if records:
             self.load(records)
@@ -264,17 +317,32 @@ def _query_inside(self, target, workload):
             if opt.startswith("-model"):
                 model = opt[7:]
                 key = (model, workload)
+                if key in self._best_user_defined:
+                    return self._best_user_defined[key]
                 if key in self.best_by_model:
                     return self.best_by_model[key][0].config
 
         # then try matching by target key
         for k in target.keys:
             key = (k, workload)
+            if key in self._best_user_defined:
+                return self._best_user_defined[key]
             if key in self.best_by_targetkey:
                 return self.best_by_targetkey[key][0].config
 
         return None
 
+    def update(self, target, workload, cfg):
+        for opt in target.options:
+            if opt.startswith("-model"):
+                model = opt[7:]
+                key = (model, workload)
+                self._best_user_defined[key] = cfg
+
+        for k in target.keys:
+            key = (k, workload)
+            self._best_user_defined[key] = cfg
+
 
 class FallbackContext(DispatchContext):
     """
@@ -324,6 +392,10 @@ def clear_cache(self, target, workload):
         if key in self.memory:
             del self.memory[key]
 
+    def update(self, target, workload, cfg):
+        key = (str(target), workload)
+        self.memory[key] = cfg
+
 DispatchContext.current = FallbackContext()
 
 def clear_fallback_cache(target, workload):
@@ -391,37 +463,14 @@ def _query_inside(self, target, workload):
         cfg : ConfigSpace
             The specific configuration.
         """
-        cfg = self._records[self._counter][0].config
-        self._counter += 1
-        return cfg
-
-    def query_global_dict(self, key):
-        """
-        Query the context to get config from global
-        config dictionary.
-
-        Parameters
-        ----------
-        key : str
-            Key to query the config.
-
-        Returns
-        -------
-        cfg : ConfigSpace
-            The specific configuration.
-        """
+        if self._counter < len(self._records):
+            cfg = self._records[self._counter][0].config
+            self._counter += 1
+            self.update(target, workload, cfg)
+            return cfg
+        key = (str(target), workload)
         return self._global_cfg_dict[key]
 
-    def update_global_dict(self, key, val):
-        """
-        Update the global config dictionary.
-
-        Parameters
-        ----------
-        key : str
-            Key of config.
-
-        val : ConfigSpace
-            Value of config.
-        """
-        self._global_cfg_dict[key] = val
+    def update(self, target, workload, cfg):
+        key = (str(target), workload)
+        self._global_cfg_dict[key] = cfg
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index f9bf60237776..32bd66b6c12d 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -1,5 +1,5 @@
 # pylint: disable=too-few-public-methods,invalid-name,unused-argument,arguments-differ
-# pylint: disable=consider-using-enumerate
+# pylint: disable=consider-using-enumerate,too-many-lines
 """
 Template configuration space.
 
@@ -996,5 +996,17 @@ def fallback_with_reference_log(self, ref_log):
             if not isinstance(self.space_map[knob_name], SplitSpace):
                 self._entity_map[knob_name] = best_match_cfg[knob_name]
 
+    def __setitem__(self, name, entity):
+        """set the entity(knob) of by name
+
+        Parameters
+        ----------
+        name: str
+            name of the entity
+        entity: SplitEntity, ReorderEntity, AnnotateEntity, OtherOptionEntity
+            value of the entity
+        """
+        self._entity_map[name] = entity
+
     def __repr__(self):
         return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash)
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index ab52788c8d91..22a15143b96e 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -182,7 +182,7 @@ def create(func_name, args, target, target_host=None, template_key=None):
 
     return ret
 
-def args_to_workload(x):
+def args_to_workload(x, topi_compute_func=None):
     """Convert argument list to hashable workload tuple.
     This function will convert list to tuple, tvm node to python value and
     flatten tvm.tensor.Tensor to a tuple
@@ -191,6 +191,8 @@ def args_to_workload(x):
     ----------
     x: primitive hashable types or tensor.Tensor
         The original value
+    topi_compute_func: topi compute function
+        The function name will be added as first element of the workload tuple
 
     Returns
     -------
@@ -198,18 +200,19 @@ def args_to_workload(x):
         The hashable value
     """
     if isinstance(x, tensor.Tensor):
-        return get_const_tuple(x.shape) + (x.dtype, )
+        workload = get_const_tuple(x.shape) + (x.dtype, )
     elif isinstance(x, (tuple, list, container.Array)):
-        return tuple([args_to_workload(a) for a in x])
+        workload = tuple([args_to_workload(a) for a in x])
     elif isinstance(x, (str, int, float, np.int, np.float)):
-        return x
+        workload = x
     elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)):
-        return x.value
+        workload = x.value
     elif x is None:
-        return 0
+        workload = 0
     else:
         raise RuntimeError('Do not support type "%s" in argument. Consider to use'
                            'primitive types only' % type(x))
+    return (get_func_name(topi_compute_func), ) + workload  if topi_compute_func else workload
 
 def template(func):
     """
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 18f45f8d6708..f005ee0c9a54 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -1,4 +1,4 @@
-# pylint: disable=unused-variable,invalid-name
+# pylint: disable=unused-variable,invalid-name,unused-argument
 """
 Decorators for registering tunable templates to TOPI.
 
@@ -13,7 +13,6 @@
 
 from ... import _api_internal, tensor
 
-from ..util import get_func_name
 from .task import args_to_workload, dispatcher
 
 
@@ -55,8 +54,6 @@ def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
     --------
     See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
     """
-    fname = get_func_name(topi_compute)
-
     def _decorator(f):
         targets = [target_keys] if isinstance(target_keys, str) else target_keys
         for target_key in targets:
@@ -68,7 +65,7 @@ def _decorator(f):
                 def config_dispatcher(*args, **kwargs):
                     """override topi call as a config dispatcher"""
                     assert not kwargs, "Do not support kwargs in template function call"
-                    return (fname, ) + args_to_workload(args)
+                    return args_to_workload(args, topi_compute)
                 _REGISTED_DISPATHCER[target_key][topi_compute] = config_dispatcher
 
             config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_compute]
@@ -88,7 +85,7 @@ def template_call(cfg, *args, **kwargs):
                 attrs = {}
                 for k, v in node.op.attrs.items():
                     attrs[k] = v
-                attrs['workload'] = (fname, ) + args_to_workload(args)
+                attrs['workload'] = args_to_workload(args, topi_compute)
                 if isinstance(op, tensor.ComputeOp):
                     op = _api_internal._ComputeOp(
                         op.name, op.tag, attrs, op.axis, op.body)
@@ -153,7 +150,7 @@ def _decorator(f):
             if topi_schedule not in _REGISTED_DISPATHCER[target_key]:
                 @topi_schedule.register(target_key)
                 @dispatcher
-                def config_dispatcher(outs):
+                def config_dispatcher(outs, *args, **kwargs):
                     """override topi call as a workload dispatcher"""
                     def traverse(tensors):
                         """traverse all ops to find attached workload"""
@@ -179,11 +176,11 @@ def traverse(tensors):
             config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_schedule]
 
             @config_dispatcher.register(template_keys)
-            def template_call(cfg, outs):
+            def template_call(cfg, outs, *args, **kwargs):
                 """call the schedule func"""
                 if f == topi_schedule.fdefault:
-                    return f(outs)
-                return f(cfg, outs)
+                    return f(outs, *args, **kwargs)
+                return f(cfg, outs, *args, **kwargs)
 
         return f
 
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index e99ce263296b..765b48d286bc 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -55,33 +55,15 @@ def schedule_conv2d_nhwc(outs):
 
 
 @tvm.target.generic_func
-def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
-                          padding, layout, out_layout, outs):
+def schedule_conv2d_NCHWc(outs):
     """Schedule for conv2d_NCHW[x]c
 
     Parameters
     ----------
-    num_filter : int
-        The number of filter, i.e., the output channel.
-
-    kernel_size : tuple of int
-        (kernel_height, kernel_width)
-
-    strides : tuple of int
-        (stride_of_height, stride_of_width)
-
-    padding : tuple of int
-        (pad_of_height, pad_of_width)
-
-    layout : str
-        Input data layout
-
-    out_layout : str
-        Output data layout
-
     outs : Array of Tensor
         The computation graph description of conv2d_NCHWc
         in the format of an array of tensors.
+        The number of filter, i.e., the output channel.
 
     Returns
     -------
diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py
index 8c986d7a5663..536453fc629c 100644
--- a/topi/python/topi/hls/nn.py
+++ b/topi/python/topi/hls/nn.py
@@ -73,30 +73,11 @@ def schedule_conv2d_nhwc(outs):
 
 
 @generic.schedule_conv2d_NCHWc.register(["hls"])
-def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
-                          padding, layout, out_layout, outs):
+def schedule_conv2d_NCHWc(outs):
     """Schedule for conv2d_NCHW[x]c
 
     Parameters
     ----------
-    num_filter : int
-        The number of filter, i.e., the output channel.
-
-    kernel_size : tuple of int
-        (kernel_height, kernel_width)
-
-    strides : tuple of int
-        (stride_of_height, stride_of_width)
-
-    padding : tuple of int
-        (pad_of_height, pad_of_width)
-
-    layout : str
-        Input data layout
-
-    out_layout : str
-        Output data layout
-
     outs : Array of Tensor
         The computation graph description of conv2d_NCHWc
         in the format of an array of tensors.
diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index 4dae00e9c146..f6767b68afa1 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -61,8 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 @conv2d_NCHWc.register(["intel_graphics"])
-def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, layout,\
-                 out_layout, out_dtype='float32'):
+def _decl_conv2d(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -101,7 +100,7 @@ def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, layout,
     return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype)
 
 @generic.schedule_conv2d_NCHWc.register(["intel_graphics"])
-def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, layout, out_layout, outs):
+def schedule_conv2d_NCHWc(outs):
     """Schedule for conv2d_nchw for Intel Graphics
 
     Parameters
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 4d70c4903a3f..7636350dfbf6 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -84,32 +84,6 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
         '{} vs. {}".format(data.dtype, kernel.dtype)
     return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
 
-def _get_workload_int8(data, kernel, stride, padding, out_dtype):
-    """ Get the workload structure. """
-    _, CI, IH, IW = [x.value for x in data.shape]
-    CO, _, KH, KW = [x.value for x in kernel.shape]
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
-        "Do not support inputs with different data types now. ' \
-        '{} vs. {}".format(data.dtype, kernel.dtype)
-    return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
-
-
-
-@tvm.target.generic_func
-def _get_alter_layout_schedule(wkl):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule for conv2d_alter_layout. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
 
 @tvm.target.generic_func
 def _get_schedule(wkl):
@@ -122,28 +96,6 @@ def _get_schedule(wkl):
     return wkl
 
 
-@tvm.target.generic_func
-def _get_schedule_NCHWc(wkl, layout, out_layout):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
-@tvm.target.generic_func
-def _get_schedule_NCHWc_int8(wkl, layout, out_layout):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
 def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """Convolution operator in NCHW layout.
 
@@ -302,8 +254,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
 
 
 @tvm.target.generic_func
-def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
-                 padding, layout, out_layout, out_dtype='float32'):
+def conv2d_NCHWc(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for nChw[x]c layout.
 
     Parameters
@@ -316,12 +267,6 @@ def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
         [num_filter_chunk, in_channel_chunk, filter_height, filter_width,
         in_channel_block, num_filter_block]
 
-    num_filter : int
-        number of filters, i.e., output channel size
-
-    kernel_size : tuple of two ints
-       [kernel_height, kernel_width]
-
     stride : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index f766d827686d..c588e74432a4 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -2,203 +2,15 @@
 """Conv2D schedule on x86"""
 import tvm
 from tvm import autotvm
-from tvm.autotvm.task.dispatcher import ApplyGraphBest
 from tvm.autotvm.task.nnvm_integration import deserialize_args
 from tvm.autotvm.task import register, get_config
 from .. import generic, tag
 from .. import nn
 from ..util import get_const_tuple
-from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \
-    _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \
-    _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload
+from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload
 from ..nn.pad import pad
 
 from . import conv2d_avx_1x1, conv2d_avx_common
-from .conv2d_avx_common import AVXConvCommonFwd
-from .conv2d_avx_1x1 import AVXConv1x1Fwd
-from .check_targets import check_skylake
-
-@_get_schedule.register("cpu")
-def _get_schedule_conv(wkl):
-    _WORKLOADS_AVX = [
-        # workloads of resnet18_v1 on imagenet
-        Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-        Workload('float32', 'float32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        Workload('float32', 'float32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
-        # workloads of resnet101_v1 on imagenet, no extra workload required
-        # workloads of resnet152_v1 on imagenet, no extra workload required
-        # workloads of resnet18_v2 on imagenet, no extra workload required
-        # workloads of resnet34_v2 on imagenet, no extra workload required
-    ]
-
-    fp32_vec_len = 8
-    target = tvm.target.current_target(allow_none=False)
-    for opt in target.options:
-        if opt == '-mcpu=skylake-avx512':
-            fp32_vec_len = 16
-
-    _SCHEDULES_AVX = [
-        # workloads of resnet18_v1 on imagenet
-        AVXConvCommonFwd(3, fp32_vec_len, 28, False),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        # workloads of resnet101_v1 on imagenet, no extra workload required
-        # workloads of resnet152_v1 on imagenet, no extra workload required
-        # workloads of resnet18_v2 on imagenet, no extra workload required
-        # workloads of resnet34_v2 on imagenet, no extra workload required
-    ]
-
-    if wkl not in _WORKLOADS_AVX:
-        if wkl.hkernel == 1 and wkl.wkernel == 1:
-            return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len)
-        return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len)
-    idx = _WORKLOADS_AVX.index(wkl)
-    sch = _SCHEDULES_AVX[idx]
-    return sch
-
-def _get_schedule_conv_int8(wkl):
-    _WORKLOADS_AVX = [
-        ## Following are for INT8 kernels
-        Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-        Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-        Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-        Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-        Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-        Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-        Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
-        Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
-        Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
-    ]
-
-    fp32_vec_len = 8
-    target = tvm.target.current_target(allow_none=False)
-    if check_skylake(target):
-        fp32_vec_len = 16
-
-    _SCHEDULES_AVX = [
-        # Following are for INT8 operations
-        # workloads of resnet18_v1 on imagenet
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        # workloads of resnet101_v1 on imagenet, no extra workload required
-        # workloads of resnet152_v1 on imagenet, no extra workload required
-        # workloads of resnet18_v2 on imagenet, no extra workload required
-        # workloads of resnet34_v2 on imagenet, no extra workload required
-    ]
-
-    if wkl not in _WORKLOADS_AVX:
-        if wkl.hkernel == 1 and wkl.wkernel == 1:
-            return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len)
-        return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len)
-    idx = _WORKLOADS_AVX.index(wkl)
-    sch = _SCHEDULES_AVX[idx]
-    return sch
-
-@_get_schedule_NCHWc.register("cpu")
-def _get_schedule_NCHWc_x86(wkl, layout, out_layout):
-    return _get_schedule_conv(wkl)
-
-@_get_schedule_NCHWc_int8.register("cpu")
-def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout):
-    return _get_schedule_conv_int8(wkl)
-
-@_get_alter_layout_schedule.register("cpu")
-def _get_alter_layout_schedule_x86(wkl):
-    return _get_schedule_conv(wkl)
-
 
 def _get_fp32_len():
     fp32_vec_len = 8
@@ -210,18 +22,23 @@ def _get_fp32_len():
     return fp32_vec_len
 
 
-def _get_default_sch(workload):
+def _get_default_config(cfg, workload):
+    """
+    Get default schedule config for the workload
+    Parameters
+    ----------
+    workload : topi.nn.conv2d.Workload
+        Convolution workload
+    """
     fp32_vec_len = _get_fp32_len()
-    _, _, kh, kw, _ = workload[2]
-    is_kernel_1x1 = kh == 1 and kw == 1
+    is_kernel_1x1 = workload.hkernel == 1 and workload.wkernel == 1
     if is_kernel_1x1:
-        cfg = conv2d_avx_1x1._fallback_schedule(workload, fp32_vec_len)
+        conv2d_avx_1x1._fallback_schedule(cfg, workload, fp32_vec_len)
     else:
-        cfg = conv2d_avx_common._fallback_schedule(workload, fp32_vec_len)
-    return cfg
+        conv2d_avx_common._fallback_schedule(cfg, workload, fp32_vec_len)
 
 
-def _create_schedule_template(cfg, data, kernel, strides, padding, layout):
+def _create_tuning_space(cfg, data, kernel, strides, padding, layout):
     """Create schedule configuration from input arguments"""
     dshape = get_const_tuple(data.shape)
     kshape = get_const_tuple(kernel.shape)
@@ -247,38 +64,17 @@ def _create_schedule_template(cfg, data, kernel, strides, padding, layout):
         cfg.define_knob("unroll_kw", [True, False])
 
 
-def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
-    """convert argument to workload"""
-    if len(kernel.shape) == 4:
-        raw_kernel = kernel
-    else:  # the input kernel is transformed by alter_op_layout
-        shape = get_const_tuple(kernel.shape)
-        raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
-                                     dtype=kernel.dtype)
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
-
-
-@conv2d.register("cpu")
-@autotvm.task.dispatcher
-def conv2d_x86(data, kernel, strides, padding, layout, out_dtype):
-    """x86 conv2d declaration."""
-    return conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
-
-@conv2d_x86.register(["direct"])
+@autotvm.register_topi_compute(conv2d, 'cpu', 'direct')
 def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype):
     out_dtype = data.dtype if out_dtype is None else out_dtype
     padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
     strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
     if layout == 'NCHW':
-        _create_schedule_template(cfg, data, kernel, strides, padding, layout)
+        _create_tuning_space(cfg, data, kernel, strides, padding, layout)
         if cfg.is_fallback:
-            workload = conv_arg_to_workload(data, kernel, strides, padding,
-                                            layout, out_dtype)
-            cfg = _get_default_sch(workload)
-        args = [cfg, data, kernel, strides, padding, layout, out_dtype]
-        return _declaration_conv_impl(*args)
+            wkl = _get_workload(data, kernel, strides, padding, out_dtype)
+            _get_default_config(cfg, wkl)
+        return _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtype)
     elif layout == 'HWCN':
         return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype)
     elif layout == 'NHWC':
@@ -345,11 +141,7 @@ def _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtyp
                          lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn]
                          .astype(out_dtype),
                          name='output_unpack',
-                         tag='conv2d_nchw',
-                         attrs={'workload':
-                                    conv_arg_to_workload(data, kernel, strides,
-                                                         padding, layout,
-                                                         out_dtype)})
+                         tag='conv2d_nchw')
     return unpack
 
 
@@ -385,18 +177,7 @@ def traverse(op):
 
             _, _, kh, kw = get_const_tuple(kernel.shape)
             is_kernel_1x1 = kh == 1 and kw == 1
-            current_cfg = cfg
-            if cfg.is_fallback:
-                workload_attr = op.attrs["workload"]
-                strides = (int(workload_attr[3][0].value), int(workload_attr[3][1].value))
-                padding = (int(workload_attr[4][0].value), int(workload_attr[4][1].value))
-                layout = workload_attr[5].value
-                out_dtype = workload_attr[6].value
-                workload = conv_arg_to_workload(data, kernel, strides, padding,
-                                                layout, out_dtype)
-                current_cfg = _get_default_sch(workload)
-            args = [s, current_cfg, data, data_pad, data_vec, kernel_vec, conv_out,
-                    output, outs[0]]
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
             if is_kernel_1x1:
                 conv2d_avx_1x1._schedule_conv(*args)
             else:
@@ -470,17 +251,13 @@ def traverse(op):
 @register("topi_x86_conv2d_NCHWc")
 def _topi_nn_conv2d_NCHWc(*args, **kwargs):
     assert not kwargs, "Do not support kwargs in template function call"
-    args = deserialize_args(args)
-    data, kernel = args[:2]
-    strides = args[4]
-    padding = args[5]
-    layout = args[6]
+    data, kernel, strides, padding, origin_layout, dtype = deserialize_args(args)
     raw_data_shape = get_const_tuple(data.shape)
     raw_kernel_shape = get_const_tuple(kernel.shape)
 
     # get config here
     cfg = get_config()
-    _create_schedule_template(cfg, data, kernel, strides, padding, layout)
+    _create_tuning_space(cfg, data, kernel, strides, padding, origin_layout)
 
     # change shape with the value in config
     ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
@@ -491,50 +268,13 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs):
     out_layout = "NCHW%dc" % oc_bn
     new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn,
                         raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn)
-    args[0] = tvm.placeholder(new_data_shape, data.dtype)
-    args[1] = tvm.placeholder(new_kernel_shape, kernel.dtype)
-    args[6] = data_layout
-    args[7] = out_layout
+    new_data = tvm.placeholder(new_data_shape, data.dtype)
+    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
 
-    C = _declaration_conv_NCHWc(cfg, *args, **kwargs)
-    s = _schedule_conv2d_NCHWc(cfg, args[2], args[3], args[4], args[5],
-                               args[6], args[7], [C])
-    return s, [args[0], args[1], C]
-
-
-def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
-                               padding, layout, out_layout, out_dtype):
-    """convert argument to workload"""
-    dshape = get_const_tuple(data.shape)
-    kshape = get_const_tuple(kernel.shape)
-    if len(dshape) > 4:
-        raw_data = tvm.placeholder((dshape[0], dshape[1] * dshape[4], dshape[2],
-                                    dshape[3]), dtype=kernel.dtype)
-    else:
-        raw_data = data
-    if len(kshape) > 4:
-        raw_kernel = tvm.placeholder((kshape[0] * kshape[5], kshape[1] * kshape[4],
-                                      kshape[2], kshape[3]), dtype=kernel.dtype)
-    else:
-        raw_kernel = kernel
-    return ('conv2d_NCHWc', ) + autotvm.task.args_to_workload(
-        [raw_data, raw_kernel, strides, padding, layout, out_layout,
-         out_dtype])
-
-
-def _query_dispatcher(workload, in_alter_op=False):
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    if isinstance(dispatch_ctx, ApplyGraphBest):
-        if in_alter_op:
-            cfg = dispatch_ctx.query(None, None)
-        else:
-            cfg = dispatch_ctx.query_global_dict(workload)
-    else:
-        target = tvm.target.current_target()
-        cfg = dispatch_ctx.query(target, workload)
-        if cfg.is_fallback:
-            cfg = _get_default_sch(workload)
-    return cfg
+    C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding,
+                                data_layout, out_layout, dtype)
+    s = _schedule_conv2d_NCHWc(cfg, [C])
+    return s, [new_data, new_kernel, C]
 
 
 @conv2d_alter_layout.register("cpu")
@@ -546,87 +286,72 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     # only optimize for NCHW, groups=1 conv
     if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
         return None
+    batch_size, in_channel, height, width = get_const_tuple(data.shape)
+    out_channel, _, kh, kw = get_const_tuple(kernel.shape)
 
-    kernel_size = attrs.get_int_tuple("kernel_size")
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     layout = attrs['layout']
-    out_layout = layout if attrs["out_layout"] == "__undef__" else attrs["out_layout"]
 
     dtype = data.dtype
     out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
-    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
-                                          padding, layout, out_layout, out_dtype)
-    cfg = _query_dispatcher(workload, True)
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    new_attrs['layout'] = 'NCHW%dc' % ic_bn
-    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
 
-    # Store global schedule dictionary for ApplyGraphBest dispatcher
+    workload = autotvm.task.args_to_workload(
+        [data, kernel, strides, padding, layout, out_dtype], conv2d)
     dispatch_ctx = autotvm.task.DispatchContext.current
-    if isinstance(dispatch_ctx, ApplyGraphBest):
-        workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
-                                              padding, new_attrs['layout'],
-                                              new_attrs['out_layout'], out_dtype)
-        global_dict_key = workload
-        dispatch_ctx.update_global_dict(global_dict_key, cfg)
+    target = tvm.target.current_target()
+    cfg = dispatch_ctx.query(target, workload)
+    if cfg.is_fallback:
+        wkl = _get_workload(data, kernel, strides, padding, out_dtype)
+        _get_default_config(cfg, wkl)
 
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    new_attrs['layout'] = 'NCHW%dc' % ic_bn
+    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
     # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
     new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
 
-    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
+    # Store altered operator's config
+    new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
+                               dtype=data.dtype)
+    new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
+                                 dtype=kernel.dtype)
+    new_workload = autotvm.task.args_to_workload(
+        [new_data, new_kernel, strides, padding, new_attrs['layout'],
+         new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
+    dispatch_ctx.update(target, new_workload, cfg)
 
-
-@conv2d_NCHWc.register("cpu")
-def conv2d_NCHWc_cpu(data, kernel, num_filter, kernel_size, strides,
-                     padding, layout, out_layout, out_dtype):
-    """x86 conv2d_NCHWc declaration."""
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    if not isinstance(dispatch_ctx, ApplyGraphBest):
-        layout = out_layout = "NCHW"
-    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
-                                          padding, layout, out_layout, out_dtype)
-    cfg = _query_dispatcher(workload)
-    return _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides,
-                                   padding, layout, out_layout, out_dtype)
+    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 
-def _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides,
+@autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct')
+def _declaration_conv_NCHWc(cfg, data, kernel, strides,
                             padding, layout, out_layout, out_dtype):
-    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
-    ic = ic_chunk * ic_block
-    kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \
-        (kernel_size, kernel_size)
-    is_kernel_1x1 = kh == 1 and kw == 1
-    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
-    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    # layout and out_layout are not used here,
+    # we keep them for debug convenience when dumping autotvm workload
+    HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
 
+    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
+    in_channel = ic_chunk * ic_bn
     if data.dtype == 'uint8':
-        wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype),
-                                 tvm.placeholder((num_filter, ic, kh, kw),
-                                                 dtype=kernel.dtype),
-                                 strides, padding, out_dtype)
-        sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
-        return conv2d_avx_1x1._declaration_conv_NCHWc_int8(wkl, sch, data, kernel) \
-            if is_kernel_1x1 \
-            else conv2d_avx_common._declaration_conv_NCHWc_int8(wkl, sch, data, kernel)
-
-    args = [cfg, data, kernel, (kh, kw), (sh, sw), (ph, pw), layout, out_layout, out_dtype]
-    return _declaration_conv_NCHWc_impl(*args)
-
-
-def _declaration_conv_NCHWc_impl(cfg, data, kernel, kernel_size, strides, padding, layout,
-                                 out_layout, out_dtype):
-    HPAD, WPAD = padding
-    HSTR, WSTR = strides
-
-    n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape)
-    ic = ic_chunk * ic_block
-    kh, kw = kernel_size
-    oc_chunk, _, _, _, _, oc_block = get_const_tuple(kernel.shape)
-    oc = oc_chunk * oc_block
-    oh = (ih + 2 * HPAD - kh) // HSTR + 1
-    ow = (iw + 2 * WPAD - kw) // WSTR + 1
+        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(kernel.shape)
+    else:
+        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
+    num_filter = oc_chunk * oc_bn
+
+    # get workload and related schedule config
+    wkl = _get_workload(tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
+                        tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
+                                        dtype=kernel.dtype),
+                        strides, padding, out_dtype)
+    if cfg.is_fallback:
+        _get_default_config(cfg, wkl)
+
+    # output shape
+    out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
+    out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1
+    oshape = (n, oc_chunk, out_height, out_width, oc_bn)
 
     # DOPAD
     DOPAD = (HPAD != 0 or WPAD != 0)
@@ -635,51 +360,43 @@ def _declaration_conv_NCHWc_impl(cfg, data, kernel, kernel_size, strides, paddin
     else:
         data_pad = data
 
-    # fetch schedule
-    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    if ic_bn != ic_block:
-        raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d."
-                           % (ic_bn, ic_block))
-    if oc_bn != oc_block:
-        raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d."
-                           % (oc_bn, oc_block))
-
-    # convolution
-    oshape = (n, oc//oc_bn, oh, ow, oc_bn)
-
-    ic = tvm.reduce_axis((0, ic), name='ic')
-    kh = tvm.reduce_axis((0, kernel_size[0]), name='kh')
-    kw = tvm.reduce_axis((0, kernel_size[1]), name='kw')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
 
-    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size,
-                                          strides, padding, layout,
-                                          out_layout, out_dtype),
-    attrs = {'workload': workload}
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+    if data.dtype == 'uint8':
+        assert out_dtype == "int32", \
+            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
+        # Intel performs dot product of 2 "4" Int8 values
+        # Current implementation requires ic_bn to be a multiple of 4
+        n_elems = 4
+        assert ic_bn % n_elems == 0
+
+        ic_outer = tvm.reduce_axis((0, wkl.in_filter//ic_bn), name='ic_outer')
+        ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner')
+        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+        return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                           tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
+                                            ic_f_inner * n_elems +  ic_s_inner]
+                                   .astype(out_dtype) *
+                                   kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner,
+                                          oc_block, ic_s_inner].astype(out_dtype),
+                                   axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
+                           name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8")
+    # else: fp implementation
+    return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                        tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw,
                                         ic%ic_bn].astype(out_dtype) *
                                kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block],
                                axis=[ic, kh, kw]),
-                       name='conv2d_NCHWc', tag="conv2d_NCHWc", attrs=attrs)
-    return conv
-
-
-@generic.schedule_conv2d_NCHWc.register("cpu")
-def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, padding,
-                          layout, out_layout, outs):
-    """x86 conv2d_NCHWc schedule"""
-    return _schedule_conv2d_NCHWc(None, num_filter, kernel_size, strides, padding,
-                                  layout, out_layout, outs)
+                       name='conv2d_NCHWc', tag="conv2d_NCHWc")
 
 
-def _schedule_conv2d_NCHWc(cfg, num_filter, kernel_size, strides, padding,
-                           layout, out_layout, outs):
+@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'cpu', ['direct'])
+def _schedule_conv2d_NCHWc(cfg, outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
-    dispatch_ctx = autotvm.task.DispatchContext.current
-    if not isinstance(dispatch_ctx, ApplyGraphBest):
-        layout = out_layout = "NCHW"
 
     def traverse(op):
         """Traverse operators from computation graph"""
@@ -702,34 +419,17 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \
-                (kernel_size, kernel_size)
-            is_kernel_1x1 = kh == 1 and kw == 1
-            n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
-            ic = ic_chunk * ic_block
-            original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype)
-
-            kh, kw = kernel_size
-            original_kernel = tvm.placeholder((num_filter, ic, kh, kw),
-                                              dtype=kernel.dtype)
+            args = [s, cfg, data_vec, conv_out, outs[0]]
             if data.dtype == 'uint8':
-                wkl = _get_workload_int8(original_data, original_kernel,
-                                         strides, padding, conv_out.dtype)
-                sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout)
-                args = [s, wkl, sch, data_vec, kernel, conv_out, outs[0]]
-                if is_kernel_1x1:
+                # int8 conv kernel is 7-dim
+                _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape)
+                if kh == 1 and kw == 1:
                     conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args)
                 else:
                     conv2d_avx_common._schedule_conv_NCHWc_int8(*args)
             else:
-                current_cfg = cfg
-                if current_cfg is None:
-                    workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides,
-                                                          padding, layout, out_layout,
-                                                          conv_out.dtype)
-                    current_cfg = _query_dispatcher(workload)
-                args = [s, current_cfg, data_vec, conv_out, outs[0]]
-                if is_kernel_1x1:
+                _, _, kh, kw, _, _, = get_const_tuple(kernel.shape)
+                if kh == 1 and kw == 1:
                     conv2d_avx_1x1._schedule_conv_NCHWc(*args)
                 else:
                     conv2d_avx_common._schedule_conv_NCHWc(*args)
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index 96affc7b9d23..ce70ec83828b 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -1,21 +1,15 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
 """1x1 Conv2D schedule on for Intel CPU"""
 from __future__ import absolute_import as _abs
-from collections import namedtuple
 import tvm
-from tvm.autotvm.task import ConfigEntity
-
-import topi
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
 from ..nn.util import infer_pad
-from ..nn.pad import pad
+from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
 
-AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])
-
-
-def _get_default_schedule(wkl, simd_width):
+def _fallback_schedule(cfg, wkl, simd_width):
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
@@ -37,45 +31,11 @@ def _get_default_schedule(wkl, simd_width):
         if out_width % ow_factor == 0:
             for oh_factor in range(out_height, 0, -1):
                 if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
-                    return AVXConv1x1Fwd(ic_bn, oc_bn, oh_factor, ow_factor)
-
-    raise ValueError("cannot decide default schedule for workload: {}".format(wkl))
-
-
-def _fallback_schedule(wkl, simd_width):
-    batch_size, in_channel, height, width, _ = wkl[1]
-    out_channel, _, hkernel, wkernel, _ = wkl[2]
-    HPAD, WPAD = wkl[4]
-    HSTR, WSTR = wkl[3]
-    out_height = (height + 2 * HPAD - hkernel) // HSTR + 1
-    out_width = (width + 2 * WPAD - wkernel) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if out_channel % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if in_channel % bn == 0:
-            ic_bn = bn
-            break
-
-    for ow_factor in range(out_width, 0, -1):
-        if out_width % ow_factor == 0:
-            for oh_factor in range(out_height, 0, -1):
-                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
-                    cfg_dict = {"i": -1,
-                                "c": None,
-                                "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]],
-                                      ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]],
-                                      ["tile_oh", "ot", oh_factor],
-                                      ["tile_ow", "sp", [out_width // ow_factor,
-                                                         ow_factor]],],
-                                "t": ""}
-                    return ConfigEntity.from_json_dict(cfg_dict)
-
+                    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+                    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
+                    cfg["tile_ow"] = SplitEntity([out_width // ow_factor, ow_factor])
+                    return
     raise ValueError("cannot decide default schedule for workload: {}".format(wkl))
 
 
@@ -148,8 +108,8 @@ def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, outpu
 
 def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
     # fetch schedule
-    ic_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oh"].val,
-                                   cfg["tile_ow"].size[-1])
+    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
 
     # schedule data
     A = data
@@ -201,57 +161,13 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
     return s
 
 
-def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
-    """ Declaration for int8 conv"""
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
-    else:
-        data_pad = data
-
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-
-    # Intel performs dot product of 2 "4" Int8 values
-    n_elems = 4
-    assert sch.ic_bn%n_elems == 0
-    ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
-    ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
-
-    # Reshaping kernel as the last 2 dimensions are 1x1 (k_h x k_w)
-    k_shape = kernel.shape
-    kernel = topi.reshape(kernel, (k_shape[0], k_shape[1], k_shape[2], k_shape[3],
-                                   k_shape[4] * k_shape[5] * k_shape[6]))
-
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR,
-                                        ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
-                               kernel[oc_chunk, ic_outer, ic_f_inner,
-                                      oc_block, ic_s_inner].astype(out_dtype),
-                               axis=[ic_outer, ic_f_inner, ic_s_inner]),
-                       name='conv2d_NCHWc_int8',
-                       tag="conv2d_NCHWc_int8")
-
-
-    return conv
-
-
-def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
+def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     """
     Defines the schedule for INT8 for intel machines
     Uses the Intel intrinsics to use INT8 operations
     More details - https://software.intel.com/en-us/articles/
     lower-numerical-precision-deep-learning-inference-and-training
     """
-
     target = tvm.target.current_target(allow_none=False)
     int32_lanes = -1
     if check_skylake(target):
@@ -260,6 +176,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
         return s
     assert int32_lanes != -1
 
+    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
+    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
+
     # schedule data
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
@@ -271,8 +191,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
     s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
     s[C].vectorize(oc_block)
 
@@ -282,17 +202,17 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
         s[C].parallel(parallel_axis)
 
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
+    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
 
     # Skylake and future processors have 16 vector lanes
-    assert sch.oc_bn % int32_lanes == 0
+    assert oc_bn % int32_lanes == 0
 
     oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
 
-    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
 
-    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner,
+    s[CC].reorder(oc_chunk, oh_outer, ow_outer, kh, kw, ic_outer, ic_f_inner, oh_inner,
                   ow_inner, oc_f_inner, oc_s_inner, ic_s_inner)
     s[CC].fuse(oc_chunk, oh_outer)
 
@@ -303,8 +223,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
-        ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+        oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+        ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
         s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
 
         parallel_axis = s[O].fuse(oc_chunk, oh_outer)
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index eaa3d15e64b0..e52722ed54a7 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -1,19 +1,15 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
 """Conv2D schedule on for Intel CPU"""
 from __future__ import absolute_import as _abs
-from collections import namedtuple
 import tvm
-from tvm.autotvm.task import ConfigEntity
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
 from ..nn.util import infer_pad
-from ..nn.pad import pad
+from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
 
-AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
-
-
-def _get_default_schedule(wkl, simd_width):
+def _fallback_schedule(cfg, wkl, simd_width):
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
@@ -36,42 +32,10 @@ def _get_default_schedule(wkl, simd_width):
             reg_n = n
             break
 
-    return AVXConvCommonFwd(ic_bn, oc_bn, reg_n, False)
-
-
-def _fallback_schedule(wkl, simd_width):
-    batch_size, in_channel, height, width, _ = wkl[1]
-    out_channel, _, hkernel, wkernel, _ = wkl[2]
-    HPAD, WPAD = wkl[4]
-    HSTR, WSTR = wkl[3]
-    out_width = (width + 2 * WPAD - wkernel) // WSTR + 1
-
-    oc_bn = 1
-    for bn in range(simd_width, 0, -1):
-        if out_channel % bn == 0:
-            oc_bn = bn
-            break
-
-    ic_bn = 1
-    for bn in range(oc_bn, 0, -1):
-        if in_channel % bn == 0:
-            ic_bn = bn
-            break
-
-    reg_n = 1
-    for n in range(31, 0, -1):
-        if out_width % n == 0:
-            reg_n = n
-            break
-
-    cfg_dict = {"i": -1,
-                "c": None,
-                "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]],
-                      ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]],
-                      ["tile_ow", "sp", [out_width // reg_n, reg_n]],
-                      ["unroll_kw", "ot", False]],
-                "t": ""}
-    return ConfigEntity.from_json_dict(cfg_dict)
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+    cfg["unroll_kw"] = OtherOptionEntity(False)
 
 
 def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
@@ -147,8 +111,8 @@ def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, outpu
 
 def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
     # fetch schedule
-    ic_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_ow"].size[-1],
-                               cfg["unroll_kw"].val)
+    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
 
     # schedule data
     A = data
@@ -197,52 +161,7 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
     return s
 
 
-def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
-    """
-    This function sets up the compute for INT8 conv 2d
-    Inputs are in INT8 datatype
-    Output is in INT32 datatype
-    """
-
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-
-    # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
-    else:
-        data_pad = data
-
-    # convolution
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-    kh = tvm.reduce_axis((0, wkl.hkernel), name='kh')
-    kw = tvm.reduce_axis((0, wkl.wkernel), name='kw')
-
-    # Intel performs dot product of 2 "4" Int8 values
-    # Current implementation requires ic_bn to be a multiple of 4
-    n_elems = 4
-    assert sch.ic_bn%n_elems == 0
-
-    ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
-    ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
-                                        ic_f_inner * n_elems +  ic_s_inner].astype(out_dtype) *
-                               kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner,
-                                      oc_block, ic_s_inner].astype(out_dtype),
-                               axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
-                       name='conv2d_NCHWc_int8',
-                       tag="conv2d_NCHWc_int8")
-    return conv
-
-def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
+def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     """
     Defines the schedule for INT8 for intel machines
     Uses the Intel intrinsics to use INT8 operations
@@ -263,6 +182,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
         return s
     assert int32_lanes != -1
 
+    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
+    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
+
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
         batch, ic_chunk, ih, iw, _ = s[A].op.axis
@@ -274,7 +197,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
     s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     parallel_axis = s[C].fuse(oc_chunk, oh)
     s[C].vectorize(oc_block)
@@ -285,14 +208,14 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
     kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
 
-    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
 
     # Skylake and future processors have 16 vector lanes
-    assert sch.oc_bn % int32_lanes == 0
+    assert oc_bn % int32_lanes == 0
 
     oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
 
-    if sch.unroll_kw:
+    if unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw,
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
         s[CC].unroll(kw)
@@ -308,7 +231,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
+        ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
         s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
         parallel_axis = s[O].fuse(oc_chunk, oh)
         s[C].compute_at(s[O], parallel_axis)
diff --git a/topi/recipe/conv/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py
index 863b3a6a41ab..593f913db15d 100644
--- a/topi/recipe/conv/test_conv_int8_intel.py
+++ b/topi/recipe/conv/test_conv_int8_intel.py
@@ -54,19 +54,11 @@ def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
     data_shape = (1, in_filter//NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
 
     if out_dtype == 'int32':
-        if k_h != 1:
-            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
-                            NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
-        else:
-            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES//4,
-                            NUM_VEC_LANES, 4, k_h, k_w)
+        kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                        NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
     elif out_dtype == 'float32':
-        if k_h != 1:
-            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
-                            NUM_VEC_LANES, NUM_VEC_LANES)
-        else:
-            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES,
-                            NUM_VEC_LANES, k_h, k_w)
+        kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                        NUM_VEC_LANES, NUM_VEC_LANES)
     out_height = (im_height + 2 * hpad - k_h) // hstride + 1
     out_width = (im_width + 2 * wpad - k_w) // wstride + 1
     o_shape = (1, out_filter//NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
@@ -103,8 +95,7 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f
 
 
     with tvm.target.create(TARGET_NAME):
-        conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter,
-                                    kernel_size=(k_h, k_w), stride=hstride,
+        conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride,
                                     padding=hpad, layout='NCHWc',
                                     out_layout='NCHWc', out_dtype=out_dtype)
         out = topi.nn.relu(conv)
@@ -114,13 +105,7 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f
         LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))
 
         # Generate and run the optimized schedule
-        sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter,
-                                                      kernel_size=(k_h, k_w),
-                                                      strides=hstride,
-                                                      padding=hpad,
-                                                      layout='NCHWc',
-                                                      out_layout='NCHWc',
-                                                      outs=[out])
+        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
         func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
         func(data_array, kernel_array, c_sch)
 
diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py
new file mode 100644
index 000000000000..38e6ad6d9e7c
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_NCHWc.py
@@ -0,0 +1,206 @@
+"""Test for NCHW[x]c convolution"""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend
+
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.transpose(data, (0, 2, 3, 1))
+    data = np.reshape(data, (batch_size, height, width, channel//bn, bn))
+    data = np.transpose(data, (0, 3, 1, 2, 4))
+    return data
+
+def _transform_kernel(kernel, ic_bn, oc_bn):
+    # OIHW -> OIHW[x]i[x]o
+    out_channel, in_channel, kh, kw = kernel.shape
+    kernel = np.transpose(kernel, (1, 2, 3, 0))
+    kernel = np.reshape(kernel, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
+    kernel = np.transpose(kernel, (1, 2, 3, 4, 0))
+    kernel = np.reshape(kernel, (kh, kw, out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn))
+    kernel = np.transpose(kernel, (2, 4, 0, 1, 5, 3))
+    return kernel
+
+def _transform_bias(bias, bn):
+    # [num_filter, 1, 1] -> [num_filter//bn, 1, 1, bn]
+    num_filter, h, w = bias.shape
+    bias = np.transpose(bias, (1, 2, 0))
+    bias = np.reshape(bias, (h, w, num_filter//bn, bn))
+    bias = np.transpose(bias, (2, 0, 1, 3))
+    return bias
+
+def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,
+                        padding, dilation=1, add_bias=False, add_relu=False, dtype="float32"):
+    assert dilation == 1, "conv2d_NCHWc does not support dilation for now."
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" %
+          (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+
+    in_height = in_width = in_size
+
+    # for testing functionality,
+    # we choose arbitrary block size that can divide the channel,
+    # regardless of the performance.
+    oc_block = 1
+    for bn in range(16, 0, -1):
+        if num_filter % bn == 0:
+            oc_block = bn
+            break
+
+    ic_block = 1
+    for bn in range(oc_block, 0, -1):
+        if in_channel % bn == 0:
+            ic_block = bn
+            break
+
+    A = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A')
+    W = tvm.placeholder((num_filter//oc_block, in_channel//ic_block, kernel, kernel, ic_block, oc_block), name='W')
+    bias = tvm.placeholder((num_filter//oc_block, 1, 1, oc_block), name='bias')
+
+    @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc")
+    def get_ref_data():
+        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype)
+        w_np = np.random.uniform(size=(num_filter, in_channel, kernel, kernel)).astype(dtype)
+        b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype)
+        c_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
+        if add_bias:
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+        return _transform_data(a_np, ic_block), _transform_kernel(w_np, ic_block, oc_block), \
+               _transform_bias(b_np, oc_block), _transform_data(c_np, oc_block)
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
+                                     layout='NCHW%dc'%ic_block,
+                                     out_layout="NCHW%dc"%oc_block,
+                                     out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_NCHWc([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device,
+                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
+                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device,
+                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
+                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    # test llvm only for now since conv2d_NCHWc implement is missing in other backend.
+    for device in ["llvm"]:
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
+
+
+if __name__ == "__main__":
+    # ResNet18 workloads
+    verify_conv2d_NCHWc(1,   3, 224,  64, 7, 2, 3)
+    verify_conv2d_NCHWc(1,  64,  56,  64, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  64,  56,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  64,  56, 128, 3, 2, 1)
+    verify_conv2d_NCHWc(1,  64,  56, 128, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 128,  28, 128, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 128,  28, 256, 3, 2, 1)
+    verify_conv2d_NCHWc(1, 128,  28, 256, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 256,  14, 256, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 256,  14, 512, 3, 2, 1)
+    verify_conv2d_NCHWc(1, 256,  14, 512, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 512,   7, 512, 3, 1, 1)
+
+    # bias, relu
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
+    # disable dilation test since it is not supported by NCHW[x]c conv for now.
+    # verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+    # batch size
+    verify_conv2d_NCHWc(4, 64, 56, 64, 3, 1, 1)
+    verify_conv2d_NCHWc(9, 64, 56, 64, 3, 1, 1)
+
+    # weird workloads
+    verify_conv2d_NCHWc(2, 2, 2, 2, 2, 2, 2)
+    verify_conv2d_NCHWc(3, 3, 3, 3, 3, 3, 3)
+    verify_conv2d_NCHWc(4, 4, 4, 4, 4, 4, 4)
+    verify_conv2d_NCHWc(5, 5, 5, 5, 5, 5, 5)
+    verify_conv2d_NCHWc(6, 6, 6, 6, 6, 6, 6)
+
+    # disable these tests due to some bugs of llvm with nvptx
+    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=2)
+    # verify_conv2d_NCHWc(2, 13, 71, 59, 3, 1, 1)
+
+    # inception v3 workloads
+    verify_conv2d_NCHWc(1,    3, 299,  32, 3, 2, 0)
+    verify_conv2d_NCHWc(1,   32, 149,  32, 3, 1, 0)
+    verify_conv2d_NCHWc(1,   32, 147,  64, 3, 1, 1)
+    verify_conv2d_NCHWc(1,   64,  73,  80, 1, 1, 0)
+    verify_conv2d_NCHWc(1,   80,  73, 192, 3, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,   48,  35,  64, 5, 1, 2)
+    verify_conv2d_NCHWc(1,   64,  35,  96, 3, 1, 1)
+    verify_conv2d_NCHWc(1,   96,  35,  96, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  192,  35,  32, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  256,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  256,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35, 384, 3, 2, 0)
+    verify_conv2d_NCHWc(1,   96,  35,  96, 3, 2, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 128, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  128,  17, 128, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  128,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  128,  17, 128, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  128,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 160, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  160,  17, 160, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  160,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  160,  17, 160, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  160,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  192,  17, 320, 3, 2, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_NCHWc(1, 1280,   8, 320, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 1280,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  384,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  384,   8, 384, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 1280,   8, 448, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  448,   8, 384, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 1280,   8, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 320, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 448, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 1024,  19,  84, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 2048,  10, 126, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  512,   5, 126, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  256,   3, 126, 3, 1, 1)
diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py
index efd1ee4e1a12..18f1117dc68a 100644
--- a/tutorials/autotvm/tune_nnvm_x86.py
+++ b/tutorials/autotvm/tune_nnvm_x86.py
@@ -14,7 +14,6 @@
 import tvm
 from tvm import autotvm
 from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
-from topi.x86.conv2d import conv_NCHWc_arg_to_workload
 import tvm.contrib.graph_runtime as runtime
 
 #################################################################
@@ -118,17 +117,9 @@ def tune_kernels(tasks,
         prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # converting conv2d tasks to conv2d_NCHWc tasks
-        # data, kernel are tuples of ("TENSOR", shape, dtype)
-        data, kernel, strides, padding, layout, dtype = tsk.args
-        kernel_size = (kernel[1][2], kernel[1][3])
-        data_plc = tvm.placeholder(data[1], name="data")
-        kernel_plc = tvm.placeholder(kernel[1], name="kernel")
-        args = [data_plc, kernel_plc, kernel[1][0], kernel_size, strides,
-                padding, layout, layout, dtype]
-        args = autotvm.task.nnvm_integration.serialize_args(args)
-        task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=args, target=target)
-        task.workload = conv_NCHWc_arg_to_workload(data_plc, kernel_plc, kernel_size,
-                                                   strides, padding, layout, layout, dtype)
+        task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=tsk.args,
+                                   target=target, template_key='direct')
+        task.workload = tsk.workload
 
         # create tuner
         if tuner == 'xgb' or tuner == 'xgb-rank':

From 4fb2d7e13ee4bdfe7ba5edf58cb3a98166ea9e03 Mon Sep 17 00:00:00 2001
From: kun-zh <32951065+kun-zh@users.noreply.github.com>
Date: Tue, 30 Oct 2018 06:39:38 +0800
Subject: [PATCH 305/529] [PASS] add a pass for the specific hardware
 accelarator when it is not binded (#1999)

---
 include/tvm/ir.h                              |  5 ++++
 include/tvm/ir_pass.h                         |  9 +++++++
 src/api/api_pass.cc                           |  1 +
 src/pass/detect_device.cc                     | 21 +++++++++++++++
 src/pass/split_host_device.cc                 |  3 ++-
 .../test_pass_decorate_device_scope.py        | 26 +++++++++++++++++++
 6 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 src/pass/detect_device.cc
 create mode 100644 tests/python/unittest/test_pass_decorate_device_scope.py

diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 14e60146567f..212234303c61 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -237,6 +237,11 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
  */
 constexpr const char* opengl_stage_scope = "opengl_stage_scope";
 
+/*!
+ * \brief Mark that it is in the device scope.
+ */
+constexpr const char* device_scope = "device_scope";
+
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 9403a2e6151b..332becb7aa38 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -326,6 +326,15 @@ Stmt RewriteUnsafeSelect(Stmt stmt);
  */
 Stmt LowerStorageAccessInfo(Stmt stmt);
 
+/*!
+ * \brief Decorate the stmt with a device scope, this is helpful for 
+ * hardware accelerator without thread blocks.
+ *
+ * \param stmt The stmt to be trasnformed
+ * \return Transformed stmt.
+ */
+Stmt DecorateDeviceScope(Stmt stmt);
+
 /*!
  * \brief Make an user callable API LoweredFunc.
  *
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index 1e571ca0dc41..575535f26e81 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -154,5 +154,6 @@ REGISTER_PASS1(LowerTVMBuiltin);
 REGISTER_PASS1(CombineContextCall);
 REGISTER_PASS2(VerifyMemory);
 REGISTER_PASS2(VerifyGPUCode);
+REGISTER_PASS1(DecorateDeviceScope);
 }  // namespace ir
 }  // namespace tvm
diff --git a/src/pass/detect_device.cc b/src/pass/detect_device.cc
new file mode 100644
index 000000000000..c5fb0dd1b8f3
--- /dev/null
+++ b/src/pass/detect_device.cc
@@ -0,0 +1,21 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file detect_device.cc
+ */
+
+#include <tvm/ir_pass.h>
+#include <tvm/ir_mutator.h>
+#include "../pass/ir_util.h"
+
+namespace tvm {
+namespace ir {
+Stmt DecorateDeviceScope(Stmt stmt) {
+  Stmt body = AttrStmt::make(make_zero(Int(32)),
+                             ir::attr::device_scope,
+                             0,
+                             stmt);
+  return body;
+}
+
+}  // namespace ir
+}  // namespace tvm
diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc
index 112c2c173df1..4cfbc7c90d8c 100644
--- a/src/pass/split_host_device.cc
+++ b/src/pass/split_host_device.cc
@@ -153,7 +153,8 @@ class HostDeviceSplitter : public IRMutator {
 
   Stmt Mutate_(const AttrStmt *op, const Stmt& s) final {
     if (op->attr_key == attr::thread_extent ||
-        op->attr_key == attr::pipeline_exec_scope) {
+        op->attr_key == attr::pipeline_exec_scope ||
+        op->attr_key == attr::device_scope) {
       return SplitDeviceFunc(s);
     }
     return IRMutator::Mutate_(op, s);
diff --git a/tests/python/unittest/test_pass_decorate_device_scope.py b/tests/python/unittest/test_pass_decorate_device_scope.py
new file mode 100644
index 000000000000..1d9eb899a642
--- /dev/null
+++ b/tests/python/unittest/test_pass_decorate_device_scope.py
@@ -0,0 +1,26 @@
+import tvm
+
+def test_decorate_device():
+    m = tvm.var('m')
+    l = tvm.var('l')
+    A = tvm.placeholder((m, l), name='A')
+
+    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
+    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
+
+    s = tvm.create_schedule(A2.op)
+    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
+    s[A1].compute_at(s[A2], xo)
+    s[A1].set_scope("shared")
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt1 = tvm.ir_pass.Simplify(stmt)
+    stmt2 = tvm.ir_pass.DecorateDeviceScope(stmt1)
+    assert isinstance(stmt2, tvm.stmt.AttrStmt)
+    assert stmt2.attr_key == "device_scope"
+    assert stmt1 == stmt2.body
+    
+if __name__ == "__main__":
+    test_decorate_device()
+

From 4823d55c146a0baac94cc379093f9ea7c8334653 Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Mon, 29 Oct 2018 16:33:49 -0700
Subject: [PATCH 306/529] [Frontend][MXNet] Change mxnet graph traversal from
 recursion to iteration (#2007)

---
 nnvm/python/nnvm/frontend/mxnet.py | 102 ++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 22 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index 87b169a1cfbc..d1c2f305c27d 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -381,6 +381,55 @@ def _as_list(arr):
         return arr
     return [arr]
 
+def _topo_sort(symbol):
+    """Sort all symbols in the mxnet graph in topological order.
+
+    Parameters
+    ----------
+    symbol : mxnet.sym.Symbol
+
+    Returns:
+    -------
+    list
+        List of mxnet symbol
+    """
+    queue = []
+    symbol_map = {}
+    deps = {}
+    dep_cnts = {}
+    for s in symbol:
+        symbol_map[s.attr('name')] = s
+        queue.append(s)
+    while queue:
+        sym = queue.pop(0)
+        name = sym.attr('name')
+        childs = sym.get_children()
+        if childs is None:
+            dep_cnts[name] = 0
+        else:
+            dep_cnts[name] = len(set([c.attr('name') for c in childs]))
+            for child in childs:
+                child_name = child.attr('name')
+                if child_name not in deps:
+                    deps[child_name] = set()
+                deps[child_name].add(name)
+                if child_name not in symbol_map:
+                    symbol_map[child_name] = child
+                    queue.append(child)
+    order = []
+    while dep_cnts:
+        remove = []
+        for name in dep_cnts:
+            if dep_cnts[name] == 0:
+                order.append(symbol_map[name])
+                remove.append(name)
+                if name in deps:
+                    for other in deps[name]:
+                        dep_cnts[other] -= 1
+        for name in remove:
+            del dep_cnts[name]
+    return order
+
 def _from_mxnet_impl(symbol, graph):
     """Convert mxnet symbol to nnvm implementation.
     Reconstruct a nnvm symbol by traversing the mxnet symbol.
@@ -398,28 +447,37 @@ def _from_mxnet_impl(symbol, graph):
     nnvm.sym.Symbol
         Converted symbol
     """
-    if len(symbol.list_outputs()) > 1:
-        return [_from_mxnet_impl(s, graph) for s in symbol]
-
-    name = symbol.attr('name')
-    output_index = json.loads(symbol.tojson())['heads'][0][1]
-    node = graph.get(name, None)
-    if node:
-        return node[output_index]
-    attr = symbol.list_attr()
-    op_name = symbol.attr('op_name')
-    childs = symbol.get_children()
-    if childs is not None:
-        childs = [_from_mxnet_impl(childs[i], graph) for i in range(len(childs.list_outputs()))]
-        childs = [x for y in childs for x in _as_list(y)]  # expand group symbol
-        node = _convert_symbol(op_name, childs, attr)
-    elif op_name != 'null':
-        node = _convert_symbol(op_name, [], attr)   # no input symbol
-    else:
-        op_name = json.loads(symbol.tojson())['nodes'][0]['op']
-        node = _sym.Variable(name=name, **attr)
-    graph[name] = node
-    return node[output_index]
+    def get_node(sym):
+        name = sym.attr('name')
+        if name not in graph:
+            return None
+        output_index = json.loads(sym.tojson())['heads'][0][1]
+        return graph[name][output_index]
+
+    assert symbol is not None
+    # Traverse all symbols in topological order
+    for sym in _topo_sort(symbol):
+        name = sym.attr('name')
+        attr = sym.list_attr()
+        op_name = sym.attr('op_name')
+        childs = sym.get_children()
+        if childs is not None:
+            childs = [get_node(child) for child in childs]
+            childs = [x for y in childs for x in _as_list(y)]
+            node = _convert_symbol(op_name, childs, attr)
+        elif op_name != 'null':
+            node = _convert_symbol(op_name, [], attr)
+        else:
+            node = _sym.Variable(name=name, **attr)
+        graph[name] = node
+    nodes = []
+    for sym in symbol:
+        node = get_node(sym)
+        assert node is not None
+        nodes.append(node)
+    if len(nodes) > 1:
+        return _sym.Group(nodes)
+    return nodes[0]
 
 def from_mxnet(symbol, arg_params=None, aux_params=None):
     """Convert from MXNet's model into compatible NNVM format.

From feca27e4c47ba6cc0b4b0f1c98a66246219964a9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 29 Oct 2018 19:43:37 -0700
Subject: [PATCH 307/529] [RELAY][PASS] FoldScaleAxis Backward (#2024)

---
 include/tvm/relay/expr_functor.h              |   6 +-
 python/tvm/relay/ir_pass.py                   |  29 ++
 src/relay/ir/expr_functor.cc                  |  12 +-
 src/relay/pass/fold_scale_axis.cc             | 455 +++++++++++++++++-
 src/relay/pass/pattern_util.h                 |  23 +-
 .../python/relay/test_pass_fold_scale_axis.py | 177 ++++++-
 6 files changed, 667 insertions(+), 35 deletions(-)

diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index bf4025f79224..85a6b502d845 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -135,9 +135,9 @@ class ExprVisitor
   void VisitExpr_(const TupleGetItemNode* op) override;
   virtual void VisitType(const Type& t);
 
- private:
-  // internal visited flag.
-  std::unordered_set<const Node*> visited_;
+ protected:
+  // Internal visiting counter
+  std::unordered_map<const Node*, size_t> visit_counter_;
 };
 
 /*!
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 6adfaacdc86d..82afa83ee376 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -31,6 +31,29 @@ def infer_type(expr, env=None):
     return _ir_pass.infer_type(expr, env)
 
 
+def backward_fold_scale_axis(expr):
+    """Backward fold axis scaling into weights of conv2d/dense.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression, we expect that expr's types
+        should be fully inferred by infer_type.
+
+    Returns
+    -------
+    folded_expr : tvm.relay.Expr
+        The folded expression after transformation.
+
+    Note
+    ----
+    It is recommended to call backward_fold_scale_axis
+    before using forward_fold_scale_axis.
+    As backward folding targets common conv-bn pattern.
+    """
+    return _ir_pass.backward_fold_scale_axis(expr)
+
+
 def forward_fold_scale_axis(expr):
     """Fold the scaling of axis into weights of conv2d/dense.
 
@@ -44,6 +67,12 @@ def forward_fold_scale_axis(expr):
     -------
     folded_expr : tvm.relay.Expr
         The folded expression after transformation.
+
+    Note
+    ----
+    It is recommended to call backward_fold_scale_axis
+    before using forward_fold_scale_axis.
+    As backward folding targets common conv-bn pattern.
     """
     return _ir_pass.forward_fold_scale_axis(expr)
 
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index b7a752d43a5c..ed7c1d1d1e5a 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -160,10 +160,14 @@ Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) {
 Type ExprMutator::VisitType(const Type& t) { return t; }
 
 void ExprVisitor::VisitExpr(const Expr& expr) {
-  if (visited_.count(expr.get())) return;
-  using TParent = ExprFunctor<void(const Expr&)>;
-  TParent::VisitExpr(expr);
-  visited_.insert(expr.get());
+  auto it = visit_counter_.find(expr.get());
+  if (it != visit_counter_.end()) {
+    ++it->second;
+  } else {
+    using TParent = ExprFunctor<void(const Expr&)>;
+    TParent::VisitExpr(expr);
+    visit_counter_.insert({expr.get(), 1});
+  }
 }
 
 void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index b1c767704372..e757118f33f2 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -24,9 +24,9 @@ namespace fold_scale_axis {
 using runtime::TypedPackedFunc;
 
 
-// FoldScaleAxisFoward algorithm:
+// FoldScaleAxis algorithm:
 //
-// The general idea is that we transform Expr to tuple of
+// The general idea is to transform Expr to tuple of
 // (value, axes, scale), where the final result satiesfies:
 //
 // result = value
@@ -41,9 +41,14 @@ using runtime::TypedPackedFunc;
 // we run a backward "preparation phase", which propagates the demand
 // of the potential axes scaling back to its input.
 //
-// The folding process is done in two steps:
+// Forward folding process is done in two steps:
 // - Prepare phase: backward propagation of demand.
 // - Transform phase: forward transformation,
+//
+// Similarly, backward folding process is done in two steps:
+// - Prepare phase: forward propagation of demand.
+// - Transform phase: transformation by push down the axes scale signal to inputs.
+//
 
 /*!
  * \brief sorted array axis, can also be nullptr.
@@ -99,7 +104,7 @@ ValueType GetFunc(const OpMap<ValueType>& op_map,
 }
 
 /*!
- * \brief Preparation function for for pass scale forward.
+ * \brief Preparation function for pass scale forward.
  * \param call The call node.
  * \param out_scale_axes Possible scaling on axes of the output.
  * \return The result scaling on axes of the input.
@@ -144,7 +149,7 @@ using FForwardTransform = TypedPackedFunc<
 //----------------------------------------------
 // Generic Visitors for FScaleAxisForward
 //----------------------------------------------
-class FScaleAxisForwardPrep : private ExprVisitor {
+class ForwardPrep : private ExprVisitor {
  public:
   std::unordered_map<const Node*, AxesSet>
   Prepare(const Expr& body) {
@@ -255,12 +260,12 @@ class FScaleAxisForwardPrep : private ExprVisitor {
   }
 };
 
-class FScaleAxisForwardTransform : private ExprMutator {
+class ForwardTransformer : private ExprMutator {
  public:
   // Transform expression.
-  Expr Transform(Expr expr) {
+  Expr Fold(Expr expr) {
     expected_scale_axes_ =
-        FScaleAxisForwardPrep().Prepare(expr);
+        ForwardPrep().Prepare(expr);
     return this->Mutate(expr);
   }
 
@@ -346,13 +351,13 @@ Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
 }
 
 STuple ReluForwardTransform(const Call& ref_call,
-                              const AxesSet& expected_axes,
-                              const Array<STuple>& sargs) {
+                            const AxesSet& expected_axes,
+                            const Array<STuple>& sargs) {
   if (!sargs[0]->axes.defined()) return STuple();
   // return transformed conv2d
   auto rnode = make_node<STupleNode>();
   rnode->value = CallNode::make(
-      ref_call->op, {sargs[0]->value}, ref_call->attrs, {});
+      ref_call->op, {sargs[0]->value}, ref_call->attrs, ref_call->type_args);
   rnode->scale = sargs[0]->scale;
   rnode->axes = sargs[0]->axes;
   return STuple(rnode);
@@ -474,8 +479,6 @@ Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
   Layout weight_layout(param->weight_layout);
   int c_big_axis = data_layout.indexof('C');
   int c_small_axis = data_layout.indexof('c');
-  const auto* tdata = call->args[0]->type_as<TensorTypeNode>();
-  CHECK(tdata) << "require checked type";
 
   CHECK_GE(c_big_axis, 0);
   AxesSet data_axes = NullValue<AxesSet>();
@@ -486,8 +489,7 @@ Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
   //
   // only handle depthwise or full conv2d.
   // TODO(tvm-team) handle grouped conv by reshape + bcast
-  bool is_depthwise_conv2d =
-      is_const_int(tdata->shape[c_big_axis], param->groups);
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
   if (weight_layout.indexof('i') < 0 &&
       c_small_axis < 0 &&
       (param->groups == 1 || is_depthwise_conv2d)) {
@@ -515,18 +517,24 @@ STuple Conv2DForwardTransform(const Call& ref_call,
   CHECK_EQ(weight_layout.indexof('i'), -1);
   CHECK(sdata->axes.size() == 1 &&
         c_big_axis == sdata->axes[0]->value);
+  int big_oc_axis = weight_layout.indexof('O');
   int big_ic_axis = weight_layout.indexof('I');
 
-  const auto* tdata = ref_call->args[0]->type_as<TensorTypeNode>();
   // Check it must be depthwise or full conv2d.
-  bool is_depthwise_conv2d =
-      is_const_int(tdata->shape[c_big_axis], param->groups);
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, weight_layout);
   CHECK(param->groups == 1 || is_depthwise_conv2d);
+  Expr weight = sargs[1]->value;
 
   // match the ic_axis
-  Expr scale = ExpandBiasToMatchAxis(
-      sdata->scale, weight_layout.ndim(), {big_ic_axis});
-  Expr weight = Multiply(sargs[1]->value, scale);
+  if (is_depthwise_conv2d) {
+    Expr scale = ExpandBiasToMatchAxis(
+        sdata->scale, weight_layout.ndim(), {big_oc_axis});
+    weight = Multiply(weight, scale);
+  } else {
+    Expr scale = ExpandBiasToMatchAxis(
+        sdata->scale, weight_layout.ndim(), {big_ic_axis});
+    weight = Multiply(weight, scale);
+  }
   // return transformed conv2d
   auto rnode = make_node<STupleNode>();
   rnode->value = CallNode::make(
@@ -542,13 +550,416 @@ RELAY_REGISTER_OP("nn.conv2d")
 
 
 Expr ForwardFoldScaleAxis(Expr data) {
-  return FScaleAxisForwardTransform().Transform(data);
+  return ForwardTransformer().Fold(data);
 }
 
 // Expose the FoldScaleAxisFoward
 TVM_REGISTER_API("relay._ir_pass.forward_fold_scale_axis")
 .set_body_typed<Expr(Expr)>(ForwardFoldScaleAxis);
 
+//----------------------------------------
+// Implement backward transformations.
+//----------------------------------------
+class BackwardTransformer;
+
+/*!
+ * \brief Preparation function for for pass scale backward.
+ * \param call The call node.
+ * \param in_scale_axes Allowed input scaling.
+ * \return The result scaling on axes of the input.
+ */
+using FBackwardPrep = TypedPackedFunc<
+  AxesSet(const Call& call, const Array<AxesSet>& in_scale_axes)>;
+
+using FBackwardTransform = TypedPackedFunc<
+  Expr(const Call& call,
+       const AxesSet& axes,
+       const Expr& scale,
+       const BackwardTransformer& transformer)>;
+
+//----------------------------------------------
+// Generic Visitors for FScaleAxisBackward
+//----------------------------------------------
+/*!
+ * \brief Get reference counter of each internal ExprNode in body.
+ * \param body The body expression.
+ * \return The reference count mapping.
+ */
+std::unordered_map<const Node*, size_t>
+GetExprRefCount(const Expr& body) {
+  class ExprRefCounter : private ExprVisitor {
+   public:
+    std::unordered_map<const Node*, size_t>
+    Get(const Expr& body) {
+      this->VisitExpr(body);
+      return std::move(this->visit_counter_);
+    }
+  };
+  return ExprRefCounter().Get(body);
+}
+
+class BackwardPrep : private ExprVisitor {
+ public:
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet>
+  Prepare(const Expr& body) {
+    ref_counter_ = GetExprRefCount(body);
+    this->VisitExpr(body);
+    return std::move(message_);
+  }
+
+ private:
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet> message_;
+  // reference counter of an internal expr
+  std::unordered_map<const Node*, size_t> ref_counter_;
+  // Visit the expression.
+  void VisitExpr_(const CallNode* call) {
+    ExprVisitor::VisitExpr_(call);
+    static const auto& fprep =
+        Op::GetAttr<FBackwardPrep>("FScaleAxisBackwardPrep");
+    auto f = GetFunc(fprep, call->op);
+    if (f == nullptr) return;
+    auto rit = ref_counter_.find(call);
+    CHECK(rit != ref_counter_.end());
+    // We only allow propagation of scale backward
+    // if the expression is only referred by a single parent.
+    if (rit->second != 1) return;
+    Array<AxesSet> in_axes;
+    for (Expr arg : call->args) {
+      auto it = message_.find(arg.get());
+      if (it != message_.end()) {
+        in_axes.push_back(it->second);
+      } else {
+        in_axes.push_back(NullValue<AxesSet>());
+      }
+    }
+    AxesSet out_axes = f(GetRef<Call>(call), in_axes);
+    if (out_axes.defined()) {
+      message_[call] = out_axes;
+    }
+  }
+};
+
+class BackwardTransformerNode :
+      public Node,
+      private ExprMutator {
+ public:
+  // Run forward transform.
+  Expr Fold(Expr expr) {
+    expected_scale_axes_ = BackwardPrep().Prepare(expr);
+    return this->Mutate(expr);
+  }
+  /*!
+   * \brief Transform the expr to consider the scaling.
+   *
+   * \param expr The input expression.
+   * \param axes The axes to scale.
+   * \param scale The scale applied to the axes.
+   * \return The result of transformation.
+   */
+  Expr Transform(const Expr& expr, AxesSet axes, Expr scale) {
+    // NOTE: the result of Transform is not memoized.
+    // However, in the current rule, Transform will
+    // only be called to expr that is referred once.
+    if (const CallNode* call_node = expr.as<CallNode>()) {
+      return Transform(call_node, axes, scale);
+    } else {
+      CHECK(!axes.defined()) << "outstanding scale";
+      return ExprMutator::VisitExpr(expr);
+    }
+  }
+  /*!
+   * \brief Normal way of mutating call node.
+   * \param call_node The call node to be mutated.
+   * \return the result of the call Mutation.
+   */
+  Expr NormalCallTransform(const CallNode* call_node) {
+    return ExprMutator::VisitExpr_(call_node);
+  }
+  /*!
+   * \brief Get the expected axes on expr.
+   * \param expr The expresison.
+   * \return The expected axes.
+   */
+  AxesSet GetExpectedAxes(const Expr& expr) const {
+    auto it = expected_scale_axes_.find(expr.get());
+    if (it != expected_scale_axes_.end()) return it->second;
+    return NullValue<AxesSet>();
+  }
+
+  // solver is not serializable.
+  void VisitAttrs(tvm::AttrVisitor* v) final {}
+
+  static constexpr const char* _type_key = "relay.fold_scale_axis.FBackwardTransformer";
+  TVM_DECLARE_NODE_TYPE_INFO(BackwardTransformerNode, Node);
+
+ private:
+  // Valid axes on each node.
+  std::unordered_map<const Node*, AxesSet> expected_scale_axes_;
+  // Override mutation of call.
+  Expr VisitExpr_(const CallNode* call_node) final {
+    return Transform(call_node, NullValue<AxesSet>(), NullValue<Expr>());
+  }
+  // Transform of CallNode.
+  Expr Transform(const CallNode* call_node, AxesSet axes, Expr scale);
+};
+
+class BackwardTransformer : public NodeRef {
+ public:
+  BackwardTransformer() {}
+  explicit BackwardTransformer(
+      ::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {
+  }
+  BackwardTransformerNode* operator->() const {
+    return static_cast<BackwardTransformerNode*>(node_.get());
+  }
+  using ContainerType = BackwardTransformerNode;
+};
+
+Expr BackwardTransformerNode::Transform(
+    const CallNode* call_node, AxesSet axes, Expr scale) {
+  static const auto& ftransform =
+      Op::GetAttr<FBackwardTransform>("FScaleAxisBackwardTransform");
+  auto f = GetFunc(ftransform, call_node->op);
+  if (f != nullptr) {
+    return f(GetRef<Call>(call_node),
+             axes,
+             scale,
+             GetRef<BackwardTransformer>(this));
+  } else {
+    CHECK(!axes.defined()) << "outstanding scale";
+    return NormalCallTransform(call_node);
+  }
+}
+
+
+//----------------------------------------------
+// Per operator defs for FScaleAxisForward
+//----------------------------------------------
+
+// Intermediate operators
+AxesSet ReluBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  return in_axes[0];
+}
+
+Expr ReluBackwardTransform(const Call& call,
+                           const AxesSet& axes,
+                           const Expr& scale,
+                           const BackwardTransformer& transformer) {
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  Expr input = transformer->Transform(
+      call->args[0], axes, scale);
+  return CallNode::make(call->op, {input}, call->attrs, call->type_args);
+}
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", ReluBackwardPrep);
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", ReluBackwardTransform);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", ReluBackwardPrep);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", ReluBackwardTransform);
+
+// AddSub
+AxesSet AddSubBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  AttrsEqual equal;
+  if (in_axes[0].defined() &&
+      MatchBroadcastToLeftAxes(tlhs, trhs, in_axes[0])) {
+    return in_axes[0];
+  } else if (in_axes[1].defined() &&
+             MatchBroadcastToLeftAxes(trhs, tlhs, in_axes[1])) {
+    return in_axes[1];
+  } else if (in_axes[0].defined() &&
+             in_axes[1].defined() &&
+             equal(in_axes[0], in_axes[1]) &&
+             equal(tlhs->shape, trhs->shape)) {
+    // add of two elements.
+    return in_axes[0];
+  } else {
+    return NullValue<AxesSet>();
+  }
+}
+
+Expr AddSubBackwardTransform(const Call& call,
+                             const AxesSet& axes,
+                             const Expr& scale,
+                             const BackwardTransformer& transformer) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  AxesSet lhs_axes = transformer->GetExpectedAxes(call->args[0]);
+  AxesSet rhs_axes = transformer->GetExpectedAxes(call->args[1]);
+  AttrsEqual equal;
+
+  if (lhs_axes.defined() && rhs_axes.defined()) {
+    CHECK(equal(lhs_axes, rhs_axes));
+    CHECK(equal(axes, lhs_axes));
+    Expr lhs = transformer->Transform(call->args[0], axes, scale);
+    Expr rhs = transformer->Transform(call->args[1], axes, scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else if (lhs_axes.defined()) {
+    CHECK(equal(axes, lhs_axes));
+    Expr lhs = transformer->Transform(call->args[0], axes, scale);
+    Expr rhs = transformer->Transform(
+        call->args[1], NullValue<AxesSet>(), NullValue<Expr>());
+    Expr rhs_scale = ExpandBiasToMatchAxis(
+        scale, tlhs->shape.size(), axes);
+    rhs = Multiply(rhs, rhs_scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else if (rhs_axes.defined()) {
+    CHECK(equal(axes, rhs_axes));
+    Expr lhs = transformer->Transform(
+        call->args[0], NullValue<AxesSet>(), NullValue<Expr>());
+    Expr rhs = transformer->Transform(call->args[1], axes, scale);
+    Expr lhs_scale = ExpandBiasToMatchAxis(
+        scale, trhs->shape.size(), axes);
+    lhs = Multiply(lhs, lhs_scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else {
+    LOG(FATAL) << "outstanding scale";
+    return Expr();
+  }
+}
+
+RELAY_REGISTER_OP("add")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", AddSubBackwardPrep);
+
+RELAY_REGISTER_OP("add")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", AddSubBackwardPrep);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
+
+// Producer operators
+// Multiply produces the scale-axis pair.
+Expr MultiplyBackwardTransform(const Call& call,
+                               const AxesSet& axes,
+                               const Expr& scale,
+                               const BackwardTransformer& transformer) {
+  CHECK(!axes.defined()) << "outstanding scale";
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  AxesSet lhs_axes = transformer->GetExpectedAxes(call->args[0]);
+  AxesSet rhs_axes = transformer->GetExpectedAxes(call->args[1]);
+  if (lhs_axes.defined()) {
+    // NOTE we won't recursively call mutating on scale part.
+    // since there  won't be scale chance within scale part.
+    Expr rhs = call->args[1];
+    if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs)) {
+      return transformer->Transform(call->args[0], lhs_axes, rhs);
+    }
+  } else if (rhs_axes.defined()) {
+    Expr lhs = call->args[0];
+    if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs)) {
+      return transformer->Transform(call->args[1], rhs_axes, lhs);
+    }
+  }
+  return transformer->NormalCallTransform(call.operator->());
+}
+
+RELAY_REGISTER_OP("multiply")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", MultiplyBackwardTransform);
+
+// Consumer operators
+// Conv2D send out requirement of axis folding.
+AxesSet Conv2DBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) {
+    out_layout = Layout(param->data_layout);
+  }
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = out_layout.indexof('C');
+  int c_small_axis = out_layout.indexof('c');
+
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // More general layout can be supported under the current framework.
+  // By using a unified layout transformation.
+  // We only need to change the Prep and Mutate function.
+  //
+  // only handle depthwise or full conv2d.
+  // TODO(tvm-team) handle grouped conv by reshape + bcast
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
+  if (weight_layout.indexof('o') < 0 &&
+      weight_layout.indexof('i') < 0 &&
+      c_small_axis < 0 &&
+      (param->groups == 1 || is_depthwise_conv2d)) {
+    return {c_big_axis};
+  } else {
+    return NullValue<AxesSet>();
+  }
+}
+
+// Conv2D consumes the scale axis during transformation.
+Expr Conv2DBackwardTransform(const Call& call,
+                             const AxesSet& axes,
+                             const Expr& scale,
+                             const BackwardTransformer& transformer) {
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) {
+    out_layout = Layout(param->data_layout);
+  }
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = out_layout.indexof('C');
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // TODO(tvm-team) support general data layout
+  CHECK_EQ(weight_layout.indexof('o'), -1);
+  CHECK_EQ(weight_layout.indexof('i'), -1);
+  CHECK(axes.size() == 1 &&
+        c_big_axis == axes[0]->value);
+
+  int big_oc_axis = weight_layout.indexof('O');
+  // Check it must be depthwise or full conv2d.
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
+  CHECK(param->groups == 1 || is_depthwise_conv2d);
+
+  Expr data = transformer->Transform(
+      call->args[0], NullValue<AxesSet>(), NullValue<Expr>());
+  Expr weight = transformer->Transform(
+      call->args[1], NullValue<AxesSet>(), NullValue<Expr>());
+  // scale on input for deptwise.
+  Expr wscale = ExpandBiasToMatchAxis(
+      scale, weight_layout.ndim(), {big_oc_axis});
+  weight = Multiply(weight, wscale);
+  return CallNode::make(
+      call->op, {data, weight}, call->attrs, call->type_args);
+}
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", Conv2DBackwardPrep);
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", Conv2DBackwardTransform);
+
+Expr BackwardFoldScaleAxis(Expr data) {
+  return make_node<BackwardTransformerNode>()->Fold(data);
+}
+
+TVM_REGISTER_API("relay._ir_pass.backward_fold_scale_axis")
+.set_body_typed<Expr(Expr)>(BackwardFoldScaleAxis);
+
 }  // namespace fold_scale_axis
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index a395e74cdf0b..a41e6c35b93a 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -11,6 +11,7 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/attrs/transform.h>
+#include "../op/nn/layout.h"
 
 namespace tvm {
 namespace relay {
@@ -100,11 +101,31 @@ inline Expr ExpandBiasToMatchAxis(Expr bias,
   return bias;
 }
 
+/*!
+ * \brief Check if the call is depthwise conv2d.
+ *
+ * \param call The conv2d call.
+ * \param param The conv2d attributes.
+ * \return Whether it is depthwise_conv2d.
+ */
+inline bool IsDepthwiseConv2D(const Call& call,
+                              const Conv2DAttrs* param,
+                              const Layout& weight_layout) {
+  static const Layout kOIHW("OIHW");
+  auto wshape = ConvertLayout(
+      call->args[1]->type_as<TensorTypeNode>()->shape,
+      weight_layout, kOIHW);
+  return is_const_int(wshape[0], param->groups) &&
+      is_const_int(wshape[1], 1);
+}
+
+
 inline Expr Multiply(Expr lhs, Expr rhs) {
   static const Op& op = Op::Get("multiply");
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
 }
 
+
 inline Expr Divide(Expr lhs, Expr rhs) {
   static const Op& op = Op::Get("divide");
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
@@ -116,8 +137,6 @@ inline Expr ReshapeLike(Expr lhs, Expr rhs) {
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
 }
 
-
-
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index 7ce3b35efe46..1b57bdce0e0c 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -62,14 +62,14 @@ def before(x, conv_weight, in_bias, in_scale, channels):
                              channels=channels,
                              kernel_size=(3, 3),
                              data_layout="NHWC",
-                             weight_layout="HWOI",
+                             weight_layout="HWIO",
                              groups=channels,
                              padding=(1, 1))
         y2 = relay.nn.conv2d(x, conv_weight,
                              channels=channels,
                              kernel_size=(3, 3),
                              data_layout="NHWC",
-                             weight_layout="HWOI",
+                             weight_layout="HWIO",
                              groups=channels,
                              padding=(1, 1))
         z = relay.add(y1, y2)
@@ -85,7 +85,7 @@ def expected(x, conv_weight, in_bias, in_scale, channels):
                              channels=channels,
                              kernel_size=(3, 3),
                              data_layout="NHWC",
-                             weight_layout="HWOI",
+                             weight_layout="HWIO",
                              groups=channels,
                              padding=(1, 1))
         y2 = relay.nn.conv2d(x,
@@ -93,7 +93,7 @@ def expected(x, conv_weight, in_bias, in_scale, channels):
                              channels=channels,
                              kernel_size=(3, 3),
                              data_layout="NHWC",
-                             weight_layout="HWOI",
+                             weight_layout="HWIO",
                              groups=channels,
                              padding=(1, 1))
         z = relay.add(y1, y2)
@@ -147,7 +147,176 @@ def check(shape, channels):
     check((2, 11, 10, 4), 4)
 
 
+def test_fold_bwd_simple():
+    """Simple testcase."""
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.add(y, out_bias)
+        y = relay.nn.relu(y)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+        conv_weight = relay.multiply(
+            conv_weight , relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        out_bias = relay.multiply(out_bias,
+                                  relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        y = relay.add(y, out_bias)
+        y = relay.nn.relu(y)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.var("out_scale", shape=(channels,))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 8)
+
+
+def test_fold_bwd_dual_path():
+    """Dual path testcase."""
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+        def fold_conv_weight():
+            return  relay.multiply(
+                conv_weight ,
+                relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+        y1 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.var("out_scale", shape=(channels,))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 8)
+
+
+def test_fold_bwd_fail():
+    """Dual path testcase."""
+    def fail1(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1),
+                             out_layout="CNHW")
+        # fold will fail because the axis from two path
+        # differs from each other.
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def fail2(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.nn.relu(y1)
+        # fold will fail because y1 is referred also by y2
+        y1 = relay.multiply(y1, out_scale)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+
+    def check(shape, channels, fbefore):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.var("out_scale", shape=(channels,))
+        y1 = fbefore(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1)
+
+    check((4, 4, 10, 10), 4, fail1)
+    check((4, 4, 10, 10), 4, fail2)
+
+
 if __name__ == "__main__":
     test_fold_fwd_simple()
     test_fold_fwd_dual_path()
     test_fold_fwd_fail()
+    test_fold_bwd_simple()
+    test_fold_bwd_dual_path()
+    test_fold_bwd_fail()

From ea746687d9c916bf3d2093de627c5c58130df4ab Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 29 Oct 2018 22:44:48 -0400
Subject: [PATCH 308/529] Conditional Loop Partitioning - Extending to remove
 if conditions (#1797)

---
 src/pass/loop_partition.cc                    |  47 ++++--
 .../unittest/test_pass_loop_partition.py      | 158 ++++++++++++++++++
 2 files changed, 191 insertions(+), 14 deletions(-)

diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc
index 0de8a88edb00..95ce130785d7 100644
--- a/src/pass/loop_partition.cc
+++ b/src/pass/loop_partition.cc
@@ -239,11 +239,16 @@ class ThreadPartitionInserter : public IRMutator {
 // Try to do partition at the candidate IRs
 class LoopPartitioner : public IRMutator {
  public:
-  explicit LoopPartitioner(std::unordered_set<const Node*> candidates)
-    : candidates_(candidates) {}
+  explicit LoopPartitioner(bool split_const_loop)
+      : selector(CandidateSelector(split_const_loop)) {}
+
+  Stmt VisitAndMutate(const Stmt& stmt) {
+    selector.Visit(stmt);
+    return Mutate(stmt);
+  }
 
   Stmt Mutate_(const For* op, const Stmt& stmt) {
-    if (candidates_.count(op)) {
+    if (selector.candidates.count(op)) {
       Stmt s = TryPartition(op, stmt, op->loop_var,
           op->min, op->min + op->extent - 1, op->body, false);
       if (s.defined()) return s;
@@ -266,7 +271,7 @@ class LoopPartitioner : public IRMutator {
     const IterVarNode *iv = op->node.as<IterVarNode>();
     CHECK(iv);
     Var var = iv->var;
-    if (candidates_.count(op)) {
+    if (selector.candidates.count(op)) {
       Stmt s = TryPartition(op, stmt, var, 0, op->value - 1, op->body, true);
       if (s.defined()) return s;
     }
@@ -295,9 +300,9 @@ class LoopPartitioner : public IRMutator {
   inline Stmt MakeFor(const Node* op, Expr extent, Stmt body);
 
   /* Candidate IRs that may be partitioned potentially */
-  std::unordered_set<const Node*> candidates_;
   std::unordered_map<const Variable*, IntSet> hint_map_;
   std::unordered_map<const Variable*, IntSet> relax_map_;
+  CandidateSelector selector;
 };
 
 Stmt LoopPartitioner::TryPartition(const Node* node,
@@ -322,7 +327,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
   Expr body_begin;
   Stmt pre_stmt;
   if (true_itrv.as<arith::IntervalSet>()->i.has_lower_bound()) {
-    body_begin = true_itrv.min();
+    body_begin = ir::Simplify(true_itrv.min());
     if (!can_prove(body_begin == min)) {
       Expr cond = (body_begin - min >= 0);
       if (!can_prove(cond)) {
@@ -343,7 +348,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
   Expr post_doubt_begin;
   Stmt post_stmt;
   if (true_itrv.as<arith::IntervalSet>()->i.has_upper_bound()) {
-    post_doubt_begin = true_itrv.max() + 1;
+    post_doubt_begin = ir::Simplify(true_itrv.max() + 1);
     if (!can_prove(true_itrv.max() == max)) {
       // require the extent to be non-negative
       Expr cond = (max - post_doubt_begin + 1 >= 0);
@@ -354,8 +359,17 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
       }
       // [post_doubt_begin, max]
       if (!partition_thread_scope) {
-        Stmt post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}});
-        post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body);
+        Stmt post_body;
+        // If the loop is going from 0 to 1, replace the loop var with min value
+        if (as_const_int(max) && as_const_int(post_doubt_begin)) {
+            if (*as_const_int(max) == *as_const_int(post_doubt_begin)) {
+                post_body = Substitute(body, {{Var{var}, post_doubt_begin}});
+                post_stmt = post_body;
+            }
+        } else {
+            post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}});
+            post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body);
+        }
       }
     }
   } else {
@@ -368,8 +382,15 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
     Stmt simplified_body = ConditionEliminator(partitions).Mutate(body);
     Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}});
     s = MakeFor(node, post_doubt_begin - body_begin, new_body);
-    if (pre_stmt.defined())  s = Block::make(pre_stmt, s);
-    if (post_stmt.defined()) s = Block::make(s, post_stmt);
+
+    if (!(pre_stmt.defined() && post_stmt.defined())) s = VisitAndMutate(s);
+    if (pre_stmt.defined()) s = Block::make(pre_stmt, s);
+    if (post_stmt.defined()) {
+      if (as_const_int(max) && as_const_int(post_doubt_begin)) {
+        post_stmt = VisitAndMutate(post_stmt);
+      }
+      s = Block::make(s, post_stmt);
+    }
   } else {
     Expr cond = const_true();
     if (!can_prove(body_begin == min)) cond = cond && (var >= body_begin);
@@ -402,9 +423,7 @@ class RemoveLikelyTags : public IRMutator {
 };
 
 Stmt LoopPartition(Stmt stmt, bool split_const_loop) {
-  CandidateSelector selector(split_const_loop);
-  selector.Visit(stmt);
-  stmt = LoopPartitioner(selector.candidates).Mutate(stmt);
+  stmt = LoopPartitioner(split_const_loop).VisitAndMutate(stmt);
   stmt = RemoveLikelyTags().Mutate(stmt);
   return stmt;
 }
diff --git a/tests/python/unittest/test_pass_loop_partition.py b/tests/python/unittest/test_pass_loop_partition.py
index a1025e1f662c..85860ce824d0 100644
--- a/tests/python/unittest/test_pass_loop_partition.py
+++ b/tests/python/unittest/test_pass_loop_partition.py
@@ -177,6 +177,157 @@ def test_everything_during_deduction():
     stmt = tvm.ir_pass.Simplify(stmt)
     assert(isinstance(stmt.body.body, tvm.stmt.IfThenElse))
 
+def test_single_likely():
+    n = 60
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    x = T.op.axis[0]
+    xo, xi = s[T].split(x, factor=16)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_multi_likely():
+    n = 94
+    m = 62
+    A = tvm.placeholder((n, m), name='A')
+    B = tvm.placeholder((n, m), name='B')
+
+    T = tvm.compute((n, m), lambda i, j: A[i, j]+B[i, j])
+    s = tvm.create_schedule(T.op)
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    x, y = T.op.axis
+    xo, xi = s[T].split(x, factor=16)
+    yo, yi = s[T].split(y, factor=16)
+    s[T].reorder(xo, yo, xi, yi)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_oneD_pool():
+    m = tvm.var('m')
+    ib = tvm.ir_builder.create()
+    #data = tvm.placeholder((16,), name = 'data')
+    data = ib.pointer("float32", name="A")
+    out = ib.pointer("float32", name="A")
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow > 0)):
+                with ib.if_scope(ib.likely(ow < 15)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow < 1)):
+                with ib.if_scope(ib.likely(kw > 0)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow > 14)):
+                with ib.if_scope(ib.likely(kw < 2)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_cce_loop_1():
+  ib = tvm.ir_builder.create()
+  dtype = 'float16'
+  n = 514
+  m = 514
+  _A = tvm.placeholder((n*m,), name = 'A')
+  Ab = tvm.decl_buffer((n*m,), dtype, name="A")
+  A = ib.buffer_ptr(Ab)
+  _B = tvm.placeholder((n*m,), name = 'B')
+  Bb = tvm.decl_buffer((n*m,), dtype, name="B")
+  B = ib.buffer_ptr(Bb)
+  #for i in 0 to n-1:
+  with ib.for_range(0, 11, name="i") as i:
+      with ib.for_range(0, 160, name="j") as j:
+          with ib.if_scope(ib.likely(((i*160) + j) < 1600)):
+               A[(i+1)*m+j+1] = B[(i)*m+j+1] + B[(i+1)*m+j+1] + B[(i+2)*m+j+1]
+  stmt = ib.get()
+  stmt = tvm.ir_pass.LoopPartition(stmt, True)
+  stmt = tvm.ir_pass.Simplify(stmt)
+  assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_cce_loop_2():
+  ib = tvm.ir_builder.create()
+  len = 112
+  tile = 32
+  loop = (len + tile - 1) // tile
+  with ib.for_range(0, loop, 'i') as i:
+    head = i * tile
+    with ib.if_scope(ib.likely(head + tile > len)):
+      tail = len
+      ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail))
+    with ib.else_scope():
+      tail = head + tile
+      ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail))
+
+  stmt = ib.get()
+  stmt = tvm.ir_pass.LoopPartition(stmt, True)
+  stmt = tvm.ir_pass.Simplify(stmt)
+  assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+
+def test_cce_loop_3():
+    ib = tvm.ir_builder.create()
+    loop1 = 4
+    loop2 = 9998
+    tile = 39991
+    with ib.for_range(0,loop2,'i') as i:
+        with ib.for_range(0,loop1,'j') as j:
+            head1 = i
+            head2 = j
+            with ib.if_scope(ib.likely(head1*loop1 + head2 < tile)):
+                ib.emit(tvm.call_extern('float16',"cce_intrisic",head1))
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.LoopPartition(stmt,True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_conv_tiling():
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
+                                                     kernel[kh, kw, ic, oc],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
 if __name__ == "__main__":
     test_basic()
     test_const_loop()
@@ -187,3 +338,10 @@ def test_everything_during_deduction():
     test_select()
     test_thread_axis2()
     test_everything_during_deduction()
+    test_single_likely()
+    test_multi_likely()
+    test_oneD_pool()
+    test_cce_loop_1()
+    test_cce_loop_2()
+    test_cce_loop_3()
+    test_conv_tiling()

From 4f7da63b4fddf016fee55e112f95727cea05f72c Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 30 Oct 2018 08:15:44 +0530
Subject: [PATCH 309/529] [YOLO]yolo op added in frontend and removed from topi
 (#1974)

---
 nnvm/python/nnvm/frontend/darknet.py     | 20 +++++---
 nnvm/python/nnvm/top/vision.py           | 15 ------
 nnvm/src/top/vision/yolo/yolo.cc         | 33 --------------
 topi/include/topi/vision/yolo/yolo.h     | 58 ------------------------
 topi/python/topi/testing/__init__.py     |  1 -
 topi/python/topi/testing/yolo_python.py  | 43 ------------------
 topi/python/topi/vision/yolo/__init__.py |  1 -
 topi/python/topi/vision/yolo/yolo.py     | 30 ------------
 topi/src/topi.cc                         |  6 ---
 topi/tests/python_cpp/test_topi_yolo.py  | 49 --------------------
 10 files changed, 14 insertions(+), 242 deletions(-)
 delete mode 100644 nnvm/src/top/vision/yolo/yolo.cc
 delete mode 100644 topi/include/topi/vision/yolo/yolo.h
 delete mode 100644 topi/python/topi/testing/yolo_python.py
 delete mode 100644 topi/python/topi/vision/yolo/yolo.py
 delete mode 100644 topi/tests/python_cpp/test_topi_yolo.py

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 4da2e90bca42..18d07d07ac6b 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -317,12 +317,19 @@ def _darknet_region(inputs, attrs):
 
 def _darknet_yolo(inputs, attrs):
     """Process the yolo operation."""
-    op_name, new_attrs = 'yolov3_yolo', {}
-    if 'n' in attrs:
-        new_attrs['n'] = attrs.get('n', 1)
-    if 'classes' in attrs:
-        new_attrs['classes'] = attrs.get('classes', 1)
-    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+    num = attrs.get('n', 1)
+    classes = attrs.get('classes', 1)
+    input_shape = attrs.get('shape')
+    split_size = classes + 5
+    intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3])
+    data_block = _sym.reshape(inputs[0], shape=intermediate_shape)
+    split_indices = (2, 4)
+    split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2)
+    split_res0 = _sym.sigmoid(split_res[0])
+    split_res2 = _sym.sigmoid(split_res[2])
+    concat_list = [split_res0, split_res[1], split_res2]
+    out = _sym.concatenate(*concat_list, axis=2)
+    return _sym.reshape(out, shape=input_shape), None
 
 def _darknet_activations(inputs, attrs):
     """Process the activation function."""
@@ -635,6 +642,7 @@ def _get_darknet_attrs(self, layer, layer_num):
         elif LAYERTYPE.YOLO == layer.type:
             attr.update({'n' : layer.n})
             attr.update({'classes' : layer.classes})
+            attr.update({'shape' : (1, layer.c, layer.h, layer.w)})
 
         elif LAYERTYPE.UPSAMPLE == layer.type:
             attr.update({'scale' : layer.stride})
diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py
index e59b2bdfe6d9..f2e12c0f367a 100644
--- a/nnvm/python/nnvm/top/vision.py
+++ b/nnvm/python/nnvm/top/vision.py
@@ -38,21 +38,6 @@ def schedule_region(attrs, outs, target):
 
 reg.register_pattern("yolo_region", OpPattern.OPAQUE)
 
-@reg.register_compute("yolov3_yolo")
-def compute_yolo(attrs, inputs, _):
-    """Compute definition of yolo"""
-    n = attrs.get_int("n")
-    classes = attrs.get_int("classes")
-    return topi.vision.yolo.yolo(inputs[0], n, classes)
-
-@reg.register_schedule("yolov3_yolo")
-def schedule_yolo(attrs, outs, target):
-    """Schedule definition of yolo"""
-    with tvm.target.create(target):
-        return topi.generic.schedule_injective(outs)
-
-reg.register_pattern("yolov3_yolo", OpPattern.OPAQUE)
-
 # multibox_prior
 @reg.register_schedule("multibox_prior")
 def schedule_multibox_prior(_, outs, target):
diff --git a/nnvm/src/top/vision/yolo/yolo.cc b/nnvm/src/top/vision/yolo/yolo.cc
deleted file mode 100644
index 4800f4371f9d..000000000000
--- a/nnvm/src/top/vision/yolo/yolo.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file yolo.cc
- * \brief Property def of yolo operators.
- */
-#include <nnvm/op.h>
-#include <nnvm/node.h>
-#include <nnvm/op_attr_types.h>
-#include <nnvm/top/nn.h>
-#include "../../elemwise_op_common.h"
-
-namespace nnvm {
-namespace top {
-
-NNVM_REGISTER_OP(yolov3_yolo)
-.describe(R"code(Yolo layer
-)code" NNVM_ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_support_level(5)
-.add_argument("data", "Tensor", "Input data")
-.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
-.set_attr<FInplaceOption>(
-    "FInplaceOption",
-    [](const NodeAttrs &attrs) {
-      return std::vector<std::pair<int, int>>{{0, 0}, {1, 0}};
-    })
-.set_attr<FGradient>("FGradient", [](const NodePtr &n,
-                                     const std::vector<NodeEntry> &ograds) {
-  return std::vector<NodeEntry>{ograds[0], ograds[0]};
-});
-}  // namespace top
-}  // namespace nnvm
diff --git a/topi/include/topi/vision/yolo/yolo.h b/topi/include/topi/vision/yolo/yolo.h
deleted file mode 100644
index d2e24c01b253..000000000000
--- a/topi/include/topi/vision/yolo/yolo.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \brief YOLO op constructions
- * \file vision/yolo/yolo.h
- */
-#ifndef TOPI_VISION_YOLO_YOLO_H_
-#define TOPI_VISION_YOLO_YOLO_H_
-
-#include <algorithm>
-#include <string>
-
-#include "topi/detail/constant_utils.h"
-#include "topi/tags.h"
-#include "topi/transform.h"
-#include "tvm/tvm.h"
-
-
-namespace topi {
-namespace vision {
-namespace yolo {
-using namespace tvm;
-using namespace nn;
-
-/*!
-* \brief yolo operation
-*
-* \param data The input tensor.
-* \param num Darknet layer parameter n
-* \param classes number of classes in the yolo model
-* \param name The name of the operation
-* \param tag The tag to mark the operation
-*
-* \return A Tensor whose op member is the yolo operation
-*/
-inline Tensor yolo(const Tensor &data,
-                   int num,
-                   int classes,
-                   std::string name = "tensor",
-                   std::string tag = "yolo_output") {
-  auto input_shape = data->shape;
-  int split_size = classes + 5;
-  Array <Expr> intermediate_shape = {input_shape[0],
-                                     num,
-                                     split_size,
-                                     input_shape[2],
-                                     input_shape[3]};
-  auto data_block = reshape(data, intermediate_shape);
-  Array <Expr> split_indices = {2, 4};
-  Array <Tensor> split_res = split(data_block, split_indices, 2);
-  split_res.Set(0, sigmoid(split_res[0]));
-  split_res.Set(2, sigmoid(split_res[2]));
-  Tensor out = concatenate(split_res, 2);
-  return reshape(out, input_shape);
-}
-}  // namespace yolo
-}  // namespace vision
-}  // namespace topi
-#endif  // TOPI_VISION_YOLO_YOLO_H_
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index c9d995a38686..c91eea7958ea 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -15,7 +15,6 @@
 from .bilinear_resize_python import bilinear_resize_python
 from .reorg_python import reorg_python
 from .region_python import region_python
-from .yolo_python import yolo_python
 from .shortcut_python import shortcut_python
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
diff --git a/topi/python/topi/testing/yolo_python.py b/topi/python/topi/testing/yolo_python.py
deleted file mode 100644
index a6b3a41203c6..000000000000
--- a/topi/python/topi/testing/yolo_python.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
-"""Yolo operator in python"""
-import numpy as np
-
-def entry_index(batch, w, h, outputs, classes, coords, location, entry):
-    n = int(location/(w*h))
-    loc = location%(w*h)
-    return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc
-
-def yolo_python(a_np, N, classes):
-    """Yolo operator
-    Parameters
-    ----------
-    a_np : numpy.ndarray
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    N : int
-        Darknet layer parameter n
-
-    classes : int
-        Darknet layer parameter classes
-
-    Returns
-    -------
-    b_np : np.ndarray
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-
-    batch, in_channel, in_height, in_width = a_np.shape
-    a_np_temp = np.reshape(a_np, batch*in_channel*in_height*in_width)
-    outputs = batch*in_channel*in_height*in_width
-    b_np = np.zeros(batch*in_channel*in_height*in_width)
-    for i in range(batch*in_channel*in_height*in_width):
-        b_np[i] = a_np_temp[i]
-    for b in range(batch):
-        for n in range(N):
-            index = entry_index(b, in_width, in_height, outputs, classes, 4, n*in_width*in_height, 0)
-            b_np[index: index+2*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+2*in_width*in_height]))
-            index = entry_index(b, in_width, in_height, outputs, classes, 4, n*in_width*in_height, 4)
-            b_np[index: index+(1+classes)*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+(1+classes)*in_width*in_height]))
-
-    b_np = np.reshape(b_np, (batch, in_channel, in_height, in_width))
-    return b_np
diff --git a/topi/python/topi/vision/yolo/__init__.py b/topi/python/topi/vision/yolo/__init__.py
index 2c0a165f8aac..c0e9899a41aa 100644
--- a/topi/python/topi/vision/yolo/__init__.py
+++ b/topi/python/topi/vision/yolo/__init__.py
@@ -3,4 +3,3 @@
 from __future__ import absolute_import as _abs
 
 from .region import *
-from .yolo import *
diff --git a/topi/python/topi/vision/yolo/yolo.py b/topi/python/topi/vision/yolo/yolo.py
deleted file mode 100644
index 6ae630a86d8f..000000000000
--- a/topi/python/topi/vision/yolo/yolo.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# pylint: disable=invalid-name, unused-variable
-"""
-YOLO Operator
-=============
-YOLO operator, used in darknet.
-"""
-from __future__ import absolute_import as _abs
-import tvm
-from ... import cpp
-
-@tvm.target.generic_func
-def yolo(data, num, classes):
-    """YOLO forward operators.
-    Parameters
-    ----------
-    data : tvm.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]
-
-    num : int
-        Darknet layer parameter n
-
-    classes : int
-        Darknet layer parameter classes
-
-    Returns
-    -------
-    out : tvm.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]
-    """
-    return cpp.yolo.yolo(data, num, classes)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index ae1ad57551cb..2d9f2fd6c6b2 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -32,7 +32,6 @@
 #include <topi/vision/reorg.h>
 #include <topi/image/resize.h>
 #include <topi/vision/yolo/region.h>
-#include <topi/vision/yolo/yolo.h>
 #include <topi/generic/default.h>
 #include <topi/generic/extern.h>
 #include <topi/generic/injective.h>
@@ -413,11 +412,6 @@ TVM_REGISTER_GLOBAL("topi.vision.yolo.region")
   *rv = vision::yolo::region(args[0], args[1], args[2], args[3], args[4], args[5]);
   });
 
-TVM_REGISTER_GLOBAL("topi.vision.yolo.yolo")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = vision::yolo::yolo(args[0], args[1], args[2]);
-  });
-
 /* Ops from image/resize.h */
 TVM_REGISTER_GLOBAL("topi.image.resize")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
diff --git a/topi/tests/python_cpp/test_topi_yolo.py b/topi/tests/python_cpp/test_topi_yolo.py
deleted file mode 100644
index 293de4fca087..000000000000
--- a/topi/tests/python_cpp/test_topi_yolo.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""Test code for yolo op"""
-import logging
-import numpy as np
-import tvm
-import topi
-import topi.testing
-from topi.util import get_const_tuple
-
-def verify_yolo(ishape, n, classes):
-    '''Verify yolo operator by comparing outputs from tvm and numpy implementation'''
-    
-    A = tvm.placeholder(ishape, name='A')
-    B = topi.cpp.yolo.yolo(A, n, classes)
-    dtype = A.dtype
-
-    def get_ref_data_yolo():
-        '''Randomly initialize the data variables and get refernce output for the yolo operation'''
-        a_np = np.random.uniform(size=ishape).astype(dtype)
-        b_np = topi.testing.yolo_python(a_np, n, classes)
-        return a_np, b_np
-
-    a_np, b_np = get_ref_data_yolo()
-    def check_device(device):
-        '''Check the device is available and if so, build and run the program'''
-        if not tvm.module.enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        target = topi.cpp.TEST_create_target(device)
-        if device == "llvm":
-            s = topi.cpp.generic.default_schedule(target, [B], False)
-        else:
-            s = topi.cpp.cuda.schedule_injective(target, [B])
-        ctx = tvm.context(device, 0)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-        func = tvm.build(s, [A, B], device, name="yolo")
-        func(a, b)
-        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
-        check_device(device)
-
-def test_yolo():
-    verify_yolo((1, 425, 19, 19), 5, 80)
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
-    test_yolo()

From 7941ea6e8ebfdffada8c69c76613c689ccda2804 Mon Sep 17 00:00:00 2001
From: kun-zh <32951065+kun-zh@users.noreply.github.com>
Date: Wed, 31 Oct 2018 00:38:31 +0800
Subject: [PATCH 310/529] Fix a bug in inject-virtual-thread (#2039)

---
 src/pass/inject_virtual_thread.cc             |  2 +-
 .../unittest/test_pass_inject_vthread.py      | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc
index f1aed09d47da..3fc2e24fb4f1 100644
--- a/src/pass/inject_virtual_thread.cc
+++ b/src/pass/inject_virtual_thread.cc
@@ -321,7 +321,7 @@ class VTInjector : public IRMutator {
     CHECK_EQ(max_loop_depth_, 0);
     Stmt then_case = this->Mutate(op->then_case);
     Stmt else_case;
-    if (else_case.defined()) {
+    if (op->else_case.defined()) {
       int temp = max_loop_depth_;
       max_loop_depth_ = 0;
       else_case = this->Mutate(op->else_case);
diff --git a/tests/python/unittest/test_pass_inject_vthread.py b/tests/python/unittest/test_pass_inject_vthread.py
index 502a55574df0..16f4c4652a3d 100644
--- a/tests/python/unittest/test_pass_inject_vthread.py
+++ b/tests/python/unittest/test_pass_inject_vthread.py
@@ -60,7 +60,26 @@ def get_vthread(name):
     assert stmt.body.body.body.body.body.body.extents[0].value == 2
     assert len(stmt.body.body.body.body.body.body.extents) == 3
 
+def test_vthread_if_then_else():
+    nthread = 2
+    tx = tvm.thread_axis("vthread")
+    ib = tvm.ir_builder.create()
+    A = ib.pointer("float32", name="A")
+    with ib.for_range(0, 100) as i:
+        ib.scope_attr(tx, "virtual_thread", nthread)
+        B = ib.allocate("float32", 128, name="B", scope="shared")
+        with ib.if_scope(i == 0):
+            B[i] = A[i * nthread + tx]
+        with ib.else_scope():
+            B[i] = A[i * nthread + tx] + 1
+        with ib.if_scope(i == 0):
+            B[i] = A[i * nthread + tx] + 2
+    stmt = ib.get()
+    stmt = tvm.ir_pass.InjectVirtualThread(stmt)
+    assert stmt.body.body.body.first.else_case != None
+    assert stmt.body.body.body.rest.else_case == None
 
 if __name__ == "__main__":
     test_vthread_extern()
     test_vthread()
+    test_vthread_if_then_else()

From 1570a1ad06c59face86486287669e249f0783e48 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 30 Oct 2018 09:38:52 -0700
Subject: [PATCH 311/529] [TOPI][AUTOTVM] Improve style (#2034)

* [TOPI] Improve the style of using autotvm

* fix
---
 topi/python/topi/arm_cpu/conv2d.py | 199 ++++++++++++++---------------
 topi/python/topi/mali/conv2d.py    |  89 +++++++------
 topi/python/topi/nn/conv2d.py      |  11 --
 topi/python/topi/x86/conv2d.py     |   8 +-
 4 files changed, 150 insertions(+), 157 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index a193e9acf5cb..c34bf256788b 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -12,34 +12,40 @@
 from ..nn import pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
 from ..nn.util import get_const_int, get_pad_tuple
 
-def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
-    """convert argument to workload"""
-    if len(kernel.shape) == 4:
-        raw_kernel = kernel
-    else:  # the input kernel is transformed by alter_op_layout
-        shape = get_const_tuple(kernel.shape)
-        raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
-                                     dtype=kernel.dtype)
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
-
-@conv2d.register('arm_cpu')
-@autotvm.task.dispatcher
-def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
-    """TOPI compute callback. Mark this function as a dispatcher, so
-    this template can assign config according to workload
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct'])
+def conv2d_arm_cpu(cfg, data, kernel, strides, padding, layout, out_dtype):
+    """TOPI compute callback for conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    layout : str
+        layout of data
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
 
     Returns
     -------
-    workload: Tuple
-        Dispatcher will use this workload to query corresponding config.
-        Then use cfg.template_key to call a registered template.
+    output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
-@conv2d_arm_cpu.register(['direct'])
-def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
-    """spatial packing template"""
     return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)
 
 @autotvm.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
@@ -93,8 +99,6 @@ def _callback(op):
 def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
     assert layout == "NCHW", "Only support NCHW"
     # create workload according to raw arguments
-    wkl = _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
     out_dtype = out_dtype or data.dtype
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
@@ -177,8 +181,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
 
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatial_conv2d_output',
-                         attrs={'workload': wkl})
+                         name='output_unpack', tag='spatial_conv2d_output')
     return output
 
 def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
@@ -238,16 +241,13 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     return s
 
 
-@conv2d_arm_cpu.register('winograd')
-def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd'])
+def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+    """ TOPI compute callback. Use winograd template """
     tile_size = 4
     return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
 def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
-    # create workload according to raw arguments
-    wkl = _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout,
-                                         out_dtype, tile_size)
-
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_computed = False
@@ -368,8 +368,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
     # unpack output
     output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                          Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
-                         name='output', tag='winograd_conv2d_output',
-                         attrs={'workload': wkl})
+                         name='output', tag='winograd_conv2d_output')
 
     # we have to manually assign effective GFLOP for winograd
     cfg.add_flop(2 * N * K * H * W * KH * KW * C)
@@ -458,36 +457,11 @@ def _schedule_winograd(cfg, s, output, last):
         s[output].compute_inline()
 
 
-def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype, tile_size):
-    """convert argument to workload"""
-    K = 3
-    shape = get_const_tuple(kernel.shape)
-    alpha = tile_size + K - 1
-    if len(kernel.shape) == 4:
-        assert shape[2:] == (K, K)
-        CO, CI = shape[:2]
-    else:
-        assert shape[:2] == (alpha, alpha)
-        CO, CI, VCO = shape[2:]
-        CO *= VCO
-
-    raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
-
-
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
-@conv2d_winograd_without_weight_transform.register(['arm_cpu'])
-@autotvm.task.dispatcher
-def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
-                                          tile_size)
-
-
-@winograd_ww_config_dispatcher_.register(['winograd'])
-def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
-                          tile_size)
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    """TOPI compute callback"""
+    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
 
 @autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
@@ -514,8 +488,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
 
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
-    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
-                                                      "when alter_op_layout is enabled"
+    dilation = attrs.get_int_tuple("dilation")
     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     groups = attrs.get_int('groups')
@@ -523,38 +496,60 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     out_dtype = attrs["out_dtype"]
     out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
 
-    if groups == 1:
-        # query config of this workload
-        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
-                                         layout, out_dtype)
-        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
-
-        if cfg.is_fallback:  # if is fallback, clear query cache and return None
-            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
-            return None
-
-        if cfg.template_key == 'direct':  # packing weight tensor
-            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
-            return sym.conv2d(*copy_inputs, **new_attrs)
-        else:  # pre-compute weight transformation in winograd
-            if "-device=arm_cpu" in tvm.target.current_target().options:
-                tile_size = 4
-                VC = cfg['tile_k'].size[-1]
-            else:
-                from ..mali.conv2d import _pick_tile_size
-                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
-                VC = cfg['tile_bna'].val
-
-            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
-                                                                  tile_size=tile_size)
-            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
-            weight = sym.reshape(weight,
-                                 shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
-            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
-
-            copy_inputs[1] = weight
-            new_attrs['tile_size'] = tile_size
-            return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
-
-    # do nothing for depthwise convolution
-    return None
+    if layout != 'NCHW' or groups != 1 or dilation != (1, 1):
+        return None
+
+    data, kernel = tinfos[0:2]
+    N, CI, H, W = get_const_tuple(data.shape)
+    CO, _, KH, KW = get_const_tuple(kernel.shape)
+
+    # query config of this workload
+    workload = autotvm.task.args_to_workload(
+        [data, kernel, strides, padding, layout, out_dtype], conv2d)
+    target = tvm.target.current_target()
+    dispatch_ctx = autotvm.DispatchContext.current
+    cfg = dispatch_ctx.query(target, workload)
+
+    if cfg.is_fallback:  # if is fallback, clear query cache and return None
+        autotvm.task.clear_fallback_cache(target, workload)
+        return None
+
+    if cfg.template_key == 'direct':  # pack weight tensor
+        VC = cfg['tile_co'].size[-1]
+        new_attrs['kernel_layout'] = 'OIHW%do' % VC
+
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, 'NCHW', out_dtype], conv2d)
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return sym.conv2d(*copy_inputs, **new_attrs)
+    else:  # pre-compute weight transformation in winograd
+        if "-device=arm_cpu" in target.options:
+            tile_size = 4
+            VC = cfg['tile_k'].size[-1]
+        else:
+            from ..mali.conv2d import _pick_tile_size
+            tile_size = _pick_tile_size(tinfos[0], tinfos[1])
+            VC = cfg['tile_bna'].val
+
+        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1], tile_size=tile_size)
+        weight = sym.reshape(weight,
+                             shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
+        weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
+
+        copy_inputs[1] = weight
+        new_attrs['tile_size'] = tile_size
+
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC),
+                                     kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_weight, strides, padding, new_attrs['layout'], out_dtype, tile_size],
+            conv2d_winograd_without_weight_transform)
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 121498f217c4..390b60ba6a97 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -12,27 +12,43 @@
     get_pad_tuple, pad, conv2d_alter_layout
 
 # reuse some compute declarations from ARM CPU
-from ..arm_cpu.conv2d import _conv_arg_to_workload, _decl_spatial_pack,\
-    _winograd_conv_arg_to_workload, _alter_conv2d_layout_arm
+from ..arm_cpu.conv2d import _decl_spatial_pack, _alter_conv2d_layout_arm
 
 
-@conv2d.register('mali')
-@autotvm.task.dispatcher
-def conv2d_mali(data, kernel, strides, padding, layout, out_dtype):
-    """TOPI compute callback. Mark this function as a dispatcher, so
-    this template can assign config according to workload
+@autotvm.register_topi_compute(conv2d, 'mali', ['direct'])
+def conv2d_mali(cfg, data, kernel, strides, padding, layout, out_dtype):
+    """TOPI compute callback for conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    layout : str
+        layout of data
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
 
     Returns
     -------
-    workload: Tuple
-        Dispatcher will use this workload to query corresponding config.
-        Then use cfg.template_key to call a registered template.
+    output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
-@conv2d_mali.register(['direct'])
-def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
-    """spatial packing template"""
     return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=3)
 
 @autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd'])
@@ -158,8 +174,8 @@ def _pick_tile_size(data, kernel):
     else:
         return 2
 
-@conv2d_mali.register('winograd')
-def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+@autotvm.register_topi_compute(conv2d, 'mali', ['winograd'])
+def conv2d_mali_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
     tile_size = _pick_tile_size(data, kernel)
     return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
@@ -305,9 +321,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
                          # thw following term is used to make the padding effective,
                          # otherwise the padding will be eliminated by bound inference
                          + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][CO-1][P_round-1],
-                         name='output', tag='winograd_conv2d_output',
-                         attrs={'workload': _winograd_conv_arg_to_workload(
-                             data, kernel, strides, padding, layout, out_dtype, tile_size)})
+                         name='output', tag='winograd_conv2d_output')
 
     # we have to manually assign effective GFLOP for winograd
     cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
@@ -410,29 +424,15 @@ def _schedule_winograd(cfg, s, op):
 
     s[Y].compute_at(s[output], tt)
 
-@conv2d_alter_layout.register(["mali"])
-def _alter_conv2d_layout(attrs, inputs, tinfos):
-    try:
-        return _alter_conv2d_layout_arm(attrs, inputs, tinfos)
-    except KeyError:  # to filter out fallback opencl templates
-        return None
-
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
-@conv2d_winograd_without_weight_transform.register(['mali'])
-@autotvm.task.dispatcher
-def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
-                                          tile_size)
-
-
-@winograd_ww_config_dispatcher_.register(['winograd'])
-def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
-                          tile_size)
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    """TOPI compute callback"""
+    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
 
-@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
-                                     'mali', ['winograd'])
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                'mali', ['winograd'])
 def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
     """TOPI schedule callback"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -445,6 +445,15 @@ def _callback(op):
     return s
 
 
+##### REGISTER ALTER OP LAYOUT #####
+@conv2d_alter_layout.register(["mali"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):
+    try:
+        return _alter_conv2d_layout_arm(attrs, inputs, tinfos)
+    except KeyError:  # to filter out fallback opencl templates
+        return None
+
+
 ##### SCHECULE UTILITIES #####
 def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
     """ tile and bind to GPU threads """
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 7636350dfbf6..17b1ceb7ab13 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -85,17 +85,6 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
     return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
 
 
-@tvm.target.generic_func
-def _get_schedule(wkl):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
 def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """Convolution operator in NCHW layout.
 
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index c588e74432a4..3dc6d5e4bab8 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -3,7 +3,7 @@
 import tvm
 from tvm import autotvm
 from tvm.autotvm.task.nnvm_integration import deserialize_args
-from tvm.autotvm.task import register, get_config
+from tvm.autotvm.task import get_config
 from .. import generic, tag
 from .. import nn
 from ..util import get_const_tuple
@@ -145,7 +145,7 @@ def _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtyp
     return unpack
 
 
-@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct'])
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct'])
 def schedule_conv2d(cfg, outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -248,7 +248,7 @@ def traverse(op):
 # We define schedule template in this function instead of
 # declaration function since actual input arguments need
 # to be altered by the schedule selected.
-@register("topi_x86_conv2d_NCHWc")
+@autotvm.task.register("topi_x86_conv2d_NCHWc")
 def _topi_nn_conv2d_NCHWc(*args, **kwargs):
     assert not kwargs, "Do not support kwargs in template function call"
     data, kernel, strides, padding, origin_layout, dtype = deserialize_args(args)
@@ -311,7 +311,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
     new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
 
-    # Store altered operator's config
+    # Store the same config for the altered operator (workload)
     new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
                                dtype=data.dtype)
     new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),

From 6c4aa8190b24a5afdac611e4f07f7a9049292fe1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 30 Oct 2018 09:53:30 -0700
Subject: [PATCH 312/529] [RELAY][OP]  Maketuple to be resolved when containing
 incompleteType (#2031)

---
 src/relay/op/tensor/transform.cc     |  2 +-
 src/relay/pass/type_infer.cc         | 38 +++++++++++++++++++++++-----
 tests/python/relay/test_op_level1.py |  1 +
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 635f04668f33..20e0e3adbfd3 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -140,7 +140,7 @@ bool ConcatenateRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 2);
   const auto* tensor_tuple = types[0].as<TupleTypeNode>();
   if (tensor_tuple == nullptr) {
-    CHECK(types[0].as<TupleTypeNode>())
+    CHECK(types[0].as<IncompleteTypeNode>())
         << "cast: expect input type to be TupleType but get "
         << types[0];
     return false;
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index c0f1db97b538..e3e8ad7ecdf7 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -56,11 +56,31 @@ bool TupleGetItemRel(const Array<Type>& types,
   return true;
 }
 
+bool MakeTupleRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(static_cast<size_t>(num_inputs + 1), types.size());
+  for (int i = 0; i < num_inputs; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) return false;
+  }
+  Array<Type> fields;
+  for (int i = 0; i < num_inputs; ++i) {
+    fields.push_back(types[i]);
+  }
+  reporter->Assign(types[num_inputs], TupleTypeNode::make(fields));
+  return true;
+}
+
 TVM_REGISTER_NODE_TYPE(TupleGetItemAttrs);
 TVM_REGISTER_API("tvm.relay.type_relation.TupleGetItem")
 .set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
     TupleGetItemRel);
 
+TVM_REGISTER_API("tvm.relay.type_relation.MakeTuple")
+.set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
+    MakeTupleRel);
+
 struct ResolvedTypeInfo {
   explicit ResolvedTypeInfo(Type checked_type, Array<Type> type_args)
       : checked_type(checked_type), type_args(type_args) {}
@@ -104,6 +124,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   TypeSolver solver_;
   // relation function
   TypeRelationFn tuple_getitem_rel_;
+  TypeRelationFn make_tuple_rel_;
   // Unify two types
   Type Unify(const Type& t1, const Type& t2, const Span& span) {
     // TODO(tqchen, jroesch): propagate span to solver
@@ -154,14 +175,19 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   }
 
   Type VisitExpr_(const TupleNode* op) final {
-    // TODO(tqchen, jroesch)
-    // tuple should be a constraint in the type solver
-    // to handle cases where the field type is not known.
-    Array<Type> fields;
+    if (!make_tuple_rel_.defined())  {
+      make_tuple_rel_ = TypeRelationFn(
+          EnvFunc::Get("tvm.relay.type_relation.MakeTuple").node_);
+    }
+    Array<Type> types;
     for (Expr field : op->fields) {
-      fields.push_back(GetType(field));
+      types.push_back(GetType(field));
     }
-    return TupleTypeNode::make(fields);
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    types.push_back(rtype);
+    solver_.AddConstraint(TypeRelationNode::make(
+        make_tuple_rel_, types, op->fields.size(), Attrs()));
+    return rtype;
   }
 
   Type VisitExpr_(const TupleGetItemNode* op) final {
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index fd01dbdde012..a622dfc2cbd4 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -87,6 +87,7 @@ def test_concatenate_infer_type():
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.TensorType((n, t, 200))
 
+    x = relay.exp(x)
     z = relay.concatenate((x, y), axis=2)
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.TensorType((n, t, 200))

From be4b7c1c6796b59bcc7e6bbb8dcecda087c60913 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Tue, 30 Oct 2018 15:29:36 -0700
Subject: [PATCH 313/529] [RELAY][RUNTIME] Add Relay interpreter and compiler
 for TVM runtime system. (#1954)

---
 include/tvm/relay/base.h                  |   9 +-
 include/tvm/relay/build_module.h          |  76 +++
 include/tvm/relay/expr.h                  |  14 +-
 include/tvm/relay/interpreter.h           | 140 ++++++
 include/tvm/relay/pass.h                  |  72 +--
 python/tvm/relay/__init__.py              |  18 +
 python/tvm/relay/_interpreter.py          |   4 +
 python/tvm/relay/env.py                   |   5 +-
 python/tvm/relay/graph_runtime_codegen.py | 551 ++++++++++++++++++++++
 python/tvm/relay/interpreter.py           | 130 +++++
 python/tvm/relay/ir_pass.py               |   6 +
 python/tvm/relay/op/_tensor.py            |  49 +-
 python/tvm/relay/op/nn/_nn.py             |  16 +
 python/tvm/relay/op/op.py                 |  11 +-
 python/tvm/relay/testing/mlp.py           |   1 +
 src/relay/interpreter.cc                  | 432 +++++++++++++++++
 src/relay/ir/base.cc                      |   2 +
 src/relay/ir/environment.cc               |  15 +-
 src/relay/ir/expr.cc                      |  45 +-
 src/relay/ir/expr_functor.cc              |   3 +-
 src/relay/ir/hash.cc                      |   4 +-
 src/relay/pass/fuse_ops.cc                |  86 ++++
 src/relay/pass/lower_ops.cc               | 222 +++++++++
 src/relay/pass/type_infer.cc              |  13 +-
 src/relay/pass/util.cc                    |   2 +-
 tests/python/relay/test_graph_runtime.py  |  80 ++++
 tests/python/relay/test_interpreter.py    | 142 ++++++
 tests/python/relay/test_type_infer.py     |  77 ++-
 tests/scripts/task_python_integration.sh  |   2 +-
 29 files changed, 2168 insertions(+), 59 deletions(-)
 create mode 100644 include/tvm/relay/build_module.h
 create mode 100644 include/tvm/relay/interpreter.h
 create mode 100644 python/tvm/relay/_interpreter.py
 create mode 100644 python/tvm/relay/graph_runtime_codegen.py
 create mode 100644 python/tvm/relay/interpreter.py
 create mode 100644 python/tvm/relay/op/nn/_nn.py
 create mode 100644 src/relay/interpreter.cc
 create mode 100644 src/relay/pass/fuse_ops.cc
 create mode 100644 src/relay/pass/lower_ops.cc
 create mode 100644 tests/python/relay/test_graph_runtime.py
 create mode 100644 tests/python/relay/test_interpreter.py

diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index 4ae35f585c6f..b7621e20cf6a 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -22,8 +22,15 @@ namespace tvm {
  * You can find more about Relay by reading the language reference.
  */
 namespace relay {
+
+#define RELAY_DEBUG(...) \
+{ auto fdebug = runtime::Registry::Get("relay.debug"); \
+  CHECK(fdebug) << "Could not find Relay Python debugger function."; \
+  (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \
+}
+
 /*!
- * \brief we always used NodeRef for referencing nodes.
+ * \brief We always used NodeRef for referencing nodes.
  *
  *  By default, NodeRef is a std::shared_ptr of node
  */
diff --git a/include/tvm/relay/build_module.h b/include/tvm/relay/build_module.h
new file mode 100644
index 000000000000..ed889eba0bd0
--- /dev/null
+++ b/include/tvm/relay/build_module.h
@@ -0,0 +1,76 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/build_module.h
+ * \brief The passes and data structures needed to build a
+ * tvm::Module from a Relay program.
+ */
+#ifndef TVM_RELAY_BUILD_MODULE_H_
+#define TVM_RELAY_BUILD_MODULE_H_
+
+#include <tvm/lowered_func.h>
+#include <tvm/relay/environment.h>
+#include <tvm/relay/expr.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief A lowered Relay operation.
+ *
+ * A lowered operation is a pair containing the "primitive" function used
+ * to produce the lowered function as well as the lowered function itself.
+ */
+class LoweredOp;
+/*! \brief Call container. */
+class LoweredOpNode : public Node {
+ public:
+  /*!
+   * \brief The primitive function to be lowered.
+   *
+   * A primitive function consists only of calls to relay::Op which
+   * can be fused.
+   */
+  Function func;
+
+  /*!
+   * \brief The lowered function.
+   */
+  LoweredFunc lowered_func;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("func", &func);
+    v->Visit("lowered_func", &lowered_func);
+  }
+
+  TVM_DLL static LoweredOp make(
+      Function func,
+      LoweredFunc lowered_func);
+
+  static constexpr const char* _type_key = "relay.LoweredOp";
+  TVM_DECLARE_NODE_TYPE_INFO(LoweredOpNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(LoweredOp, LoweredOpNode, NodeRef);
+
+/*!
+ * \brief Lower the operations contained in a Relay expression.
+ *
+ * The lowering pass will only lower functions marked as primitive,
+ * the FuseOps pass will provide this behavior, if run before LowerOps.
+ *
+ * \note This will do a reachability analysis and lower all definitions
+ * reachable from the provided expression.
+ *
+ * \param env  The environment.
+ * \param expr The expression with operations to be lowered.
+ * \param target The target to lower the functions to.
+ *
+ * \return The set of lowered operations.
+ */
+Array<LoweredOp> LowerOps(const Environment& env, const Expr& expr,
+                          const std::string& target = "llvm");
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BUILD_MODULE_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 2e3bbadb7841..029470c067ce 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -213,12 +213,18 @@ class FunctionNode : public ExprNode {
    */
   tvm::Array<TypeVar> type_params;
 
+  /*!
+   * \brief The attributes which store metadata about functions.
+   */
+  tvm::Attrs attrs;
+
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("params", &params);
     v->Visit("body", &body);
     v->Visit("ret_type", &ret_type);
     v->Visit("type_params", &type_params);
     v->Visit("span", &span);
+    v->Visit("attrs", &attrs);
     v->Visit("_checked_type_", &checked_type_);
   }
 
@@ -233,7 +239,8 @@ class FunctionNode : public ExprNode {
   TVM_DLL static Function make(tvm::Array<Var> params,
                                Expr body,
                                Type ret_type,
-                               tvm::Array<TypeVar> ty_params);
+                               tvm::Array<TypeVar> ty_params,
+                               tvm::Attrs attrs = Attrs());
 
   static constexpr const char* _type_key = "relay.Function";
   TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode);
@@ -241,6 +248,11 @@ class FunctionNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(Function, FunctionNode, Expr);
 
+
+TVM_DLL NodeRef FunctionGetAttr(const Function& func, const std::string& key);
+TVM_DLL Function FunctionSetAttr(const Function& func, const std::string& key, const NodeRef& data);
+
+
 /*!
  * \brief Call corresponds to operator invocation.
  *  Corresponds to the operator in computational graph terminology.
diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h
new file mode 100644
index 000000000000..1c382faaef04
--- /dev/null
+++ b/include/tvm/relay/interpreter.h
@@ -0,0 +1,140 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/interpreter.h
+ * \brief An interpreter for Relay.
+ *
+ * This file implements a simple reference interpreter for Relay programs.
+ * Given a Relay environment, and a Relay expression it produces a value.
+ *
+ * The interpreter's values are a naive representation of the values that
+ * can be produced by a Relay program and are exposed via tvm::Node's
+ * system to Python for introspection and debugging.
+ *
+ * The interpreter's intent is to serve as a reference semantics for the Relay IR,
+ * as well as for debugging and testing.
+ */
+#ifndef TVM_RELAY_INTERPRETER_H_
+#define TVM_RELAY_INTERPRETER_H_
+
+#include <tvm/relay/environment.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A Relay value.
+ */
+class Value;
+
+/*! \brief Evaluate an expression using the interpreter producing a value.
+ *
+ * The resulting value can be passed to Python, making it easy to use
+ * for testing and debugging.
+ *
+ * The interpreter interprets the program fragments not supported by the
+ * TVM runtime, although the interpreter is naively implemented it uses
+ * TVM operators for evaluating all operators.
+ *
+ * Our intent is that this will never be the most efficient implementation of
+ * Relay's semantics, but a readable and clear one.
+ */
+Value Evaluate(Environment env, Expr e);
+
+/*! \brief The base container type of Relay values. */
+class ValueNode : public RelayNode {
+ public:
+  static constexpr const char* _type_key = "relay.Value";
+  TVM_DECLARE_BASE_NODE_INFO(ValueNode, RelayNode);
+};
+
+class Value : public NodeRef {
+ public:
+  Value() {}
+  explicit Value(NodePtr<Node> n) : NodeRef(n) {}
+  const ValueNode* operator->() const {
+    return static_cast<const ValueNode*>(node_.get());
+  }
+
+  using ContainerType = ValueNode;
+};
+
+/*! \brief A Relay closure, i.e a scope and a function. */
+class Closure;
+
+/*! \brief The container type of Closures. */
+class ClosureNode : public ValueNode {
+ public:
+  /*! \brief The set of free variables in the closure.
+   *
+   * These are the captured variables which are required for
+   * evaluation when we call the closure.
+   */
+  tvm::Map<Var, Value> env;
+  /*! \brief The function which implements the closure.
+   *
+   * \note May reference the variables contained in the env.
+   */
+  Function func;
+
+  ClosureNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("env", &env);
+    v->Visit("func", &func);
+  }
+
+  TVM_DLL static Closure make(tvm::Map<Var, Value> env, Function func);
+
+  static constexpr const char* _type_key = "relay.Closure";
+  TVM_DECLARE_NODE_TYPE_INFO(ClosureNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(Closure, ClosureNode, Value);
+
+/*! \brief A tuple value. */
+class TupleValue;
+
+/*! \brief Tuple (x, ... y). */
+struct TupleValueNode : ValueNode {
+  tvm::Array<Value> fields;
+
+  TupleValueNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
+
+  TVM_DLL static TupleValue make(tvm::Array<Value> value);
+
+  static constexpr const char* _type_key = "relay.TupleValue";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleValueNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleValue, TupleValueNode, Value);
+
+/*! \brief A tensor value. */
+class TensorValue;
+
+/*! \brief The tensor value container, wrapping an NDArray. */
+struct TensorValueNode : ValueNode {
+  runtime::NDArray data;
+
+  TensorValueNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("data", &data); }
+
+  /*! \brief Build a value from an NDArray. */
+  TVM_DLL static TensorValue make(runtime::NDArray data);
+
+  /*! \brief Construct an empty tensor value from t. */
+  TVM_DLL static TensorValue FromType(const Type& t);
+
+  static constexpr const char* _type_key = "relay.TensorValue";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorValueNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(TensorValue, TensorValueNode, Value);
+
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_INTERPRETER_H_
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index bf16c7ed8e33..b29678106d21 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -8,6 +8,7 @@
 
 #include <tvm/relay/environment.h>
 #include <tvm/relay/expr.h>
+#include <string>
 
 namespace tvm {
 namespace relay {
@@ -20,7 +21,8 @@ namespace relay {
  * populated with the result type.
  *
  * \param expr The expression to type check.
- * \param env The environment used for referencing global functions, can be None.
+ * \param env The environment used for referencing global functions, can be
+ * None.
  *
  * \return A type checked expression with its checked_type field populated.
  */
@@ -35,7 +37,8 @@ Expr InferType(const Expr& expr, const Environment& env);
  * \return A type checked Function with its checked_type field populated.
  * \note this function mutates env and is not thread-safe.
  */
-Function InferType(const Function& f, const Environment& env, const GlobalVar& var);
+Function InferType(const Function& f, const Environment& env,
+                   const GlobalVar& var);
 
 /*!
  * \brief Check that types are well kinded by applying "kinding rules".
@@ -94,28 +97,30 @@ bool AlphaEqual(const Type& t1, const Type& t2);
  *
  * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
  *
- * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice, although x is not shadowed.
+ * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice,
+ * although x is not shadowed.
  *
- * \param e the expression to check.
+  * \param expr the expression to check.
  *
- * \return true iff all Var in e is bound at most once.
+  * \return true iff all Var in expr is bound at most once.
  */
-bool WellFormed(const Expr& e);
+bool WellFormed(const Expr& expr);
 
-/*! \brief Get free Vars from expr in PostDFS order.
+/*! \brief Get free type parameters from expression expr.
  *
  * Free variables are variables that are not bound by a
  * let or a function parameter in the context.
  *
  * \param expr the expression.
  *
- * \return List of free vars, in the PostDFS order visited by expr.
+ * \return List of free vars, in the PostDFS order in the expression.
  */
 tvm::Array<Var> FreeVars(const Expr& expr);
 
 /*! \brief Get free TypeVars from expression expr.
  *
- * Free type parameters are type parameters that are not bound by a function type in the context.
+ * Free type parameters are type parameters that are not bound by a function
+ * type in the context.
  *
  * \param expr the expression.
  *
@@ -125,10 +130,12 @@ tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
 
 /*! \brief Remove expressions which does not effect the program result.
  *
- * It will remove let binding that are not referenced, and if branch that are not entered.
+ * It will remove let bindings which are not referenced, and branches that will
+ * not be entered.
  *
- * For example, this pass should turn `let a = 1 in 2` into `2`, as the value of the expression does not depend on a.
- * Another example is `if (true) then 1 else 2` will be optimized into 1.
+ * For example, this pass should turn `let a = 1 in 2` into `2`, as the value of
+ * the expression does not depend on a. Another example is `if (true) then 1
+ * else 2` will be optimized into 1.
  *
  * \param e the expression to optimize.
  *
@@ -136,27 +143,30 @@ tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
  */
 Expr DeadCodeElimination(const Expr& e);
 
-/*! \brief Hash a Relay type.
- *
- * Implements structural hashing of a Relay type.
- *
- *  \param type the type to hash.
- *
- *  \return the hash value.
- */
-size_t StructuralHash(const Type& type);
-
-/*! \brief Hash a Relay expression.
- *
- * Implements structural hashing of a Relay expression.
- *
- * \param expr the expression to hash.
- *
- * \return the hash value.
- */
-size_t StructuralHash(const Expr& expr);
+/*! \brief A hashing structure in the style of std::hash. */
+struct StructuralHash {
+  /*! \brief Hash a Relay type.
+   *
+   * Implements structural hashing of a Relay type.
+   *
+   *  \param type the type to hash.
+   *
+   *  \return the hash value.
+   */
+  size_t operator()(const Type& type) const;
 
+  /*! \brief Hash a Relay expression.
+   *
+   * Implements structural hashing of a Relay expression.
+   *
+   * \param expr the expression to hash.
+   *
+   * \return the hash value.
+   */
+  size_t operator()(const Expr& expr) const;
+};
 
 }  // namespace relay
 }  // namespace tvm
+
 #endif  // TVM_RELAY_PASS_H_
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 731a816460ee..d3b60c1174fa 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -1,5 +1,7 @@
 # pylint: disable=wildcard-import, redefined-builtin, invalid-name
 """The Relay IR namespace containing the IR definition and compiler."""
+from __future__ import absolute_import
+from ..api import register_func
 from . import base
 from . import ty
 from . import expr
@@ -15,6 +17,7 @@
 from . import vision
 from . import image
 
+
 from .scope_builder import ScopeBuilder
 
 # Span
@@ -46,6 +49,21 @@
 If = expr.If
 TupleGetItem = expr.TupleGetItem
 
+
 # helper functions
 var = expr.var
 const = expr.const
+
+@register_func("relay._tensor_value_repr")
+def _tensor_value_repr(tv):
+    return str(tv.data.asnumpy())
+
+@register_func("relay._constant_repr")
+def _tensor_constant_repr(tv):
+    return str(tv.data.asnumpy())
+
+# pylint: disable=unused-argument
+@register_func("relay.debug")
+def _debug(*args):
+    import pdb
+    pdb.set_trace()
diff --git a/python/tvm/relay/_interpreter.py b/python/tvm/relay/_interpreter.py
new file mode 100644
index 000000000000..d04319c17a99
--- /dev/null
+++ b/python/tvm/relay/_interpreter.py
@@ -0,0 +1,4 @@
+"""The interface to the Evaluator exposed from C++."""
+from tvm._ffi.function import _init_api
+
+_init_api("relay._interpreter", __name__)
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/env.py
index 9c3241e18ef8..37e0999dce9e 100644
--- a/python/tvm/relay/env.py
+++ b/python/tvm/relay/env.py
@@ -45,9 +45,12 @@ def __setitem__(self, var, func):
         func: Function
             The function.
         """
+        return self._add(var, func)
+
+    def _add(self, var, func, update=False):
         if isinstance(var, _base.string_types):
             var = _expr.GlobalVar(var)
-        _env.Environment_Add(self, var, func)
+        return _env.Environment_Add(self, var, func, update)
 
     def __getitem__(self, var):
         """Lookup a global function by name or by variable.
diff --git a/python/tvm/relay/graph_runtime_codegen.py b/python/tvm/relay/graph_runtime_codegen.py
new file mode 100644
index 000000000000..d0ce239fa7fd
--- /dev/null
+++ b/python/tvm/relay/graph_runtime_codegen.py
@@ -0,0 +1,551 @@
+"""
+A compiler from a Relay expression to TVM's graph runtime.
+
+The compiler is built from a few pieces.
+
+First we define a compiler from a single Relay expression to the
+graph langauge. We require the expression to be a function.
+The function's parameters correpond to the placeholder/inputs
+and model parameters found in the computation graph representation.
+The body of the function represents the computation graph.
+
+The compiler's output is a program in the graph language, which is composed of
+graph langauge is composed of Node, NodeRef, InputNode, OpNode.
+This "little language" represents programs in TVM's graph format.
+
+To connect to the graph runtime, we use a printer that converts our graph format
+into TVM's JSON format. The resulting string can be loaded by
+contrib.graph_runtime or any other TVM runtime comptatible system.
+
+We expose this functionality in compile_to_tvm.
+"""
+
+from __future__ import absolute_import
+import json
+import attr
+from . import ir_pass
+from .op import Op
+from .expr import Var, Function, Call, If, GlobalVar, Constant, Let, Tuple
+from ..build_module import build as tvm_build_module
+from .. contrib import graph_runtime
+from .ir_pass import infer_type
+from .. import cpu
+
+class AbstractExprVisitor(object):
+    """A visitor over Expr in Python."""
+
+    def __init__(self):
+        self.memo_map = {}
+
+    # pylint: disable=no-else-return
+    def visit(self, expr):
+        """Apply the visitor to an expression."""
+        found = self.memo_map.get(expr)
+        if found:
+            return found
+
+        if isinstance(expr, Function):
+            res = self.visit_function(expr)
+        elif isinstance(expr, Call):
+            res = self.visit_call(expr)
+        elif isinstance(expr, Let):
+            res = self.visit_let(expr)
+        elif isinstance(expr, Var):
+            res = self.visit_var(expr)
+        elif isinstance(expr, GlobalVar):
+            res = self.visit_global_var(expr)
+        elif isinstance(expr, If):
+            res = self.visit_if(expr)
+        elif isinstance(expr, Tuple):
+            res = self.visit_tuple(expr)
+        elif isinstance(expr, Constant):
+            res = self.visit_constant(expr)
+        else:
+            raise Exception("warning unhandled case: {0}".format(type(expr)))
+
+        self.memo_map[expr] = res
+        return res
+
+    def visit_function(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_let(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_call(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_var(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_type(self, typ):
+        return typ
+
+    def visit_if(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_tuple(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_constant(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_global_var(self, _):
+        raise Exception("Abstract method please implement me.")
+
+
+class ExprMutator(AbstractExprVisitor):
+    """A functional visitor over Expr in Python."""
+
+    def visit_function(self, fn):
+        new_body = self.visit(fn.body)
+        return Function(
+            list(fn.params),
+            fn.ret_type, new_body,
+            fn.type_params)
+
+    def visit_let(self, let):
+        new_var = self.visit(let.var)
+        new_val = self.visit(let.value)
+        new_body = self.visit(let.body)
+        return Let(new_var, new_val, new_body)
+
+    def visit_call(self, call):
+        new_fn = self.visit(call.op)
+        new_args = [self.visit(arg) for arg in call.args]
+        return Call(new_fn, new_args, call.attrs)
+
+    def visit_var(self, var):
+        return var
+
+    def visit_global_id(self, global_var):
+        return global_var
+
+    def visit_if(self, ite):
+        return If(
+            self.visit(ite.guard),
+            self.visit(ite.true_b),
+            self.visit(ite.false_b))
+
+    def visit_tuple(self, tup):
+        return Tuple([self.visit(field) for field in tup.fields])
+
+    def visit_constant(self, const):
+        return const
+
+
+@attr.s
+class NodeRef(object):
+    """A reference to a node, used for constructing the graph."""
+    ident = attr.ib()
+    index = attr.ib(default=0)
+    version = attr.ib(default=0)
+
+    def to_json(self):
+        return [self.ident, self.index, self.version]
+
+
+@attr.s
+class Node(object):
+    """The base class for nodes in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+    is_output = attr.ib()
+
+    def to_json(self):
+        raise Exception("Abstract method, please implement me.")
+
+
+@attr.s
+class InputNode(Node):
+    """An input node in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+    is_output = attr.ib(default=False)
+
+    def to_json(self):
+        return {
+            "op": "null",
+            "name": self.name,
+            "inputs": []
+        }
+
+
+@attr.s
+class OpNode(Node):
+    """An operator node in the TVM runtime system's graph input."""
+    op_name = attr.ib()
+    inputs = attr.ib()
+    op_attrs = attr.ib()
+    is_output = attr.ib(default=False)
+
+    def to_json(self):
+        attrs = dict.copy(self.op_attrs)
+        # Extend ops with extra info.
+        attrs['func_name'] = self.op_name
+        # When do we flatten?
+        attrs['flatten_data'] = "0"
+        # Fix me!
+        attrs['num_inputs'] = str(len(self.inputs))
+        attrs['num_outputs'] = "1"
+
+        return {
+            "op": "tvm_op",
+            "name": self.name,
+            "attrs": attrs,
+            "inputs": self.inputs
+        }
+
+
+def shape_to_json(shape):
+    return [sh.value for sh in shape]
+
+
+def from_tensor(typ):
+    return (typ.dtype, shape_to_json(typ.shape))
+
+
+class GraphRuntimeCodegen(ExprMutator):
+    """The compiler from Relay to the TVM runtime system."""
+    nodes = attr.ib()
+    id_map = attr.ib()
+
+    def __init__(self, env):
+        ExprMutator.__init__(self)
+        self.nodes = []
+        self.id_map = {}
+        self.env = env
+
+    def add_node(self, node):
+        """
+        Add a node to the graph.
+
+        Parameters
+        ----------
+        node: Node
+            The node to add to the graph.
+
+        Returns
+        -------
+        node_ref: NodeRef
+            A reference to the node.
+
+        """
+        self.nodes.append(node)
+        ident = len(self.nodes) - 1
+        return NodeRef(ident)
+
+    def add_binding(self, ident, ref):
+        """
+        Add a identifier to node mapping.
+
+        Parameters
+        ----------
+        ident: relay.Var
+            The variable to map
+
+        ref: NodeRef
+            The node the identifier points.
+        """
+        self.id_map[ident] = ref
+
+    def let_bind(self, ident, node):
+        """
+        Let bind node to ident.
+
+        Parameters
+        ----------
+        ident: relay.Var
+            The variable to map.
+
+        ref: NodeRef
+            The node the identifier points.
+
+        Returns
+        -------
+        ref: NodeRef
+            Return reference to the node.
+        """
+        ref = self.add_node(node)
+        self.add_binding(ident, ref)
+        return ref
+
+    def get_node(self, ref):
+        """
+        Lookup a node by a node reference.
+
+        Parameters
+        ----------
+        ref: NodeRef
+            The reference to lookup.
+
+        Returns
+        -------
+        node: Node
+            The node.
+        """
+        return self.nodes[ref.ident]
+
+    def lookup(self, ident):
+        """
+        Lookup a node by identifier.
+
+        Parameters
+        ----------
+        ident: relay.Var
+            The reference to lookup.
+
+        Returns
+        -------
+        node: Node
+            The node.
+        """
+        return self.id_map[ident]
+
+    def codegen(self, func):
+        """Compile a single function into a graph.
+
+        Parameters
+        ----------
+        func: tvm.relay.Expr
+            The function to compile.
+        """
+        # First we convert all the parameters into input nodes.
+        params = func.params
+
+        for param in params:
+            dtype, shape = from_tensor(param.type_annotation)
+            node = InputNode("{0}".format(param.name_hint), {
+                "shape": shape,
+                "dtype": dtype,
+            })
+            self.let_bind(param, node)
+
+        # Then we compile the body into a graph which can depend
+        # on input variables.
+        output_ref = self.visit(func.body)
+
+        # Finally we retreive return value of program, which will
+        # become our output node.
+        self.get_node(output_ref).is_output = True
+
+    def visit_let(self, let):
+        """
+        Visit the let binding, by first traversing its value,
+        then setting the metadata on the returned NodeRef.
+
+        Finally visit the body, and return the NodeRef corresponding
+        to it.
+
+        Parameters
+        ----------
+        let: tvm.relay.Expr
+            The let binding to transform.
+
+        Returns
+        -------
+        ref: NodeRef
+            The node reference to the body.
+        """
+        ident = let.var
+        val = let.value
+        body = let.body
+
+        val_ref = self.visit(val)
+        dtype, shape = from_tensor(val.checked_type())
+        val_node = self.get_node(val_ref)
+        val_node.attrs["dtype"] = dtype
+        val_node.attrs["shape"] = shape
+        self.add_binding(ident, val_ref)
+        return self.visit(body)
+
+    def visit_var(self, var):
+        return self.lookup(var)
+
+    def visit_call(self, call):
+        """Transform a ::tvm.relay.Call into an operator in the TVM graph."""
+        inputs = []
+        for arg in call.args:
+            inputs.append(self.visit(arg).to_json())
+
+        if isinstance(call.op, Op):
+            raise Exception(
+                "Operators should be transformed away; try applying" +
+                "the fuse_ops transformation to the expression.")
+        elif isinstance(call.op, GlobalVar):
+            func = self.env[call.op]
+        elif isinstance(call.op, Function):
+            func = call.op
+        else:
+            raise Exception(
+                "TVM runtime does not support calls to {0}".format(type(call.op)))
+
+        if int(func.attrs.Primitive) != 1:
+            raise Exception(
+                "TVM only support calls to primitive functions " +
+                "(i.e functions composed of fusable operator invocations)")
+
+        op_name = func.attrs.LoweredFunc.name
+
+        attrs = {'shape': shape_to_json(call.checked_type.shape),
+                 'dtype': call.checked_type.dtype}
+        call_hash = str(ir_pass.structural_hash(call))
+        op_node = OpNode("call_" + call_hash, attrs, op_name, inputs, {})
+        return self.add_node(op_node)
+
+    def to_json(self):
+        """
+        Convert the sequence of nodes stored by the compiler into the
+        TVM graph runtime format.
+
+        Returns
+        -------
+        graph_json : str
+            The generated JSON as a string.
+        """
+        nodes = []
+        # First we compute "nodes" field.
+        for node in self.nodes:
+            nodes.append(node.to_json())
+
+        arg_nodes = []
+        heads = []
+        # Compute "arg_nodes" and "heads" fields.
+        for i, node in enumerate(self.nodes):
+            if isinstance(node, InputNode):
+                arg_nodes.append(i)
+
+            if node.is_output:
+                # Need to fix this.
+                heads.append(NodeRef(i).to_json())
+
+        def compute_node_row_ptr(nodes):
+            """Calculate the node_row_ptr field by doing a DFS backwards
+               from the output and reversing the path.
+            """
+            row_ptr = [len(nodes)]
+            discovered = set()
+            stack = []
+            stack.append(len(nodes) - 1)
+            while stack:
+                i = stack.pop()
+                if i not in discovered:
+                    discovered.add(i)
+                    row_ptr.append(i)
+                    node = nodes[i]
+                    if isinstance(node, OpNode):
+                        for inp in node.inputs:
+                            stack.append(inp[0])
+            row_ptr.reverse()
+            return row_ptr
+
+        # Compute "node_row_ptr".
+        node_row_ptr = compute_node_row_ptr(self.nodes)
+
+        # Compute "attrs" field.
+        attrs = {}
+
+        # These fields are mandatory.
+        shapes = []
+        storage_ids = []
+        dtype = []
+        dltype = []
+
+        for i, node in enumerate(self.nodes):
+            storage_ids.append(i)
+            shapes.append(node.attrs['shape'])
+            if node.attrs['dtype'] == 'float32':
+                dtype.append(0)
+                dltype.append('float32')
+
+        attrs["shape"] = ["list_shape", shapes]
+        attrs["storage_id"] = ["list_int", storage_ids]
+        attrs["dtype"] = ["list_int", dtype]
+        attrs["dltype"] = ["list_str", dltype]
+
+        json_dict = {
+            "nodes": nodes,
+            "arg_nodes": arg_nodes,
+            "heads": heads,
+            "attrs": attrs,
+            "node_row_ptr":  node_row_ptr
+        }
+
+        return json.dumps(json_dict)
+
+
+def build(env, func, target=None):
+    """
+    Compile a single function to the components needed by the
+    TVM RTS.
+
+    Parameters
+    ----------
+    func: relay.Expr
+        The function to build.
+
+    target: optional str
+        The target platform.
+
+    Returns
+    -------
+    (graph_json, mod, params): tuple of (str, tvm.Module, dict)
+        The outputs of building a Relay function for the TVM runtime.
+
+    """
+    if target is None:
+        target = 'llvm'
+
+    comp = GraphRuntimeCodegen(env)
+    # NB(@jroesch) This creates lowered functions, and generates names for them
+    #
+    # We need these names to emit the correct graph as these are names of the
+    # functions contained in the module.
+    lowered_ops = ir_pass.lower_ops(env, func)
+    mod = tvm_build_module([lf.lowered_func for lf in lowered_ops], target)
+
+    # Therefore the call to compile must come after.
+    comp.codegen(func)
+    graph_json = comp.to_json()
+    return graph_json, mod, None  # params currently isn't supported by API
+
+
+def graph_evaluate(env, func, *args):
+    """
+    Corresponding function to tvm.relay.eval.evaluate.
+
+    This function evaluates a Relay expression on the
+    TVM graph_runtime.
+
+    Parameters
+    ----------
+    env: tvm.relay.Environment
+        The global environment used.
+
+    expr: tvm.relay.Expr
+        The expression to evaluate.
+
+    args: list of tvm.relay.Expr
+        The arguments to apply to the expression, only works
+        if the expression has a function type.
+
+    Returns
+    -------
+    value: tvm.NDArray
+        The output Tensor produced by evaluating the expression.
+    """
+    func = infer_type(func, env)
+    func = ir_pass.fuse_ops(env, func)
+    func = infer_type(func, env)
+    graph_json, mod, params = build(env, func)
+    assert params is None
+    gmodule = graph_runtime.create(graph_json, mod, cpu(0))
+    # Create map of inputs.
+    inputs = {}
+    for i, arg in enumerate(args):
+        inputs[func.params[i].name_hint] = arg
+    # Set the inputs here.
+    gmodule.set_input(**inputs)
+    # Run the module, and fetch the output.
+    gmodule.run()
+    return gmodule.get_output(0)
diff --git a/python/tvm/relay/interpreter.py b/python/tvm/relay/interpreter.py
new file mode 100644
index 000000000000..06dc3c79fba4
--- /dev/null
+++ b/python/tvm/relay/interpreter.py
@@ -0,0 +1,130 @@
+#pylint: disable=no-else-return
+"""An interface to the Realy interpreter."""
+from __future__ import absolute_import
+import numpy as np
+from .. import register_func, nd
+from .base import NodeBase, register_relay_node
+from . import _make
+from . import _interpreter
+from . import ir_pass
+from .expr import Call, Constant, GlobalVar
+from . import const
+from .._ffi.base import integer_types
+
+class Value(NodeBase):
+    """Base class of all values.
+    """
+
+    @staticmethod
+    @register_func("relay.from_scalar")
+    def from_scalar(i, dtype=None):
+        """Convert a Python scalar to a Relay scalar."""
+        if dtype is None:
+            if isinstance(i, integer_types):
+                dtype = 'int32'
+            elif isinstance(i, float):
+                dtype = 'float32'
+            elif isinstance(i, bool):
+                dtype = 'uint8'
+            else:
+                raise Exception("unable to infer dtype {0}".format(type(i)))
+
+        return TensorValue(nd.array(np.array(i, dtype=dtype)))
+
+
+@register_relay_node
+class TupleValue(Value):
+    def __init__(self, *fields):
+        self.__init_handle_by_constructor__(
+            _make.TupleValue, fields)
+
+    def __getitem__(self, field_no):
+        return self.fields[field_no]
+
+
+@register_relay_node
+class Closure(Value):
+    pass
+
+
+@register_relay_node
+class TensorValue(Value):
+    """A Tensor value produced by the evaluator."""
+
+    def __init__(self, data):
+        """Allocate a new TensorValue and copy the data from `array` into
+           the new array.
+        """
+        if isinstance(data, np.ndarray):
+            data = nd.array(data)
+
+        self.__init_handle_by_constructor__(
+            _make.TensorValue, data)
+
+    def as_ndarray(self):
+        """Convert a Relay TensorValue into a tvm.ndarray."""
+        return self.data
+
+    def asnumpy(self):
+        """Convert a Relay TensorValue into a numpy.ndarray."""
+        return self.data.asnumpy()
+
+    def __eq__(self, other):
+        return self.data == other.data
+
+
+def _arg_to_ast(arg):
+    if isinstance(arg, TensorValue):
+        return Constant(arg.data)
+    elif isinstance(arg, np.ndarray):
+        return Constant(nd.array(arg))
+    elif isinstance(arg, Constant):
+        return arg
+    else:
+        return const(arg)
+
+
+def apply_passes(expr, env=None):
+    ck_expr = ir_pass.infer_type(expr, env=env)
+    fused_expr = ir_pass.fuse_ops(env, ck_expr)
+    return fused_expr
+
+
+def evaluate(env, expr, *args):
+    """
+    Evaluate a Relay expression on the interpreter.
+
+    Parameters
+    ----------
+    env: tvm.relay.Environment
+        The global environment used.
+
+    expr: tvm.relay.Expr
+        The expression to evaluate.
+
+    args: list of tvm.relay.Expr
+        The arguments to apply to the expression, only works
+        if the expression has a function type.
+
+    Returns
+    -------
+    value: tvm.relay.eval.Value
+        The value produced by evaluating the expression.
+    """
+    # assert len(args) == 0
+    relay_args = []
+    for arg in args:
+        relay_args.append(_arg_to_ast(arg))
+
+    # TODO: We need to move this optimization code into the optimizer/pass manager
+    if isinstance(expr, GlobalVar):
+        func = env[expr]
+        func = apply_passes(func, env)
+        env._add(expr, func, True)
+        opt_expr = Call(expr, relay_args)
+        # import pdb; pdb.set_trace()
+        return _interpreter.evaluate(env, opt_expr)
+    else:
+        expr = Call(expr, relay_args)
+        opt_expr = apply_passes(expr, env)
+        return _interpreter.evaluate(env, opt_expr)
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 82afa83ee376..68a07f190d42 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -240,3 +240,9 @@ def structural_hash(value):
         msg = ("found value of type {0} expected" +
                "relay.Expr or relay.Type").format(type(value))
         raise TypeError(msg)
+
+def fuse_ops(expr, env):
+    return _ir_pass.FuseOps(env, expr)
+
+def lower_ops(env, expr, target='llvm'):
+    return _ir_pass.LowerOps(env, expr, target)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 0bc2054cebdf..6ccb394ef8db 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -1,2 +1,49 @@
-#pylint: disable=invalid-name
+#pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
+import tvm
+import topi
+from . import register
+
+def add_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.add(inputs[0], inputs[1])]
+
+def add_schedule(outputs, target):
+    assert len(outputs) == 1
+    return tvm.create_schedule(outputs[0].op)
+
+register("add", "FTVMCompute", add_compute)
+register("add", "FTVMSchedule", add_schedule)
+
+def subtract_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.subtract(inputs[0], inputs[1])]
+
+def subtract_schedule(outputs, target):
+    assert len(outputs) == 1
+    return tvm.create_schedule(outputs[0].op)
+
+register("subtract", "FTVMCompute", subtract_compute)
+register("subtract", "FTVMSchedule", subtract_schedule)
+
+def multiply_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.multiply(inputs[0], inputs[1])]
+
+def multiply_schedule(outputs, target):
+    assert len(outputs) == 1
+    return tvm.create_schedule(outputs[0].op)
+
+register("multiply", "FTVMCompute", multiply_compute)
+register("multiply", "FTVMSchedule", multiply_schedule)
+
+def equal_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.equal(inputs[0], inputs[1])]
+
+def equal_schedule(outputs, target):
+    assert len(outputs) == 1
+    return tvm.create_schedule(outputs[0].op)
+
+register("equal", "FTVMCompute", equal_compute)
+register("equal", "FTVMSchedule", equal_schedule)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
new file mode 100644
index 000000000000..4f5dcd4dd08b
--- /dev/null
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -0,0 +1,16 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+import tvm
+import topi
+from .. import register
+
+def dense_compiler(attrs, inputs, output_type):
+    assert len(inputs) == 2
+    return [topi.nn.dense(inputs[0], inputs[1])]
+
+def dense_schedule(outputs, target):
+    assert len(outputs) == 1
+    return tvm.create_schedule(outputs[0].op)
+
+register("nn.dense", "FTVMCompute", dense_compiler)
+register("nn.dense", "FTVMSchedule", dense_schedule)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index f1130b52e7ce..0c09f39a3c83 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -3,7 +3,8 @@
 
 from ..base import register_relay_node
 from ..expr import Expr
-
+from ...api import register_func
+from ...build_module import lower, build
 
 @register_relay_node
 class Op(Expr):
@@ -75,3 +76,11 @@ def _register(v):
 
 
 _init_api("relay.op", __name__)
+
+@register_func("relay.op.compiler._lower")
+def _lower(name, schedule, inputs, outputs):
+    return lower(schedule, list(inputs) + list(outputs), name=name)
+
+@register_func("relay.op.compiler._build")
+def _build(lowered_funcs):
+    return build(lowered_funcs, target="llvm")
diff --git a/python/tvm/relay/testing/mlp.py b/python/tvm/relay/testing/mlp.py
index 67fa0d90c643..7d7d984f7526 100644
--- a/python/tvm/relay/testing/mlp.py
+++ b/python/tvm/relay/testing/mlp.py
@@ -17,6 +17,7 @@
 """
 a simple multilayer perceptron
 """
+from __future__ import absolute_import
 from tvm import relay
 from .init import create_workload
 
diff --git a/src/relay/interpreter.cc b/src/relay/interpreter.cc
new file mode 100644
index 000000000000..534a2a980e4a
--- /dev/null
+++ b/src/relay/interpreter.cc
@@ -0,0 +1,432 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/interpreter.cc
+ * \brief An interpreter for the Relay IR.
+ */
+
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/interpreter.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/build_module.h>
+#include "./ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+inline const PackedFunc& GetPackedFunc(const std::string& name) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get(name);
+  CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
+  return *pf;
+}
+
+/* Value Implementation */
+Closure ClosureNode::make(tvm::Map<Var, Value> env, Function func) {
+  NodePtr<ClosureNode> n = make_node<ClosureNode>();
+  n->env = std::move(env);
+  n->func = std::move(func);
+  return Closure(n);
+}
+
+TVM_REGISTER_API("relay._make.Closure")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      *ret = ClosureNode::make(args[0], args[1]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<ClosureNode>([](const ClosureNode* node, tvm::IRPrinter* p) {
+      p->stream << "ClosureNode(" << node->func << ")";
+    });
+
+TupleValue TupleValueNode::make(tvm::Array<Value> value) {
+  NodePtr<TupleValueNode> n = make_node<TupleValueNode>();
+  n->fields = value;
+  return TupleValue(n);
+}
+
+TVM_REGISTER_API("relay._make.TupleValue")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      *ret = TupleValueNode::make(args[0]);
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<TupleValueNode>([](const TupleValueNode* node,
+                                     tvm::IRPrinter* p) {
+      p->stream << "TupleValueNode(" << node->fields << ")";
+    });
+
+TensorValue TensorValueNode::make(runtime::NDArray data) {
+  NodePtr<TensorValueNode> n = make_node<TensorValueNode>();
+  n->data = std::move(data);
+  return TensorValue(n);
+}
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+    .set_dispatch<TensorValueNode>([](const TensorValueNode* node,
+                                      tvm::IRPrinter* p) {
+      auto to_str = GetPackedFunc("relay._tensor_value_repr");
+      std::string data_str = to_str(GetRef<TensorValue>(node));
+      p->stream << "TensorValueNode(" << data_str << ")";
+    });
+
+TensorValue TensorValueNode::FromType(const Type& t) {
+  if (auto tt_node = t.as<TensorTypeNode>()) {
+    std::vector<int64_t> dims;
+
+    for (auto dim : tt_node->shape) {
+      auto int_node = dim.as<tvm::ir::IntImm>();
+      CHECK(int_node) << "expected concrete dimensions";
+      dims.push_back(int_node->value);
+    }
+
+    DLDataType dtype;
+    DLContext context;
+
+    switch (tt_node->dtype.code()) {
+      case halideir_type_int:
+        dtype.code = kDLInt;
+        break;
+      case halideir_type_uint:
+        dtype.code = kDLUInt;
+        break;
+      case halideir_type_float:
+        dtype.code = kDLFloat;
+        break;
+      default:
+        throw dmlc::Error("can not convert HalideIR type into DLTensor dtype");
+    }
+
+    dtype.bits = tt_node->dtype.bits();
+    dtype.lanes = tt_node->dtype.lanes();
+
+    // TODO(@jroesch): Is this the right place to place the tensor?
+    context.device_type = DLDeviceType::kDLCPU;
+    context.device_id = 0;
+    runtime::NDArray data = NDArray::Empty(dims, dtype, context);
+    return TensorValueNode::make(data);
+  } else {
+    LOG(FATAL) << "expected a tensor type";
+    return TensorValue();
+  }
+}
+
+TVM_REGISTER_API("relay._make.TensorValue")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      runtime::NDArray data = args[0];
+      *ret = TensorValueNode::make(data);
+    });
+
+/* Evaluator Implementation. */
+struct EvalError : dmlc::Error {
+  explicit EvalError(const std::string& msg) : Error(msg) {}
+};
+
+/*!
+ * \brief A stack frame in the Relay interpreter.
+ *
+ * Contains a mapping from relay::Var to relay::Value.
+ */
+struct Frame {
+  /*! \brief The set of local variables and arguments for the frame. */
+  tvm::Map<Var, Value> locals;
+
+  explicit Frame(tvm::Map<Var, Value> locals) : locals(locals) {}
+};
+
+/*!
+ * \brief The call stack in the Relay interpreter.
+ *
+ * Contains a stack of frames; each corresponding to
+ * a function call.
+ */
+struct Stack {
+  /*! \brief The stack frames. */
+  std::vector<Frame> frames;
+  Stack() : frames() { frames.push_back(Frame({})); }
+
+  Frame& current_frame() { return frames.back(); }
+
+  Value Lookup(const Var& local) {
+    for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) {
+      auto elem = frame->locals.find(local);
+      if (elem != frame->locals.end()) {
+        return (*elem).second;
+      }
+    }
+
+    LOG(FATAL) << "could not find variable binding for " << local
+               << "address= " << local.operator->();
+    return Value();
+  }
+  /*!
+   * A wrapper around Frame to add RAII semantics to pushing and popping
+   * stack frames.
+   */
+  struct LocalFrame {
+    Stack& st;
+    explicit LocalFrame(Stack& st, const Frame& fr) : st(st) {
+      st.frames.push_back(fr);
+    }
+    ~LocalFrame() { st.frames.pop_back(); }
+  };
+};
+
+/*! \brief The equal comparator for expressions. */
+struct ExprEqual {
+  bool operator()(const Expr& a, const Expr& b) const {
+    return AlphaEqual(a, b);
+  }
+};
+
+struct Interpreter : ExprFunctor<Value(const Expr& n)> {
+  Environment env;
+  Stack stack;
+  using JitKey = Function;
+
+  using OpMap = std::unordered_map<JitKey, PackedFunc, StructuralHash, ExprEqual>;
+
+  OpMap operator_map_;
+
+  template <typename T>
+  T with_frame(const Frame& fr, const std::function<T()>& f) {
+    Stack::LocalFrame lf(stack, fr);
+    return f();
+  }
+
+  Interpreter(Environment env) : env(env), operator_map_() {}
+  Interpreter(Environment env, OpMap operator_map) : env(env), operator_map_(operator_map) {}
+
+  void extend(const Var& id, Value v) {
+    this->stack.current_frame().locals.Set(id, v);
+  }
+
+  inline Value Lookup(const Var& local) {
+    return this->stack.Lookup(local);
+  }
+
+  Value Eval(const Expr& expr) {
+    return (*this)(expr);
+  }
+
+  Value VisitExpr(const Expr& expr) override {
+    RELAY_LOG(INFO) << "VisitExpr: " << expr << std::endl;
+    auto ret = ExprFunctor<Value(const Expr& n)>::VisitExpr(expr);
+    return ret;
+  }
+
+  Value VisitExpr_(const VarNode* var_node) override {
+    return Lookup(GetRef<Var>(var_node));
+  }
+
+  Value VisitExpr_(const GlobalVarNode* op) override {
+    return Eval(this->env->Lookup(GetRef<GlobalVar>(op)));
+  }
+
+  Value VisitExpr_(const OpNode* id) override {
+    // TODO(@jroesch): Eta-expand and return in this case.
+    throw EvalError(
+        "internal error, need to wrap intrinsic into call synthetic call node "
+        "in "
+        "this case, eta expand");
+  }
+
+  Value VisitExpr_(const ConstantNode* op) override {
+    return TensorValueNode::make(op->data);
+  }
+
+  Value VisitExpr_(const TupleNode* op) override {
+    std::vector<Value> values;
+
+    for (const auto& field : op->fields) {
+      Value field_value = Eval(field);
+      values.push_back(field_value);
+    }
+
+    return TupleValueNode::make(values);
+  }
+
+  Value VisitExpr_(const FunctionNode* func_node) override {
+    auto func = GetRef<Function>(func_node);
+    tvm::Map<Var, Value> captured_env;
+    Array<Var> free_vars = FreeVars(func);
+
+    for (const auto& var : free_vars) {
+      captured_env.Set(var, Eval(var));
+    }
+
+    return ClosureNode::make(captured_env, func);
+  }
+
+  inline Value InvokeCompiledOp(PackedFunc func, const Array<Value>& args,
+                                Type ret_type) {
+    // Marshal the arguments.
+    auto arg_len = args.size() + 1;
+    std::vector<TVMValue> values(arg_len);
+    std::vector<int> codes(arg_len);
+    TVMArgsSetter setter(values.data(), codes.data());
+    TVMRetValue ret;
+
+    // We need real type information to properly allocate the structure.
+    for (size_t i = 0; i < args.size(); i++) {
+      if (const TensorValueNode* tv = args[i].as<TensorValueNode>()) {
+        setter(i, tv->data);
+      }
+    }
+
+    // TVM's calling convention is that the final argument is the output
+    // buffer. To preserve the illusion of being a functional language
+    // we need to allocate space for the output buffer based on the
+    // return type.
+    CHECK(ret_type.as<TensorTypeNode>());
+
+    auto out_tensor = TensorValueNode::FromType(ret_type);
+
+    setter(arg_len - 1, out_tensor->data);
+    func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &ret);
+    return out_tensor;
+  }
+
+  Value Invoke(const Closure& closure, const tvm::Array<Value>& args) {
+    // Get a reference to the function inside the closure.
+    auto func = closure->func;
+    auto compiled = operator_map_.find(func);
+    tvm::Array<Function> funcs;
+    for (auto op : operator_map_) {
+      funcs.push_back(op.first);
+    }
+
+    // This case we know we have precompiled the operator.
+    if (compiled != operator_map_.end()) {
+      auto func_ty = func->func_type_annotation();
+      return InvokeCompiledOp(compiled->second, args, func_ty->ret_type);
+    }
+
+    // Allocate a frame with the parameters and free variables.
+    tvm::Map<Var, Value> locals;
+
+    CHECK_EQ(func->params.size(), args.size());
+
+    for (size_t i = 0; i < func->params.size(); i++) {
+      CHECK_EQ(locals.count(func->params[i]), 0);
+      locals.Set(func->params[i], args[i]);
+    }
+
+    // Add the var to value mappings from the Closure's environment.
+    for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
+      CHECK_EQ(locals.count((*it).first), 0);
+      locals.Set((*it).first, (*it).second);
+    }
+
+    return with_frame<Value>(Frame(locals), [&]() { return Eval(func->body); });
+  }
+
+  Value VisitExpr_(const CallNode* call) override {
+    tvm::Array<Value> args;
+    for (auto arg : call->args) {
+      args.push_back(Eval(arg));
+    }
+
+    // We should not find operators after running fusion,
+    // and operator lowering.
+    //
+    // We have some functions cotaining chunks of operators
+    // which will be loaded into operator map.
+    if (auto op_node = call->op.as<OpNode>()) {
+      LOG(FATAL) << "found " << op_node->name
+                 << "; operators should be removed by future passes; try "
+                    "fusing and lowering";
+    }
+
+    // Now we just evaluate and expect to find a closure.
+    Value fn_val = Eval(call->op);
+    if (const ClosureNode* closure_node = fn_val.as<ClosureNode>()) {
+      auto closure = GetRef<Closure>(closure_node);
+      return this->Invoke(closure, args);
+    } else {
+      throw EvalError(
+          "internal error: type error, expected function value in the call "
+          "position");
+    }
+  }
+
+  Value VisitExpr_(const LetNode* op) override {
+    auto value = Eval(op->value);
+    this->extend(op->var, value);
+    return Eval(op->body);
+  }
+
+  Value VisitExpr_(const TupleGetItemNode* op) override {
+    Value val = Eval(op->tuple);
+    auto product_node = val.as<TupleValueNode>();
+    CHECK(product_node)
+      << "interal error: when evaluating TupleGetItem expected a tuple value";
+    CHECK_LT(static_cast<size_t>(op->index), product_node->fields.size())
+      << "internal error: index out of bounds";
+    return product_node->fields[op->index];
+  }
+
+  Value VisitExpr_(const IfNode* op) override {
+    Value v = Eval(op->cond);
+    if (const TensorValueNode* bv = v.as<TensorValueNode>()) {
+      // TODO(@jroesch, @MK): Refactor code into helper from DCE.
+      if (reinterpret_cast<uint8_t*>(bv->data->data)[0]) {
+        return Eval(op->true_branch);
+      } else {
+        return Eval(op->false_branch);
+      }
+    } else {
+      throw EvalError("type error, type system should have caught this");
+    }
+  }
+};
+
+Interpreter::OpMap CompileOperators(const Environment& env, const Expr& e) {
+  Interpreter::OpMap op_map;
+  auto lowered_ops = LowerOps(env, e);
+  RELAY_LOG(INFO) << "LoweredFuncs: " << lowered_ops << std::endl;
+  if (lowered_ops.size()) {
+    const PackedFunc* fbuild_ptr = Registry::Get("relay.op.compiler._build");
+    CHECK(fbuild_ptr) << "Could not find registered function: relay.op.compiler._build";
+    auto fbuild = *fbuild_ptr;
+
+    // Collect the set of lowered functions to build a module.
+    Array<LoweredFunc> lowered_funcs;
+    for (auto lop : lowered_ops) {
+      lowered_funcs.push_back(lop->lowered_func);
+    }
+
+    Module module = fbuild(lowered_funcs);
+
+    // Loop over the lowered operations to map them into the operator map.
+    for (auto lop : lowered_ops) {
+      Function func = lop->func;
+      LoweredFunc lf = lop->lowered_func;
+
+      RELAY_LOG(INFO) << "LoweredFunc: " << lf->name << std::endl;
+      auto op_impl = module.GetFunction(lf->name);
+      op_map.insert({func, op_impl});
+    }
+  }
+
+  return op_map;
+}
+
+Value Evaluate(Environment env, Expr e) {
+  auto op_map = CompileOperators(env, e);
+  Interpreter interp(env, op_map);
+  return interp.Eval(e);
+}
+
+TVM_REGISTER_API("relay._interpreter.evaluate")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      Environment env = args[0];
+      Expr expr = args[1];
+      *ret = Evaluate(env, expr);
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
index a68910e56b71..1f73f297f99a 100644
--- a/src/relay/ir/base.cc
+++ b/src/relay/ir/base.cc
@@ -66,3 +66,5 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 
 }  // namespace relay
 }  // namespace tvm
+
+
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/environment.cc
index dddad82c8afc..262758ba0478 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/environment.cc
@@ -49,9 +49,16 @@ void EnvironmentNode::Add(const GlobalVar& var,
         << "Environment#update changes type, not possible in this mode.";
   }
   this->functions.Set(var, checked_func);
-  // set gloval var map
-  CHECK(!global_var_map_.count(var->name_hint))
-      << "Duplicate global function name " << var->name_hint;
+
+  auto it = global_var_map_.find(var->name_hint);
+  if (it != global_var_map_.end()) {
+    CHECK_EQ((*it).second, var);
+  } else {
+    // set global var map
+    CHECK(!global_var_map_.count(var->name_hint))
+        << "Duplicate global function name " << var->name_hint;
+  }
+
   global_var_map_.Set(var->name_hint, var);
 }
 
@@ -94,7 +101,7 @@ TVM_REGISTER_API("relay._make.Environment")
 TVM_REGISTER_API("relay._env.Environment_Add")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     Environment env = args[0];
-    env->Add(args[1], args[2], false);
+    env->Add(args[1], args[2], args[3]);
   });
 
 TVM_REGISTER_API("relay._env.Environment_GetGlobalVar")
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index c75c414c8ce9..993892a94861 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -26,7 +26,10 @@ TVM_REGISTER_API("relay._make.Constant")
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<ConstantNode>([](const ConstantNode* node, tvm::IRPrinter* p) {
-    p->stream << "Constant(TODO)";
+    const PackedFunc* fprint = Registry::Get("relay._constant_repr");
+    CHECK(fprint) << "unable to find printing function for constants";
+    std::string data = (*fprint)(GetRef<Constant>(node));
+    p->stream << "Constant(" << data << ")";
   });
 
 TensorType ConstantNode::tensor_type() const {
@@ -104,12 +107,14 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 Function FunctionNode::make(tvm::Array<Var> params,
                             Expr body,
                             Type ret_type,
-                            tvm::Array<TypeVar> type_params) {
+                            tvm::Array<TypeVar> type_params,
+                            tvm::Attrs attrs) {
   NodePtr<FunctionNode> n = make_node<FunctionNode>();
   n->params = std::move(params);
   n->body = std::move(body);
   n->ret_type = std::move(ret_type);
   n->type_params = std::move(type_params);
+  n->attrs = std::move(attrs);
   return Function(n);
 }
 
@@ -121,6 +126,39 @@ FuncType FunctionNode::func_type_annotation() const {
   return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
 }
 
+NodeRef FunctionGetAttr(const Function& func, const std::string& key) {
+  if (!func->attrs.defined()) { return NodeRef(); }
+
+  const DictAttrsNode* dict_attrs = func->attrs.as<DictAttrsNode>();
+  CHECK(dict_attrs);
+  auto it = dict_attrs->dict.find(key);
+  if (it != dict_attrs->dict.end()) {
+    return (*it).second;
+  } else {
+    return NodeRef();
+  }
+}
+
+Function FunctionSetAttr(const Function& func, const std::string& key, const NodeRef& data) {
+  const DictAttrsNode* dattrs = func->attrs.as<DictAttrsNode>();
+  Attrs func_attrs;
+  if (dattrs) {
+    Map<std::string, NodeRef> dict = dattrs->dict;
+    dict.Set(key, data);
+    func_attrs = DictAttrsNode::make(dict);
+  } else {
+    Map<std::string, NodeRef> dict = {{key, data}};
+    func_attrs = DictAttrsNode::make(dict);
+  }
+
+  return FunctionNode::make(
+    func->params,
+    func->body,
+    func->ret_type,
+    func->type_params,
+    func_attrs);
+}
+
 TVM_REGISTER_NODE_TYPE(FunctionNode);
 
 TVM_REGISTER_API("relay._make.Function")
@@ -132,7 +170,8 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<FunctionNode>([](const FunctionNode* node,
                                    tvm::IRPrinter* p) {
       p->stream << "FunctionNode(" << node->params << ", " << node->ret_type
-                << ", " << node->body << ", " << node->type_params << ")";
+                << ", " << node->body << ", " << node->type_params << ", "
+                << node->attrs << ")";
 });
 
 Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index ed7c1d1d1e5a..08f903a26d3e 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -92,7 +92,7 @@ Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
       body.same_as(op->body)) {
     return GetRef<Expr>(op);
   } else {
-    return FunctionNode::make(params, body, ret_type, ty_params);
+    return FunctionNode::make(params, body, ret_type, ty_params, op->attrs);
   }
 }
 
@@ -198,6 +198,7 @@ void ExprVisitor::ExprVisitor::VisitExpr_(const FunctionNode* op) {
 
 void ExprVisitor::VisitExpr_(const CallNode* op) {
   this->VisitExpr(op->op);
+
   for (auto ty_arg : op->type_args) {
     this->VisitType(ty_arg);
   }
diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
index ce2049f269df..4fd91256db9c 100644
--- a/src/relay/ir/hash.cc
+++ b/src/relay/ir/hash.cc
@@ -285,11 +285,11 @@ class RelayHashHandler:
   int var_counter = 0;
 };
 
-size_t StructuralHash(const Type& type) {
+size_t StructuralHash::operator()(const Type& type) const {
   return RelayHashHandler().TypeHash(type);
 }
 
-size_t StructuralHash(const Expr& expr) {
+size_t StructuralHash::operator()(const Expr& expr) const {
   return RelayHashHandler().ExprHash(expr);
 }
 
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
new file mode 100644
index 000000000000..3aea12931649
--- /dev/null
+++ b/src/relay/pass/fuse_ops.cc
@@ -0,0 +1,86 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file src/tvm/relay/pass/fuse_ops.cc
+ *
+ * \brief Fuse Relay eligble sequences of Relay operators into a single one.
+ *
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/runtime/module.h>
+#include <tvm/lowered_func.h>
+#include <tvm/operation.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/logging.h>
+#include "../ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+struct AbstractFusableOps : ExprMutator {
+  Environment env;
+  Array<GlobalVar> fusable_funcs;
+  int counter = 0;
+  size_t expr_hash;
+
+  AbstractFusableOps(Environment env, size_t expr_hash) : env(env), expr_hash(expr_hash) {}
+
+  Expr VisitExpr_(const CallNode* call) {
+    if (auto op_node = call->op.as<OpNode>()) {
+      // Placeholder fusion algorithm which abstracts
+      // single definitions into functions only.
+      Array<Var> params;
+      Array<Expr> inner_args;
+      Array<Expr> args;
+
+      int param_number = 0;
+      for (auto arg : call->args) {
+        auto name = std::string("p") + std::to_string(param_number++);
+        auto type = arg->checked_type();
+        auto var = VarNode::make(name, type);
+        params.push_back(var);
+        inner_args.push_back(var);
+        args.push_back(VisitExpr(arg));
+      }
+
+      auto body = CallNode::make(call->op, inner_args, call->attrs);
+      auto func = FunctionNode::make(params, body, call->checked_type(), {});
+      func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
+      std::string func_name = "fused_";
+      func_name += op_node->name;
+      func_name += "_";
+      func_name += std::to_string(counter++);
+      func_name += "_";
+      func_name += std::to_string(expr_hash);
+      auto gv = GlobalVarNode::make(func_name);
+      env->Add(gv, func);
+      fusable_funcs.push_back(gv);
+      return CallNode::make(gv, args, Attrs());
+    } else {
+      return ExprMutator::VisitExpr_(call);
+    }
+  }
+};
+
+Expr FuseOps(const Environment& env, const Expr& e) {
+  // First we convert all chains of fusable ops into
+  // abstracted functions which we mark as primtive
+  // then we convert these primtive functions into
+  // new operators.
+  auto abstract = AbstractFusableOps(env, StructuralHash()(e));
+  auto abstracted_e = abstract.VisitExpr(e);
+  RELAY_LOG(INFO) << "FuseOps: before=" << e
+                  << "Fuse: after=" << abstracted_e;
+  return abstracted_e;
+}
+
+TVM_REGISTER_API("relay._ir_pass.FuseOps")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = FuseOps(args[1], args[0]);
+});
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/lower_ops.cc b/src/relay/pass/lower_ops.cc
new file mode 100644
index 000000000000..6bab9a924269
--- /dev/null
+++ b/src/relay/pass/lower_ops.cc
@@ -0,0 +1,222 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file src/tvm/relay/pass/lower_ops.cc
+ *
+ * \brief Lower a Relay program to set of TVM operators.
+ *
+ */
+#include <tvm/lowered_func.h>
+#include <tvm/operation.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/pass.h>
+#include <tvm/runtime/module.h>
+#include <tvm/relay/build_module.h>
+#include "../ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+LoweredOp LoweredOpNode::make(Function func, LoweredFunc lowered_func) {
+  auto node = make_node<LoweredOpNode>();
+  node->func = func;
+  node->lowered_func = lowered_func;
+  return LoweredOp(node);
+}
+
+struct AbstractLocalFunctions : ExprMutator {
+  Environment env;
+  size_t expr_hash;
+  int counter = 0;
+  std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
+  explicit AbstractLocalFunctions(Environment env)
+      : env(env), expr_hash(0), counter(0), visited_funcs() {}
+
+  Expr Abstract(const Expr& e) {
+    expr_hash = StructuralHash()(e);
+    return VisitExpr(e);
+  }
+
+  Expr VisitExpr_(const GlobalVarNode* gvar_node) final {
+    auto gvar = GetRef<GlobalVar>(gvar_node);
+    auto it = visited_funcs.find(gvar);
+    if (it == visited_funcs.end()) {
+      auto func = env->Lookup(gvar);
+      visited_funcs.insert(gvar);
+      auto new_func = FunctionNode::make(
+        func->params,
+        VisitExpr(func->body),
+        func->ret_type,
+        func->type_params,
+        func->attrs);
+      env->Update(gvar, new_func);
+    }
+    return gvar;
+  }
+
+  Expr VisitExpr_(const FunctionNode* func_node) final {
+    Function func = GetRef<Function>(func_node);
+    auto free_vars = FreeVars(func);
+    Array<Var> params;
+    for (auto free_var : free_vars) {
+      auto var = VarNode::make("free_var", free_var->checked_type());
+      params.push_back(var);
+    }
+    std::string abs_func = "abstracted_func_";
+    abs_func += std::to_string(counter++);
+    abs_func += std::to_string(expr_hash);
+    auto gv = GlobalVarNode::make(abs_func);
+    auto lifted_func = FunctionNode::make(params, func, Type(), {}, {});
+    env->Add(gv, lifted_func);
+    Array<Expr> args;
+    for (auto free_var : free_vars) {
+      args.push_back(free_var);
+    }
+    return CallNode::make(gv, args, {});
+  }
+};
+
+struct LiveFunctions : ExprVisitor {
+  Environment env;
+  explicit LiveFunctions(Environment env) : env(env), global_funcs() {}
+
+  std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
+  std::unordered_set<GlobalVar, NodeHash, NodeEqual> global_funcs;
+
+  void Live(const Expr& e) {
+    CHECK(!e.as<FunctionNode>())
+        << "functions should of been transformed away by previous pass";
+    VisitExpr(e);
+  }
+
+  void VisitExpr_(const FunctionNode* func_node) {
+    LOG(FATAL) << "functions should of been transformed away by previous pass";
+  }
+
+  void VisitExpr_(const GlobalVarNode* var_node) final {
+    GlobalVar var = GetRef<GlobalVar>(var_node);
+    auto it = visited_funcs.find(var);
+    if (it == visited_funcs.end()) {
+      auto func = env->Lookup(var);
+      visited_funcs.insert(var);
+      // The last pass has trasnformed functions of the form:
+      //
+      // let x = fn (p_1, ..., p_n) { ... };
+      // ...
+      //
+      // into, a top-level declaration:
+      //
+      // def abs_f(fv_1, ..., fv_n) {
+      //    return (fn (p_1...,p_N) { ... };)
+      // }
+      //
+      // and:
+      //
+      // let x = abs_f(fv_1, ... fv_n);
+      //
+      // The only other case we can handle is
+      //
+      // fn foo(...) { body }
+      //
+      // We just search through the body in this case.
+      if (auto inner_func = func->body.as<FunctionNode>()) {
+        return VisitExpr(inner_func->body);
+      } else {
+        return VisitExpr(func->body);
+      }
+    }
+  }
+
+  void VisitExpr_(const CallNode* call) final {
+    RELAY_LOG(INFO) << "LiveOps: CallNode=" << GetRef<Call>(call);
+    if (auto gv_node = call->op.as<GlobalVarNode>()) {
+      GlobalVar gvar = GetRef<GlobalVar>(gv_node);
+      Function func = env->Lookup(gvar);
+
+      auto attr = FunctionGetAttr(func, "Primitive");
+
+      if (attr.defined() && Downcast<Integer>(attr)->value == 1) {
+        global_funcs.insert(gvar);
+      } else {
+         VisitExpr(gvar);
+      }
+
+      // Finally we need to ensure to visit all the args no matter what.
+      for (auto arg : call->args) {
+        VisitExpr(arg);
+      }
+    } else {
+      return ExprVisitor::VisitExpr_(call);
+    }
+  }
+};
+
+using FCompute = TypedPackedFunc<Array<Tensor>(
+    const Attrs&, const Array<Tensor>&, Type, std::string)>;
+using FSchedule = TypedPackedFunc<Schedule(const Array<Tensor>&, std::string)>;
+
+/*! \brief Return the set of operators in their TVM format. */
+Array<LoweredOp> LowerOps(const Environment& env, const Expr& e,
+                          const std::string& target) {
+  RELAY_LOG(INFO) << "LowerOps: e=" << e;
+  auto flower_ptr = Registry::Get("relay.op.compiler._lower");
+  CHECK(flower_ptr);
+  PackedFunc flower = *flower_ptr;
+
+  auto abstracted_e = AbstractLocalFunctions(env).Abstract(e);
+  auto live_funcs = LiveFunctions(env);
+  live_funcs.VisitExpr(abstracted_e);
+
+  auto schedule_reg = Op::GetAttr<FSchedule>("FTVMSchedule");
+  auto compute_reg = Op::GetAttr<FCompute>("FTVMCompute");
+
+  Array<LoweredOp> lowered_funcs;
+
+  for (auto func_name : live_funcs.global_funcs) {
+    auto func = env->Lookup(func_name);
+    auto call = Downcast<Call>(func->body);
+    auto op_node = call->op.as<OpNode>();
+    CHECK(op_node) << "violated invariant that primtiive calls contain a single op call";
+    auto op = GetRef<Op>(op_node);
+    RELAY_LOG(INFO) << "LowerOps: Lowering " << op->name;
+
+    CHECK(IsPrimitiveOp(op)) << "failed to lower "
+      << op->name << "can only lower primitve operations";
+
+    Array<Tensor> inputs;
+    std::string input_name = "in";
+    int i = 0;
+    for (auto type_arg : call->type_args) {
+      auto tt = Downcast<TensorType>(type_arg);
+      inputs.push_back(PlaceholderOpNode::make(input_name + std::to_string(i),
+                                               tt->shape, tt->dtype)
+                           .output(0));
+      i++;
+    }
+
+    auto output_tt = op->op_type->ret_type;
+    Array<Tensor> outputs =
+        compute_reg[op](call->attrs, inputs, output_tt, target);
+    auto schedule = schedule_reg[op](outputs, target);
+    size_t hash = StructuralHash()(func);
+    LoweredFunc lf =
+        flower(op->name + std::to_string(hash), schedule, inputs, outputs);
+    func = FunctionSetAttr(func, "LoweredFunc", lf);
+    env->Add(func_name, func, true);
+    lowered_funcs.push_back(LoweredOpNode::make(func, lf));
+  }
+
+  return lowered_funcs;
+}
+
+TVM_REGISTER_API("relay._ir_pass.LowerOps")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = LowerOps(args[0], args[1], args[2]);
+});
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index e3e8ad7ecdf7..864b7ad78abd 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -298,8 +298,8 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     auto* fn_ty_node = ftype.as<FuncTypeNode>();
 
     CHECK(fn_ty_node != nullptr)
-        << "only expressions with function types can be called, at "
-        << call->span;
+        << "only expressions with function types can be called, found "
+        << ftype << " at " << call->span;
 
     Array<Type> type_args;
     FuncType fn_ty = Instantiate(fn_ty_node, &type_args);
@@ -505,12 +505,16 @@ Expr TypeInferencer::Infer(Expr expr) {
   // Step 1: Solve the constraints.
   solver_.Solve();
   // Step 2: Attach resolved types to checked_type field.
-  return Resolver(type_map_, &solver_).VisitExpr(expr);
+  auto resolved_expr = Resolver(type_map_, &solver_).VisitExpr(expr);
+  CHECK(WellFormed(resolved_expr));
+  return resolved_expr;
 }
 
 
 Expr InferType(const Expr& expr, const Environment& env) {
-  return TypeInferencer(env).Infer(expr);
+  auto e = TypeInferencer(env).Infer(expr);
+  CHECK(WellFormed(e));
+  return e;
 }
 
 Function InferType(const Function& func,
@@ -522,6 +526,7 @@ Function InferType(const Function& func,
   Expr func_ret = TypeInferencer(env).Infer(func_copy);
   auto map_node = env->functions.CopyOnWrite();
   map_node->data.erase(var.node_);
+  CHECK(WellFormed(func_ret));
   return Downcast<Function>(func_ret);
 }
 
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index c1f00c7b65e0..51ef0377868f 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -3,7 +3,7 @@
  *
  * \file util.cc
  *
- * \brief simple util for relay.
+ * \brief Utility functions for Relay.
  */
 #include <tvm/relay/pass.h>
 #include <tvm/relay/expr_functor.h>
diff --git a/tests/python/relay/test_graph_runtime.py b/tests/python/relay/test_graph_runtime.py
new file mode 100644
index 000000000000..1e55f890e514
--- /dev/null
+++ b/tests/python/relay/test_graph_runtime.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+from tvm import relay
+from tvm.relay.ir_pass import infer_type
+from tvm.relay.interpreter import evaluate
+from tvm.relay.graph_runtime_codegen import graph_evaluate
+from tvm.relay.scope_builder import ScopeBuilder
+from tvm.relay.op import add
+from tvm.relay.env import Environment
+
+# @tq, @jr should we put this in testing ns?
+def check_rts(env, expr, args, expected_result):
+    """
+    Check that evaluating `expr` applied to the arguments produces
+    `result` on both the evaluator and TVM runtime.
+
+    Parameters
+    ----------
+    expr:
+        The expression to evaluate
+
+    args: list of Expr
+        The arguments to supply the expr.
+
+    expected_result:
+        The expected result of running the expression.
+    """
+    eval_result = evaluate(env, expr, *args)
+    rts_result = graph_evaluate(env, expr, *args)
+    np.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
+
+def test_add_op_scalar():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    env = Environment()
+    x = relay.var('x', shape=())
+    y = relay.var('y', shape=())
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.array(10.0, dtype='float32')
+    y_data = np.array(1.0, dtype='float32')
+    check_rts(env, func, [x_data, y_data], x_data + y_data)
+
+def test_add_op_tensor():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    env = Environment()
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(10, 5))
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(10, 5).astype('float32')
+    check_rts(env, func, [x_data, y_data], x_data + y_data)
+
+def test_add_op_broadcast():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    env = Environment()
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(1, 5))
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(1, 5).astype('float32')
+    check_rts(env, func, [x_data, y_data], x_data + y_data)
+
+if __name__ == "__main__":
+    test_add_op_scalar()
+    test_add_op_tensor()
+    test_add_op_broadcast()
diff --git a/tests/python/relay/test_interpreter.py b/tests/python/relay/test_interpreter.py
new file mode 100644
index 000000000000..9a431b4c9524
--- /dev/null
+++ b/tests/python/relay/test_interpreter.py
@@ -0,0 +1,142 @@
+import numpy as np
+import tvm
+from tvm import relay
+from tvm.relay.interpreter import Value, TupleValue, evaluate
+from tvm.relay import op
+from tvm.relay.scope_builder import ScopeBuilder
+from tvm.relay import testing
+
+
+def check_eval(expr, args, expected_result, env=None, rtol=1e-07):
+    if env is None:
+        env = relay.env.Environment({})
+
+    result = evaluate(env, expr, *args)
+    np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
+
+
+def test_from_scalar():
+    np.testing.assert_allclose(Value.from_scalar(1, 'int32').asnumpy(), 1)
+    np.testing.assert_allclose(Value.from_scalar(10.0, 'float32').asnumpy(), 10.0)
+    np.testing.assert_allclose(Value.from_scalar(True).asnumpy(), True)
+
+
+def test_tuple_value():
+    tv = TupleValue(Value.from_scalar(
+        1), Value.from_scalar(2), Value.from_scalar(3))
+    np.testing.assert_allclose(tv[0].asnumpy(), 1)
+    np.testing.assert_allclose(tv[1].asnumpy(), 2)
+    np.testing.assert_allclose(tv[2].asnumpy(), 3)
+
+
+def test_id():
+    x = relay.var('x', 'float32')
+    ident = relay.Function([x], x)
+    env = relay.env.Environment({})
+    res = evaluate(env, ident, 1.0)
+    check_eval(ident, [1.0], 1.0)
+
+
+def test_add_const():
+    two = op.add(relay.const(1), relay.const(1))
+    func = relay.Function([], two)
+    check_eval(func, [], 2)
+
+
+def test_mul_param():
+    x = relay.var('x', shape=(10, 10))
+    y = relay.var('y', shape=(1, 10))
+    func = relay.Function([x, y], op.multiply(x, y))
+    x_data = np.random.rand(10, 10).astype('float32')
+    y_data = np.random.rand(1, 10).astype('float32')
+    check_eval(func, [x_data, y_data], x_data * y_data)
+
+
+# failing due to numeric issues
+
+# def test_dense():
+#     x = relay.var('x', shape=(10, 10))
+#     w = relay.var('w', shape=(10, 10))
+#     y = op.nn.dense(x, w)
+#     func = relay.Function([x, w], y)
+#     x_data = np.random.rand(10, 10).astype('float32')
+#     w_data = np.random.rand(10, 10).astype('float32')
+#     check_eval(func, [x_data, w_data], x_data @ w_data, rtol=0.1)
+
+# def test_linear():
+#     x = relay.var('x', shape=(10, 10))
+#     w = relay.var('w', shape=(10, 10))
+#     b = relay.var('b', shape=(10,))
+#     y = op.add(op.nn.dense(x, w), b)
+#     func = relay.Function([x, w, b], y)
+#     x_data = np.random.rand(10, 10).astype('float32')
+#     w_data = np.random.rand(10, 10).astype('float32')
+#     b_data = np.random.rand(10).astype('float32')
+#     check_eval(func, [x_data, w_data, b_data], x_data @ w_data + b_data)
+
+def test_equal():
+    i = relay.var('i', shape=[], dtype='int32')
+    j = relay.var('i', shape=[], dtype='int32')
+    z = op.equal(i, j)
+    func = relay.Function([i, j], z, ret_type=relay.TensorType([], 'bool'))
+    i_data = relay.const(0)
+    j_data = relay.const(0)
+    check_eval(func, [i_data, j_data], True)
+
+def test_subtract():
+    i = relay.var('i', shape=[], dtype='int32')
+    sub = op.subtract(i, relay.const(1, dtype='int32'))
+    func = relay.Function([i], sub, ret_type=relay.TensorType([], 'int32'))
+    i_data = np.array(1, dtype='int32')
+    check_eval(func, [i_data], 0)
+
+def test_simple_loop():
+    env = relay.env.Environment({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(op.equal(i, relay.const(0, dtype='int32'))):
+        sb.ret(i)
+    with sb.else_scope():
+        one_less = op.subtract(i, relay.const(1, dtype='int32'))
+        rec_call = relay.Call(sum_up, [one_less])
+        sb.ret(op.add(rec_call, i))
+    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
+    env[sum_up] = func
+    i_data = np.array(10, dtype='int32')
+    check_eval(sum_up, [i_data], sum(range(1, 11)), env=env)
+
+def test_loop():
+    env = relay.env.Environment({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    accum = relay.var('accum', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(op.equal(i, relay.const(0))):
+        sb.ret(accum)
+    with sb.else_scope():
+        one_less = op.subtract(i, relay.const(1))
+        new_accum = op.add(accum, i)
+        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
+    func = relay.Function([i, accum], sb.get())
+    env[sum_up] = func
+    i_data = np.array(10, dtype='int32')
+    accum_data = np.array(0, dtype='int32')
+    check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), env=env)
+
+def test_mlp():
+    pass
+    # net = testing.mlp.get_workload(1)
+    # import pdb; pdb.set_trace()
+
+if __name__ == "__main__":
+    test_id()
+    test_add_const()
+    # test_dense()
+    # test_linear()
+    test_equal()
+    test_subtract()
+    test_simple_loop()
+    test_loop()
+    test_mlp()
+
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index b1823004022c..31d350dc7ff7 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -5,6 +5,16 @@
 import numpy as np
 from tvm.relay.ir_pass import infer_type
 from tvm import relay
+from tvm.relay import op
+from tvm.relay.scope_builder import ScopeBuilder
+
+
+def assert_has_type(expr, typ, env=relay.env.Environment({})):
+    checked_expr = infer_type(expr, env)
+    checked_type = checked_expr.checked_type
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
 
 
 def test_monomorphic_let():
@@ -16,6 +26,31 @@ def test_monomorphic_let():
     assert xchecked.checked_type == relay.scalar_type("float64")
 
 
+def test_single_op():
+    "Program: fn (x : float32) { let t1 = f(x); t1 }"
+    x = relay.var('x', shape=[])
+    func = relay.Function([x], op.log(x))
+    ttype = relay.TensorType([], dtype='float32')
+    assert_has_type(func, relay.FuncType([ttype], ttype))
+
+
+def test_add_broadcast_op():
+    """
+    Program:
+        fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
+            return x + y;
+        }
+    """
+    pass
+    # x = relay.var('x', shape=(10, 4))
+    # y = relay.var('y', shape=(5, 10, 1))
+    # z = x + y
+    # func = relay.Function([x, y], z)
+    # ttype = relay.TensorType((5, 5, 5), 'float32')
+    # expected_ty = relay.FuncType([ttype, ttype], ttype)
+    # assert_has_type(func.to_func(), expected_ty)
+
+
 def test_dual_op():
     """Program:
        fn (x : Tensor[f32, (10, 10)]) {
@@ -41,7 +76,6 @@ def f(x : Tensor[(10, 10), f32]) {
            return log(x);
        }
     """
-    sb = relay.ScopeBuilder()
     tp = relay.TensorType((10, 10))
     x = relay.var("x", tp)
     f = relay.Function([x], relay.log(x))
@@ -76,6 +110,24 @@ def f(n: i32, data: f32) -> f32 {
     assert "%3 = @f(%1, %2)" in env.astext()
     assert env[f].checked_type == relay.FuncType([ti32, tf32], tf32)
 
+# This currently fails and should pass under the type system.
+#
+# This test is to illustrate problem with our weak form of
+# unification.
+#
+
+
+def test_incomplete_call():
+    sb = ScopeBuilder()
+    x = relay.var('x', dtype='int32')
+    f = relay.var('f')
+    func = relay.Function([x, f], relay.Call(f, [x]))
+
+    try:
+        relay.ir_pass.infer_type(func)
+        assert False
+    except tvm.TVMError as e:
+        assert True
 
 def test_tuple():
     tp = relay.TensorType((10,))
@@ -84,13 +136,13 @@ def test_tuple():
     assert (relay.ir_pass.infer_type(res).checked_type ==
             relay.TupleType([tp, tp]))
 
-
 def test_free_expr():
     x = relay.var("x", "float32")
     y = relay.add(x, x)
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.scalar_type("float32")
 
+
 def test_type_args():
     x = relay.var("x", shape=(10, 10))
     y = relay.var("y", shape=(1, 10))
@@ -107,6 +159,7 @@ def test_type_args():
     assert sh2[0].value == 1
     assert sh2[1].value == 10
 
+
 def test_self_reference():
     """
     Program:
@@ -117,31 +170,41 @@ def f(x) {
     a = relay.TypeVar("a")
     x = relay.var("x", a)
     sb = relay.ScopeBuilder()
+
     f = relay.Function([x], x)
     fx = relay.Call(f, [x])
     assert relay.ir_pass.infer_type(x).checked_type == a
     assert relay.ir_pass.infer_type(f).checked_type == relay.FuncType([a], a)
     assert relay.ir_pass.infer_type(fx).checked_type == a
 
+
 def test_global_var_cow_issue():
     env = relay.env.Environment({})
     gv = relay.GlobalVar("foo")
     x = relay.var('x', shape=[])
-    func = relay.Function([x], relay.Call(gv, [x]), relay.TensorType([], 'float32'))
+    func = relay.Function([x], relay.Call(gv, [x]),
+                          relay.TensorType([], 'float32'))
     env[gv] = func
-    # They should both point to the same global variable if global variables are
-    # stable across type checking.
-    assert gv == func.body.op
+
+
+def test_equal():
+    i = relay.var('i', shape=[], dtype='int32')
+    eq = op.equal(i, relay.const(0, dtype='int32'))
+    # This should fail ....
+    func = relay.Function([i], eq, ret_type=relay.TensorType([], 'int32'))
+
 
 if __name__ == "__main__":
     test_free_expr()
     test_dual_op()
+    test_single_op()
     test_recursion()
     test_monomorphic_let()
     test_decl()
     test_recursion()
     test_tuple()
+    test_incomplete_call()
     test_free_expr()
     test_type_args()
     test_self_reference()
-    test_global_var_cow_issue()
\ No newline at end of file
+    test_global_var_cow_issue()
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 818376717176..d11dcd5da71a 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export PYTHONPATH=python:apps/extension/python
+export PYTHONPATH=python:topi/python:apps/extension/python
 export LD_LIBRARY_PATH=build:${LD_LIBRARY_PATH}
 
 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc

From 8400038c68e2a446bff3befc433296ac3e580255 Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Tue, 30 Oct 2018 19:16:01 -0700
Subject: [PATCH 314/529] [TEAM] Add Zhi Chen as a reviewer. (#2040)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 057e0a18abd5..f1f37a0f3c39 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -21,6 +21,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
+- [Zhi Chen](https://github.com/zhiics)
 - [Xiaoqiang Dan](https://github.com/xqdan)
 - [Liangfu Chen](https://github.com/liangfu)
 - [Masahiro Masuda](https://github.com/masahi)

From a5a365112bc15c786a1224f547999df983912e37 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 30 Oct 2018 19:20:53 -0700
Subject: [PATCH 315/529] [AUTOTVM] Misc fix to document and style (#2035)

---
 apps/benchmark/util.py                        |  2 +-
 docs/install/from_source.rst                  | 13 ++++---
 python/tvm/autotvm/measure/measure_methods.py |  4 +-
 python/tvm/autotvm/record.py                  |  6 +--
 python/tvm/autotvm/task/dispatcher.py         | 39 +++++++------------
 python/tvm/relay/testing/init.py              |  5 ++-
 python/tvm/target.py                          |  6 +--
 7 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
index bdf47dd660f7..ac732d7945b9 100644
--- a/apps/benchmark/util.py
+++ b/apps/benchmark/util.py
@@ -34,7 +34,7 @@ def get_network(name, batch_size, dtype='float32'):
     elif name == 'mobilenet_v2':
         net, params = nnvm.testing.mobilenet_v2.get_workload(batch_size=batch_size, dtype=dtype)
     elif name == 'inception_v3':
-        input_shape = (1, 3, 299, 299)
+        input_shape = (batch_size, 3, 299, 299)
         net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
     elif "resnet" in name:
         n_layer = int(name.split('-')[1])
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 2228e92b2f22..84bfa3c63bf0 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -124,13 +124,15 @@ TVM package
 ~~~~~~~~~~~
 
 The python package is located at `tvm/python`
-There are several ways to install the package:
+There are two ways to install the package:
 
-1. Set the environment variable `PYTHONPATH` to tell python where to find
+Method 1
+   This method is **recommended for developers** who may change the codes.
+
+   Set the environment variable `PYTHONPATH` to tell python where to find
    the library. For example, assume we cloned `tvm` on the home directory
    `~`. then we can added the following line in `~/.bashrc`.
-   It is **recommended for developers** who may change the codes.
-   The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call ``setup`` again)
+   The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
 
    .. code:: bash
 
@@ -138,7 +140,8 @@ There are several ways to install the package:
        export PYTHONPATH=$TVM_HOME/python:$TVM_HOME/topi/python:$TVM_HOME/nnvm/python:${PYTHONPATH}
 
 
-2. Install tvm python bindings by `setup.py`:
+Method 2
+   Install tvm python bindings by `setup.py`:
 
    .. code:: bash
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 975faf71b5a0..802abe019013 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -551,7 +551,9 @@ def check_remote(target, device_key, host=None, port=None, priority=100, timeout
     """
     def _check():
         remote = request_remote(device_key, host, port, priority)
-        remote.context(str(target))
+        ctx = remote.context(str(target))
+        while not ctx.exist:  # wait until we get an available device
+            pass
     t = threading.Thread(target=_check,)
     t.start()
     t.join(timeout)
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 910f7595ad01..3135e5c58f3d 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -252,13 +252,13 @@ def pick_best(in_file, out_file):
 This record executable module has three modes.
 
 * Print log file in readable format
-e.g. python -m autotvm.record --mode read --i collect_conv.log --begin 0 --end 5 --ir --code
+e.g. python -m tvm.autotvm.record --mode read --i collect_conv.log --begin 0 --end 5 --ir --code
 
 * Extract history best from a large log file
-e.g. python -m autotvm.record --mode pick --i collect.log
+e.g. python -m tvm.autotvm.record --mode pick --i collect.log
 
 * Split a log file into separate files, each of which contains only a single wkl
-e.g. python -m autotvm.record --mode split --i collect.log
+e.g. python -m tvm.autotvm.record --mode split --i collect.log
 """
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index fd91d60e7982..c5464f94f285 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -292,17 +292,13 @@ def load(self, records):
                         best_by_targetkey[key] = (inp, res)
 
             # use model as key to build best map
-            for opt in inp.target.options:
-                if opt.startswith("-model"):
-                    model = opt[7:]
-                    key = (model, inp.task.workload)
-                    if key not in best_by_model:
-                        best_by_model[key] = (inp, res)
-                    else:
-                        _, other_res = best_by_model[key]
-                        if np.mean(other_res.costs) > np.mean(res.costs):
-                            best_by_model[key] = (inp, res)
-                    break
+            key = (inp.target.model, inp.task.workload)
+            if key not in best_by_model:
+                best_by_model[key] = (inp, res)
+            else:
+                _, other_res = best_by_model[key]
+                if np.mean(other_res.costs) > np.mean(res.costs):
+                    best_by_model[key] = (inp, res)
 
         logger.debug("Finish loading %d records", counter)
 
@@ -313,14 +309,11 @@ def _query_inside(self, target, workload):
                                " above the dispatcher call. So does other target. ")
 
         # first try matching by model
-        for opt in target.options:
-            if opt.startswith("-model"):
-                model = opt[7:]
-                key = (model, workload)
-                if key in self._best_user_defined:
-                    return self._best_user_defined[key]
-                if key in self.best_by_model:
-                    return self.best_by_model[key][0].config
+        key = (target.model, workload)
+        if key in self._best_user_defined:
+            return self._best_user_defined[key]
+        if key in self.best_by_model:
+            return self.best_by_model[key][0].config
 
         # then try matching by target key
         for k in target.keys:
@@ -333,11 +326,9 @@ def _query_inside(self, target, workload):
         return None
 
     def update(self, target, workload, cfg):
-        for opt in target.options:
-            if opt.startswith("-model"):
-                model = opt[7:]
-                key = (model, workload)
-                self._best_user_defined[key] = cfg
+        model = target.model
+        key = (model, workload)
+        self._best_user_defined[key] = cfg
 
         for k in target.keys:
             key = (k, workload)
diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py
index fdbde9d289d6..7f92b539a1a3 100644
--- a/python/tvm/relay/testing/init.py
+++ b/python/tvm/relay/testing/init.py
@@ -1,7 +1,8 @@
 """Initializer of parameters."""
+import numpy as np
+
 import tvm
 from tvm import relay
-import numpy as np
 
 class Initializer(object):
     """The base class of an initializer."""
@@ -103,7 +104,7 @@ def _init_weight(self, name, arr):
             raise ValueError("Incorrect factor type")
         # Hack for mobilenet, because there is less connectivity
         if "depthwise" in name:
-            factor = 3 * 3
+            factor = hw_scale
         scale = np.sqrt(self.magnitude / factor)
         if self.rnd_type == "uniform":
             arr[:] = np.random.uniform(-scale, scale, size=arr.shape)
diff --git a/python/tvm/target.py b/python/tvm/target.py
index b3a9086e74b1..75f82743f9fa 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -419,7 +419,7 @@ def intel_graphics(model='unknown', options=None):
     return _api_internal._TargetCreate("opencl", *opts)
 
 
-def opengl(options=None):
+def opengl(model='unknown', options=None):
     """Returns a OpenGL target.
 
     Parameters
@@ -427,8 +427,8 @@ def opengl(options=None):
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("opengl", *options)
+    opts = _merge_opts(["-model=%s" % model], options)
+    return _api_internal._TargetCreate("opengl", *opts)
 
 
 def arm_cpu(model='unknown', options=None):

From e74a8ca36d0e47021a051de04d0a48a4effa612c Mon Sep 17 00:00:00 2001
From: Chris Nuernberger <cnuernber@gmail.com>
Date: Tue, 30 Oct 2018 20:33:19 -0600
Subject: [PATCH 316/529] Better gemm support for cublas and cpu (#1967)

---
 cmake/util/FindCUDA.cmake          |   2 +-
 src/contrib/cblas/cblas.cc         |  76 ++++++++++++--------
 src/contrib/cblas/gemm_common.h    | 101 ++++++++++++++++++++++++++
 src/contrib/cublas/cublas.cc       | 112 ++++++++++++++---------------
 src/contrib/cublas/cublas_utils.cc |  39 ++++++++++
 src/contrib/cublas/cublas_utils.h  |  53 ++++++++++++++
 6 files changed, 298 insertions(+), 85 deletions(-)
 create mode 100644 src/contrib/cblas/gemm_common.h
 create mode 100644 src/contrib/cublas/cublas_utils.cc
 create mode 100644 src/contrib/cublas/cublas_utils.h

diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
index e715ad2efd2f..3a99551358f6 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/util/FindCUDA.cmake
@@ -63,7 +63,7 @@ macro(find_cuda use_cuda)
       endif()
       find_library(CUDA_NVRTC_LIBRARY nvrtc
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs
+        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
         NO_DEFAULT_PATH)
       find_library(CUDA_CUDNN_LIBRARY cudnn
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
diff --git a/src/contrib/cblas/cblas.cc b/src/contrib/cblas/cblas.cc
index 24ed9deb97cd..7473d45562fd 100644
--- a/src/contrib/cblas/cblas.cc
+++ b/src/contrib/cblas/cblas.cc
@@ -5,6 +5,8 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
+#include "gemm_common.h"
+
 
 extern "C" {
 #if USE_MKL_BLAS == 1
@@ -19,38 +21,56 @@ namespace contrib {
 
 using namespace runtime;
 
+inline CBLAS_TRANSPOSE BooleanToTranspose(bool trans) {
+  return trans ? CblasTrans : CblasNoTrans;
+}
+
+struct CblasSgemmOp {
+  typedef float TDatatype;
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  float alpha, float* A, int lda,
+                  float* B, int ldb,
+                  float beta, float* C, int ldc) {
+    cblas_sgemm(CblasColMajor,
+                BooleanToTranspose(ta),
+                BooleanToTranspose(tb),
+                M, N, K,
+                alpha, A, lda,
+                B, ldb,
+                beta, C, ldc);
+  }
+};
+
+struct CblasDgemmOp {
+  typedef double TDatatype;
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  double alpha, double* A, int lda,
+                  double* B, int ldb,
+                  double beta, double* C, int ldc) {
+    cblas_dgemm(CblasColMajor,
+                BooleanToTranspose(ta),
+                BooleanToTranspose(tb),
+                M, N, K,
+                alpha, A, lda,
+                B, ldb,
+                beta, C, ldc);
+  }
+};
+
+
 // matrix multiplication for row major
 TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    bool transa = args[3];
-    bool transb = args[4];
-    // call gemm for simple compact code.
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
-    cblas_sgemm(CblasColMajor,
-                transb ? CblasTrans : CblasNoTrans,
-                transa ? CblasTrans : CblasNoTrans,
-                transb ? B->shape[0] : B->shape[1],
-                transa ? A->shape[1] : A->shape[0],
-                transb ? B->shape[1] : B->shape[0],
-                1.0f,
-                reinterpret_cast<float*>(static_cast<char*>(B->data) + B->byte_offset),
-                B->shape[1],
-                reinterpret_cast<float*>(static_cast<char*>(A->data) + A->byte_offset),
-                A->shape[1],
-                0.0f,
-                reinterpret_cast<float*>(static_cast<char*>(C->data) + C->byte_offset),
-                C->shape[1]);
+    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
+
+    if (TypeMatch(A->dtype, kDLFloat, 32))
+      CallGemm(args, ret, CblasSgemmOp());
+    else
+      CallGemm(args, ret, CblasDgemmOp());
   });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/cblas/gemm_common.h b/src/contrib/cblas/gemm_common.h
new file mode 100644
index 000000000000..c69da5ea3e17
--- /dev/null
+++ b/src/contrib/cblas/gemm_common.h
@@ -0,0 +1,101 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/contrib/gemm.h
+ * \brief Shared implementation of gemm
+ */
+#ifndef TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
+#define TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
+#include <algorithm>
+
+namespace tvm {
+namespace contrib {
+
+using namespace runtime;
+
+inline int ColumnStride(DLTensor* tensor) {
+  // If the tensor itself is transposed then it will have strides
+  // backward from what we expect.  Regardless, the max of the strides
+  // (the other stride is 1) is the column stride.
+  if (tensor->strides) {
+    return std::max(tensor->strides[0], tensor->strides[1]);
+  } else {
+    return tensor->shape[1];
+  }
+}
+
+
+inline int ElementStride(DLTensor* tensor) {
+  if (tensor->strides) {
+    return std::min(tensor->strides[0], tensor->strides[1]);
+  } else {
+    return 1;
+  }
+}
+
+
+// Reversed strides indicates an in-place transpose operation.
+inline bool IsInPlaceTransposed(DLTensor* tensor) {
+  return tensor->strides && (tensor->strides[1] > tensor->strides[0]);
+}
+
+
+inline int RowCount(DLTensor* tensor, bool trans) {
+  return tensor->shape[trans ? 1 : 0];
+}
+
+
+inline int ColumnCount(DLTensor* tensor, bool trans) {
+  return tensor->shape[trans ? 0 : 1];
+}
+
+// Call a column major blas.  Note that data is stored in tvm as row
+// major, so this we switch the arguments.
+template<typename TGemmOp>
+inline void CallGemm(TVMArgs args, TVMRetValue *ret, TGemmOp op) {
+  DLTensor* A = args[0];
+  DLTensor* B = args[1];
+  DLTensor* C = args[2];
+  bool transa = args[3];
+  bool transb = args[4];
+  int bit_depth = sizeof(typename TGemmOp::TDatatype) * 8;
+  CHECK_EQ(A->ndim, 2);
+  CHECK_EQ(B->ndim, 2);
+  CHECK_EQ(C->ndim, 2);
+
+  CHECK_EQ(ElementStride(A), 1);
+  CHECK_EQ(ElementStride(B), 1);
+  CHECK_EQ(ElementStride(C), 1);
+
+  // C can never be transposed.
+  CHECK(!IsInPlaceTransposed(C));
+
+  // Reversed strides indicates an in-place transpose operation.
+  transa = IsInPlaceTransposed(A) ? !transa : transa;
+  transb = IsInPlaceTransposed(B) ? !transb : transb;
+
+  CHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
+  CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  double alpha = args.size() > 5 ? args[5] : 1.0;
+  double beta = args.size() > 6 ? args[6] : 0.0;
+  op(transb,
+     transa,
+     ColumnCount(B, transb),
+     RowCount(A, transa),
+     ColumnCount(A, transa),
+     static_cast<float>(alpha),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(B->data)
+                                                    + B->byte_offset),
+     ColumnStride(B),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(A->data)
+                                                    + A->byte_offset),
+     ColumnStride(A),
+     static_cast<float>(beta),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(C->data)
+                                                    + C->byte_offset),
+     ColumnStride(C));
+}
+
+}  // namespace contrib
+}  // namespace tvm
+
+#endif  // TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
diff --git a/src/contrib/cublas/cublas.cc b/src/contrib/cublas/cublas.cc
index 4171aadf6381..364129b7cba7 100644
--- a/src/contrib/cublas/cublas.cc
+++ b/src/contrib/cublas/cublas.cc
@@ -1,81 +1,81 @@
 /*!
- *  Copyright (c) 2017 by Contributors
+ *  Copyright (c) 2018 by Contributors
  * \file Use external cblas library call.
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
+#include "../cblas/gemm_common.h"
+#include "cublas_utils.h"
 
-extern "C" {
-#include <cublas_v2.h>
-}
 
 namespace tvm {
 namespace contrib {
 
 using namespace runtime;
 
-#ifndef CHECK_CUBLAS_ERROR
-#define CHECK_CUBLAS_ERROR(error) \
-if (error != CUBLAS_STATUS_SUCCESS) { \
-  fprintf(stderr, "cuBLAS error: "); \
-  if (error == CUBLAS_STATUS_NOT_INITIALIZED) fprintf(stderr, "CUBLAS_STATUS_NOT_INITIALIZED"); \
-  if (error == CUBLAS_STATUS_ALLOC_FAILED) fprintf(stderr, "CUBLAS_STATUS_ALLOC_FAILED"); \
-  if (error == CUBLAS_STATUS_INVALID_VALUE) fprintf(stderr, "CUBLAS_STATUS_INVALID_VALUE"); \
-  if (error == CUBLAS_STATUS_ARCH_MISMATCH) fprintf(stderr, "CUBLAS_STATUS_ARCH_MISMATCH"); \
-  if (error == CUBLAS_STATUS_MAPPING_ERROR) fprintf(stderr, "CUBLAS_STATUS_MAPPING_ERROR"); \
-  if (error == CUBLAS_STATUS_EXECUTION_FAILED) fprintf(stderr, "CUBLAS_STATUS_EXECUTION_FAILED"); \
-  if (error == CUBLAS_STATUS_INTERNAL_ERROR) fprintf(stderr, "CUBLAS_STATUS_INTERNAL_ERROR"); \
-  if (error == CUBLAS_STATUS_NOT_SUPPORTED) fprintf(stderr, "CUBLAS_STATUS_NOT_SUPPORTED"); \
-  if (error == CUBLAS_STATUS_LICENSE_ERROR) fprintf(stderr, "CUBLAS_STATUS_LICENSE_ERROR"); \
-  fprintf(stderr, "\n"); \
-  exit(EXIT_FAILURE); \
+inline cublasOperation_t BooleanToTranspose(bool item) {
+  return item ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
-#endif
+
+struct CublasSgemmOp {
+  typedef float TDatatype;
+  cublasHandle_t handle;
+  explicit CublasSgemmOp(cublasHandle_t hdl)
+    : handle(hdl)
+    {}
+
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  float alpha, float* A, int lda,
+                  float* B, int ldb,
+                  float beta, float* C, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasSgemm(handle,
+                                   BooleanToTranspose(ta),
+                                   BooleanToTranspose(tb),
+                                   M, N, K,
+                                   &alpha, A, lda,
+                                   B, ldb,
+                                   &beta, C, ldc));
+  }
+};
+
+
+struct CublasDgemmOp {
+  typedef double TDatatype;
+  cublasHandle_t handle;
+  explicit CublasDgemmOp(cublasHandle_t hdl)
+    : handle(hdl)
+    {}
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  double alpha, double* A, int lda,
+                  double* B, int ldb,
+                  double beta, double* C, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasDgemm(handle,
+                                   BooleanToTranspose(ta),
+                                   BooleanToTranspose(tb),
+                                   M, N, K,
+                                   &alpha, A, lda,
+                                   B, ldb,
+                                   &beta, C, ldc));
+  }
+};
 
 // matrix multiplication for row major
 TVM_REGISTER_GLOBAL("tvm.contrib.cublas.matmul")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    bool transa = args[3];
-    bool transb = args[4];
-    // call gemm for simple compact code.
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
 
-    cublasHandle_t handle;
-    CHECK_CUBLAS_ERROR(cublasCreate(&handle));
-    float alpha = 1.0;
-    float beta = 0.0;
-    float *A_ptr = reinterpret_cast<float*>(static_cast<char*>(B->data) + B->byte_offset);
-    float *B_ptr = reinterpret_cast<float*>(static_cast<char*>(A->data) + A->byte_offset);
-    float *C_ptr = reinterpret_cast<float*>(static_cast<char*>(C->data) + C->byte_offset);
+    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
 
-    CHECK_CUBLAS_ERROR(cublasSgemm(handle,
-                                   transb ? CUBLAS_OP_T : CUBLAS_OP_N,
-                                   transa ? CUBLAS_OP_T : CUBLAS_OP_N,
-                                   transb ? B->shape[0] : B->shape[1],
-                                   transa ? A->shape[1] : A->shape[0],
-                                   transb ? B->shape[1] : B->shape[0],
-                                   &alpha,
-                                   A_ptr,
-                                   B->shape[1],
-                                   B_ptr,
-                                   A->shape[1],
-                                   &beta,
-                                   C_ptr,
-                                   C->shape[1]));
+    CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal();
 
-    CHECK_CUBLAS_ERROR(cublasDestroy(handle));
+    if (TypeMatch(A->dtype, kDLFloat, 32))
+      CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
+    else
+      CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
 });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/cublas/cublas_utils.cc b/src/contrib/cublas/cublas_utils.cc
new file mode 100644
index 000000000000..0011fe853d8d
--- /dev/null
+++ b/src/contrib/cublas/cublas_utils.cc
@@ -0,0 +1,39 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file Use external cudnn utils function
+ */
+#include "cublas_utils.h"
+#include <dmlc/thread_local.h>
+#include <tvm/runtime/registry.h>
+#include "../../runtime/cuda/cuda_common.h"
+
+namespace tvm {
+namespace contrib {
+
+
+CuBlasThreadEntry::CuBlasThreadEntry() {
+  CHECK_CUBLAS_ERROR(cublasCreate(&handle));
+}
+
+
+CuBlasThreadEntry::~CuBlasThreadEntry() {
+  if (handle) {
+    cublasDestroy(handle);
+    handle = 0;
+  }
+}
+
+
+typedef dmlc::ThreadLocalStore<CuBlasThreadEntry> CuBlasThreadStore;
+
+
+CuBlasThreadEntry* CuBlasThreadEntry::ThreadLocal() {
+  auto stream = runtime::CUDAThreadEntry::ThreadLocal()->stream;
+  CuBlasThreadEntry* retval = CuBlasThreadStore::Get();
+  CHECK_CUBLAS_ERROR(cublasSetStream(retval->handle, static_cast<cudaStream_t>(stream)));
+  return retval;
+}
+
+
+}  // namespace contrib
+}  // namespace tvm
diff --git a/src/contrib/cublas/cublas_utils.h b/src/contrib/cublas/cublas_utils.h
new file mode 100644
index 000000000000..2b0874757d98
--- /dev/null
+++ b/src/contrib/cublas/cublas_utils.h
@@ -0,0 +1,53 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file Use external cudnn utils function
+ */
+
+#ifndef TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_
+#define TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_
+
+#include <dmlc/logging.h>
+
+extern "C" {
+#include <cublas_v2.h>
+}
+
+namespace tvm {
+namespace contrib {
+
+inline const char* GetCublasErrorString(int error) {
+  switch (error) {
+  case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+  case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+  return "Unrecognized error";
+}
+
+#ifndef CHECK_CUBLAS_ERROR
+#define CHECK_CUBLAS_ERROR(fn)                  \
+  do {                                          \
+    int error = static_cast<int>(fn);                      \
+    CHECK_EQ(error, CUBLAS_STATUS_SUCCESS) << "CUBLAS: " << GetCublasErrorString(error); \
+  } while (0)  // ; intentionally left off.
+#endif  // CHECK_CUBLAS_ERROR
+
+
+struct CuBlasThreadEntry {
+  CuBlasThreadEntry();
+  ~CuBlasThreadEntry();
+  cublasHandle_t handle{nullptr};
+  static CuBlasThreadEntry* ThreadLocal();
+};  // CuBlasThreadEntry
+
+
+}  // namespace contrib
+}  // namespace tvm
+
+#endif  // TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_

From 3bed8749b8c0e53168f15e6b519f500fe599e63f Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Wed, 31 Oct 2018 09:51:54 -0700
Subject: [PATCH 317/529] [RELAY/PASS] Simplify inference. (#2033)

---
 .../compiler/test_simplify_inference.py       |  1 -
 python/tvm/relay/expr.py                      | 61 ++++++++++++++-
 python/tvm/relay/ir_pass.py                   | 15 ++++
 python/tvm/relay/op/__init__.py               |  8 ++
 src/relay/pass/pattern_util.h                 | 34 ++++++++
 src/relay/pass/simplify_inference.cc          | 77 +++++++++++++++++++
 .../relay/test_pass_simplify_inference.py     | 47 +++++++++++
 7 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 src/relay/pass/simplify_inference.cc
 create mode 100644 tests/python/relay/test_pass_simplify_inference.py

diff --git a/nnvm/tests/python/compiler/test_simplify_inference.py b/nnvm/tests/python/compiler/test_simplify_inference.py
index e2826765995e..fd0e1e3c182e 100644
--- a/nnvm/tests/python/compiler/test_simplify_inference.py
+++ b/nnvm/tests/python/compiler/test_simplify_inference.py
@@ -10,7 +10,6 @@ def simple_bn(x, gamma, beta, moving_mean, moving_var,
         scale = sym.elemwise_mul(1 / sym.sqrt(moving_var + epsilon), gamma)
         shift = sym.elemwise_add(
             sym.elemwise_mul(sym.negative(moving_mean), scale), beta)
-        shape = [-1 if i == axis else 1 for i in range(len(shape))]
         # for 2D
         num_newaxis=len(shape) - axis - 1
         if num_newaxis:
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 0650a493d9a6..43ec46d35a82 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -1,6 +1,7 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
 """The expression nodes of Relay."""
 from __future__ import absolute_import
+from numbers import Number as _Number
 
 import numpy as _np
 from .base import RelayNode, register_relay_node
@@ -11,6 +12,8 @@
 from .. import nd as _nd
 from .. import convert
 
+# will be registered afterwards
+_op_make = None
 
 class Expr(RelayNode):
     """The base type for all Relay expressions."""
@@ -48,6 +51,62 @@ def astype(self, dtype):
         """
         return _make.dtype_cast(self, dtype)
 
+    def __add__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.add(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.subtract(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rsub__(self, other):
+        if isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __mul__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.multiply(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __div__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.divide(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rdiv__(self, other):
+        if isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __truediv__(self, other):
+        return self.__div__(other)
+
+    def __rtruediv__(self, other):
+        return self.__rdiv__(other)
+
 
 @register_relay_node
 class Constant(Expr):
@@ -305,7 +364,7 @@ def __len__(self):
 
     def __repr__(self):
         return ("TupleWrapper(" + self.tuple_value.__repr__() +
-                ", " + self.size + ")")
+                ", " + str(self.size) + ")")
 
     def astype(self, _):
         raise TypeError("astype cannot be used on tuple")
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 68a07f190d42..f3950fffc45f 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -160,6 +160,21 @@ def free_type_vars(expr):
     """
     return _ir_pass.free_type_vars(expr)
 
+def simplify_inference(expr):
+    """ Simplify the data-flow graph for inference phase.
+
+    Parameters
+    ----------
+    e: tvm.relay.Expr
+        The input Expression
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        An expression which is semantically equal to the input expression,
+        but with some simplification
+    """
+    return _ir_pass.simplify_inference(expr)
 
 def dead_code_elimination(expr):
     """ Remove expressions which does not effect the program result (dead code).
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index c0af986be4f7..7b61fd10f5b0 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -15,3 +15,11 @@
 from . import _tensor
 from ..expr import Expr
 from ..base import register_relay_node
+
+
+def _register_op_make():
+    from . import _make
+    from .. import expr
+    expr._op_make = _make
+
+_register_op_make()
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index a41e6c35b93a..f8e67bac33c5 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -120,6 +120,40 @@ inline bool IsDepthwiseConv2D(const Call& call,
 }
 
 
+/*!
+ * \brief Create a Constant with a scalar
+ *
+ * \param dtype The data type.
+ * \param value The value of the scalar.
+ * \return A Constant.
+ */
+template<typename T>
+inline Constant MakeConstantScalar(DataType dtype, T value) {
+  CHECK_EQ(sizeof(T) * 8, dtype.bits()) << "data type mismatch";
+  runtime::NDArray arr = runtime::NDArray::Empty({}, Type2TVMType(dtype), {kDLCPU, 0});
+  *static_cast<T*>(arr->data) = value;
+  return ConstantNode::make(arr);
+}
+
+
+inline Expr Negative(Expr x) {
+  static const Op& op = Op::Get("negative");
+  return CallNode::make(op, {x}, Attrs(), {});
+}
+
+
+inline Expr Sqrt(Expr x) {
+  static const Op& op = Op::Get("sqrt");
+  return CallNode::make(op, {x}, Attrs(), {});
+}
+
+
+inline Expr Add(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("add");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
 inline Expr Multiply(Expr lhs, Expr rhs) {
   static const Op& op = Op::Get("multiply");
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
diff --git a/src/relay/pass/simplify_inference.cc b/src/relay/pass/simplify_inference.cc
new file mode 100644
index 000000000000..785b486ddc06
--- /dev/null
+++ b/src/relay/pass/simplify_inference.cc
@@ -0,0 +1,77 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file simplify_inference.cc
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include "./pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+Expr BatchNormToInferUnpack(const Attrs attrs,
+                            Expr data,
+                            Expr gamma,
+                            Expr beta,
+                            Expr moving_mean,
+                            Expr moving_var) {
+  const auto param = attrs.as<BatchNormAttrs>();
+  Expr epsilon = MakeConstantScalar(Float(32), static_cast<float>(param->epsilon));
+  Expr var_add_eps = Add(moving_var, epsilon);
+  Expr sqrt_var = Sqrt(var_add_eps);
+  Expr scale = Divide(MakeConstantScalar(Float(32), 1.0f), sqrt_var);
+
+  if (param->scale) {
+    scale = Multiply(scale, gamma);
+  }
+  Expr neg_mean = Negative(moving_mean);
+  Expr shift = Multiply(neg_mean, scale);
+  if (param->center) {
+    shift = Add(shift, beta);
+  }
+
+  int axis = param->axis;
+  const auto* tdata = data->type_as<TensorTypeNode>();
+  scale = ExpandBiasToMatchAxis(scale, tdata->shape.size(), {axis});
+  shift = ExpandBiasToMatchAxis(shift, tdata->shape.size(), {axis});
+
+  Expr out = Multiply(data, scale);
+  out = Add(out, shift);
+  return out;
+}
+
+class InferenceSimplifier : public ExprMutator {
+ public:
+  Expr VisitExpr_(const TupleGetItemNode* n) final {
+    static const Op& batch_norm = Op::Get("nn.batch_norm");
+    static const Op& dropout = Op::Get("nn.dropout");
+
+    Expr new_e = ExprMutator::VisitExpr_(n);
+    const auto* new_n = new_e.as<TupleGetItemNode>();
+    if (new_n->index != 0) {
+      return new_e;
+    }
+    if (const auto* call = new_n->tuple.as<CallNode>()) {
+      if (call->op.same_as(batch_norm)) {
+        return BatchNormToInferUnpack(call->attrs,
+          call->args[0], call->args[1], call->args[2], call->args[3], call->args[4]);
+      } else if (call->op.same_as(dropout)) {
+        return call->args[0];
+      }
+    }
+    return new_e;
+  }
+};
+
+Expr SimplifyInference(const Expr& e) {
+  return InferenceSimplifier().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.simplify_inference")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = SimplifyInference(args[0]);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_pass_simplify_inference.py b/tests/python/relay/test_pass_simplify_inference.py
new file mode 100644
index 000000000000..9830b83dc6e5
--- /dev/null
+++ b/tests/python/relay/test_pass_simplify_inference.py
@@ -0,0 +1,47 @@
+from tvm import relay as rly
+from tvm.relay.ir_pass import simplify_inference, alpha_equal
+
+def test_simplify_batchnorm():
+    def simple_bn(x, gamma, beta, moving_mean, moving_var,
+                  axis=1, epsilon=1e-5, shape=None):
+        # expect = (x - moving_mean) / sqrt(moving_var + eps) * gamma + beta
+        scale = rly.multiply(rly.const(1, 'float32') /
+                rly.sqrt(moving_var + rly.const(epsilon, 'float32')), gamma)
+        shift = rly.add(
+            rly.multiply(rly.negative(moving_mean), scale), beta)
+        num_newaxis = len(shape) - (axis + 1)
+        if num_newaxis:
+            scale = rly.expand_dims(scale, axis=1, num_newaxis=num_newaxis)
+            shift = rly.expand_dims(shift, axis=1, num_newaxis=num_newaxis)
+        return x * scale + shift
+
+    def check(dim, axis, nstep):
+        eps = 0.01
+        ttype1 = rly.TensorType(tuple(10 for i in range(dim)), 'float32')
+        ttype2 = rly.TensorType((10,), 'float32')
+        x = rly.var("x", ttype1)
+        beta = rly.var("beta", ttype2)
+        gamma = rly.var("gamma", ttype2)
+        moving_var = rly.var("moving_var", ttype2)
+        moving_mean = rly.var("moving_mean", ttype2)
+        y1, y2 = x, x
+
+        for _ in range(nstep):
+            y1, _, _ = rly.nn.batch_norm(y1 + rly.const(1, 'float32'),
+                gamma, beta, moving_mean, moving_var, epsilon=eps, axis=axis)
+            y1 = rly.nn.dropout(y1)
+            y1 = rly.ir_pass.infer_type(y1)
+            y1 = simplify_inference(y1)
+
+            y2 = simple_bn(y2 + rly.const(1, 'float32'),
+                           gamma, beta, moving_mean, moving_var,
+                           epsilon=eps, axis=axis, shape=ttype1.shape)
+        assert rly.ir_pass.graph_equal(y1, y2)
+
+    check(2, 1, 1)
+    check(4, 1, 1)
+    check(4, 0, 3)
+
+
+if __name__ == "__main__":
+    test_simplify_batchnorm()

From 6837dcba5fc8f46f5e553fdd20ec78e1c94ed065 Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Wed, 31 Oct 2018 10:51:19 -0700
Subject: [PATCH 318/529] [RELAY] MobileNet (#1997)

---
 python/tvm/relay/testing/__init__.py       |   1 +
 python/tvm/relay/testing/mobilenet.py      | 142 +++++++++++++++++++++
 tests/python/relay/test_ir_text_printer.py |   5 +
 3 files changed, 148 insertions(+)
 create mode 100644 python/tvm/relay/testing/mobilenet.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 547fff425595..776b61317da7 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -5,3 +5,4 @@
 from . import resnet
 from . import dqn
 from . import dcgan
+from . import mobilenet
diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py
new file mode 100644
index 000000000000..78e1d82456c8
--- /dev/null
+++ b/python/tvm/relay/testing/mobilenet.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Port of NNVM version of MobileNet to Relay.
+"""
+# pylint: disable=invalid-name
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1),
+               padding=(1, 1), epsilon=1e-5):
+    """Helper function to construct conv_bn-relu"""
+    # convolution + bn + relu
+    conv = layers.conv2d(
+        data=data,
+        channels=channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_layout='NCHW',
+        name=name+'_conv')
+    bn = layers.batch_norm_infer(data=conv, epsilon=epsilon, name=name + '_bn')
+    act = relay.nn.relu(data=bn)
+    return act
+
+
+def separable_conv_block(data, name, depthwise_channels, pointwise_channels,
+                         kernel_size=(3, 3), downsample=False, padding=(1, 1),
+                         epsilon=1e-5):
+    """Helper function to get a separable conv block"""
+    if downsample:
+        strides = (2, 2)
+    else:
+        strides = (1, 1)
+    # depthwise convolution + bn + relu
+    conv1 = layers.conv2d(
+        data=data,
+        channels=depthwise_channels,
+        groups=depthwise_channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        name=name+'_depthwise_conv1')
+    bn1 = layers.batch_norm_infer(data=conv1, epsilon=epsilon, name=name+'_bn1')
+    act1 = relay.nn.relu(data=bn1)
+    # pointwise convolution + bn + relu
+    conv2 = layers.conv2d(
+        data=act1,
+        channels=pointwise_channels,
+        kernel_size=(1, 1),
+        strides=(1, 1),
+        padding=(0, 0),
+        data_layout='NCHW',
+        name=name + '_conv2')
+    bn2 = layers.batch_norm_infer(data=conv2, epsilon=epsilon, name=name+'_bn2')
+    act2 = relay.nn.relu(data=bn2)
+    return act2
+
+
+def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224),
+               dtype='float32', alpha=1.0, is_shallow=False):
+    """Function to construct a MobileNet"""
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2))
+    body = separable_conv_block(body, 'separable_conv_block_1',
+                                int(32*alpha), int(64*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_2',
+                                int(64*alpha), int(128*alpha), downsample=True)
+    body = separable_conv_block(body, 'separable_conv_block_3',
+                                int(128*alpha), int(128*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_4',
+                                int(128*alpha), int(256*alpha), downsample=True)
+    body = separable_conv_block(body, 'separable_conv_block_5',
+                                int(256*alpha), int(256*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_6',
+                                int(256*alpha), int(512*alpha), downsample=True)
+    if is_shallow:
+        body = separable_conv_block(body, 'separable_conv_block_7',
+                                    int(512*alpha), int(1024*alpha), downsample=True)
+        body = separable_conv_block(body, 'separable_conv_block_8',
+                                    int(1024*alpha), int(1024*alpha), downsample=True)
+    else:
+        for i in range(7, 12):
+            body = separable_conv_block(body, 'separable_conv_block_%d' % i,
+                                        int(512*alpha), int(512*alpha))
+        body = separable_conv_block(body, 'separable_conv_block_12',
+                                    int(512*alpha), int(1024*alpha), downsample=True)
+        body = separable_conv_block(body, 'separable_conv_block_13',
+                                    int(1024*alpha), int(1024*alpha))
+    pool = relay.nn.global_avg_pool2d(data=body)
+    flatten = relay.nn.batch_flatten(data=pool)
+    weight = relay.var('fc_weight')
+    fc = relay.nn.dense(data=flatten, weight=weight, units=num_classes)
+    softmax = relay.nn.softmax(data=fc)
+    return relay.Function(relay.ir_pass.free_vars(softmax), softmax)
+
+
+def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), dtype='float32'):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int, optional
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    data_shape = tuple([batch_size] + list(image_shape))
+    net = mobile_net(num_classes=num_classes, data_shape=data_shape,
+                     dtype=dtype, alpha=1.0, is_shallow=False)
+    return create_workload(net)
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index fd446f9b7f03..aa944bc217c2 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -104,6 +104,10 @@ def test_resnet():
     net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
     net.astext()
 
+def test_mobilenet():
+    net, params = tvm.relay.testing.mobilenet.get_workload(batch_size=1)
+    net.astext()
+
 def test_dqn():
     net, params = tvm.relay.testing.dqn.get_workload(batch_size=1)
     net.astext()
@@ -115,6 +119,7 @@ def test_dcgan():
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
+    test_mobilenet()
     test_mlp()
     test_dqn()
     test_dcgan()

From 8a6690d361a09e9f7729c05f8988469dd0a4ccec Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 1 Nov 2018 06:12:44 +0800
Subject: [PATCH 319/529] [TOPI] Add dilation argument to conv2d and
 depthwise_conv2d (#1970)

---
 nnvm/python/nnvm/top/nn.py                    |  24 ++--
 python/tvm/autotvm/tophub.py                  |  14 +--
 .../unittest/test_lang_tensor_overload_op.py  |   3 +-
 topi/python/topi/arm_cpu/conv2d.py            |  44 +++++--
 topi/python/topi/cuda/conv2d.py               |  43 +++----
 topi/python/topi/cuda/conv2d_int8.py          | 119 ++++--------------
 topi/python/topi/cuda/conv2d_winograd.py      |  89 +++++++------
 topi/python/topi/generic/nn.py                |  18 ---
 topi/python/topi/mali/conv2d.py               |  27 +++-
 topi/python/topi/nn/conv2d.py                 | 110 ++++++++--------
 topi/python/topi/nn/depthwise_conv2d.py       |  38 +++++-
 topi/python/topi/rocm/conv2d.py               |  29 +----
 topi/python/topi/x86/conv2d.py                |  36 ++++--
 topi/tests/python/test_topi_conv2d_hwcn.py    |   3 +-
 topi/tests/python/test_topi_conv2d_int8.py    |   5 +-
 topi/tests/python/test_topi_conv2d_nchw.py    |   7 +-
 topi/tests/python/test_topi_conv2d_nhwc.py    |   7 +-
 .../tests/python/test_topi_conv2d_winograd.py |   5 +-
 .../python/test_topi_depthwise_conv2d.py      |  17 ++-
 tutorials/autotvm/tune_conv2d_cuda.py         |   2 +-
 tutorials/topi/intro_topi.py                  |   2 +-
 .../integration/test_benchmark_topi_conv2d.py |   1 +
 22 files changed, 312 insertions(+), 331 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index a4b36ea853d5..03ffb46a5c5c 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -94,34 +94,26 @@ def compute_conv2d(attrs, inputs, _):
     (dilation_h, dilation_w) = dilation
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
-    elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1):
-        raise ValueError("not support dilate now")
-    elif dilation == (1, 1):
-        kernel = inputs[1]
-    elif layout == "NCHW":
-        kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
-    else: #layout == NHWC
-        kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])
 
     if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
         # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding,
-                                                  layout, out_dtype=out_dtype)
+        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
+                             dilation, layout, out_dtype=out_dtype)
         # pylint: enable=assignment-from-no-return
     elif groups == 1:
         out = topi.nn.conv2d(
-            inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype)
     elif layout == "NCHW" and \
          groups == get_const_int(inputs[0].shape[1]) and \
          groups == channels:
         out = topi.nn.depthwise_conv2d_nchw(
-            inputs[0], kernel, strides, padding, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
     elif layout == "NHWC" and \
          kernel_layout == "HWOI" and \
          groups == get_const_int(inputs[0].shape[3]) and \
          groups == channels:
         out = topi.nn.depthwise_conv2d_nhwc(
-            inputs[0], kernel, strides, padding, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
     else:
         raise ValueError("not support arbitrary group number for now")
 
@@ -144,7 +136,7 @@ def schedule_conv2d(attrs, outs, target):
         if groups == 1 and layout == "NCHW":
             return topi.generic.schedule_conv2d_nchw(outs)
         elif groups == 1 and layout == "NCHW4c":
-            return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs)
+            return topi.generic.schedule_conv2d_nchw(outs)
         elif groups == 1 and layout == "NHWC":
             return topi.generic.schedule_conv2d_nhwc(outs)
         elif groups == channels and layout == "NCHW":
@@ -175,7 +167,7 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
                                    layout, out_layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     else:
@@ -227,7 +219,7 @@ def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, _):
 
     # pylint: disable=assignment-from-no-return
     out = topi.nn.conv2d_winograd_without_weight_transform(
-        inputs[0], inputs[1], strides, padding, layout, out_dtype,
+        inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype,
         tile_size)
 
     if attrs.get_bool("use_bias"):
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 7798d5522036..3e52ecb52b73 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -20,15 +20,15 @@
 
 # the version of each package
 PACKAGE_VERSION = {
-    'arm_cpu': "v0.03",
-    'llvm':    "v0.01",
+    'arm_cpu': "v0.04",
+    'llvm':    "v0.02",
 
-    'cuda':    "v0.03",
-    'rocm':    "v0.01",
-    'opencl':  "v0.01",
-    'mali':    "v0.03",
+    'cuda':    "v0.04",
+    'rocm':    "v0.02",
+    'opencl':  "v0.02",
+    'mali':    "v0.04",
 
-    'vta':     "v0.01",
+    'vta':     "v0.04",
 }
 
 logger = logging.getLogger('autotvm')
diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py
index 95cceaac338e..ee6eaf74a79c 100644
--- a/tests/python/unittest/test_lang_tensor_overload_op.py
+++ b/tests/python/unittest/test_lang_tensor_overload_op.py
@@ -175,10 +175,11 @@ def check_device(device):
         print("Running on target: %s" % device)
 
         k = 10.0
+        dilation = (1, 1)
         with tvm.target.create(device):
             A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A')
             W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-            B = topi.nn.conv2d(A, W, stride, padding)
+            B = topi.nn.conv2d(A, W, stride, padding, dilation)
             if typ == "add":
                 C = B + k
             elif typ == "sub":
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index c34bf256788b..c30ad496b24d 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -9,11 +9,11 @@
 
 from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
 from ..util import traverse_inline, get_const_tuple, const_matrix
-from ..nn import pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
+from ..nn import dilate, pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
 from ..nn.util import get_const_int, get_pad_tuple
 
 @autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct'])
-def conv2d_arm_cpu(cfg, data, kernel, strides, padding, layout, out_dtype):
+def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     """TOPI compute callback for conv2d
 
     Parameters
@@ -35,6 +35,9 @@ def conv2d_arm_cpu(cfg, data, kernel, strides, padding, layout, out_dtype):
     padding : list of two ints
         [pad_height, pad_width]
 
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
@@ -46,7 +49,8 @@ def conv2d_arm_cpu(cfg, data, kernel, strides, padding, layout, out_dtype):
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                              num_tile=2)
 
 @autotvm.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
 def schedule_conv2d_nchw_arm_cpu(cfg, outs):
@@ -96,11 +100,22 @@ def _callback(op):
     return s
 
 
-def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, num_tile):
     assert layout == "NCHW", "Only support NCHW"
     # create workload according to raw arguments
     out_dtype = out_dtype or data.dtype
     N, CI, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        dilation_args = (1, 1, dilation_h, dilation_w) if len(kernel.shape) == 4\
+                else (1, 1, dilation_h, dilation_w, 1)
+        kernel = dilate(kernel, dilation_args)
+
     if len(kernel.shape) == 4:
         pre_packed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
@@ -242,17 +257,27 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
 
 
 @autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd'])
-def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     """ TOPI compute callback. Use winograd template """
     tile_size = 4
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
+                          out_dtype, tile_size)
 
-def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
     N, CI, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
     if len(kernel.shape) == 4:
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
         pre_computed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
     else:
+        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
         pre_computed = True
         H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
         CO *= VC
@@ -459,9 +484,10 @@ def _schedule_winograd(cfg, s, output, last):
 
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd'])
-def conv2d_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
     """TOPI compute callback"""
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,\
+                          tile_size)
 
 
 @autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py
index 4dac40746419..400c8f6bade1 100644
--- a/topi/python/topi/cuda/conv2d.py
+++ b/topi/python/topi/cuda/conv2d.py
@@ -5,7 +5,7 @@
 from tvm.contrib import cudnn
 
 from .. import nn, generic
-from ..util import get_const_int, get_const_tuple, traverse_inline
+from ..util import get_const_tuple, traverse_inline
 
 from .conv2d_direct import schedule_direct_cuda
 from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
@@ -13,7 +13,7 @@
 
 
 @autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
-def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
+def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for cuda backend.
 
     Parameters
@@ -36,6 +36,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
@@ -63,32 +66,15 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
         # handle dilation
         stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
         pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+        dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
 
         OH = (H + 2 * pad_h - KH) // stride_h + 1
         OW = (W + 2 * pad_w - KW) // stride_w + 1
-        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
-
-        dilation_h = dilation_w = 1
-        kernel_before_dilation = kernel
-        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-            kernel_before_dilation = kernel.op.input_tensors[0]
-            if layout == 'NCHW':
-                dilation_h = (get_const_int(kernel.shape[2]) +
-                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
-                dilation_w = (get_const_int(kernel.shape[3]) +
-                              get_const_int(kernel_before_dilation.shape[3]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
-            elif layout == 'NHWC':
-                dilation_h = (get_const_int(kernel.shape[1]) +
-                              get_const_int(kernel_before_dilation.shape[1]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[1])
-                dilation_w = (get_const_int(kernel.shape[2]) +
-                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
+        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
+                    ((KW - 1) * dilation_w + 1))
 
         return cudnn.conv2d_forward(data,
-                                    kernel_before_dilation,
+                                    kernel,
                                     stride_h,
                                     stride_w,
                                     pad_h,
@@ -100,16 +86,15 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
                                     algo=-1)  # let CUDNN choose the best algo
 
     if cfg.template_key == 'winograd':
-        return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
+        return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
                              pre_computed=False)
     if cfg.template_key == 'int8':
-        return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype,
-                                 pre_computed=False)
+        return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_dtype)
 
     if layout == 'NCHW':
-        return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
+        return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'HWCN':
-        return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype)
+        return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
@@ -146,7 +131,7 @@ def _callback(op):
         if op.tag == 'conv2d_nchw_winograd':
             schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
         if op.tag == "conv2d_NCHWc_int8":
-            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False)
+            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index 9d3757c35fbb..200ed1a3887a 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -4,37 +4,13 @@
 from tvm import autotvm
 
 from .injective import _schedule_injective
-from ..generic import schedule_conv2d_NCHWc_int8_prepacked
 from .tensor_intrin import dp4a
-from ..nn.conv2d import conv2d_NCHWc_int8_prepacked
 from ..nn.pad import pad
 from ..nn.util import get_pad_tuple
-from ..util import get_const_tuple, traverse_inline
+from ..util import get_const_tuple
 
 
-def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype):
-    """convert argument to workload"""
-    shape = get_const_tuple(data.shape)
-    if len(shape) == 5:
-        N, ic_chunk, H, W, ic_block = shape
-        raw_data = tvm.placeholder(
-            (N, ic_chunk*ic_block, H, W), dtype=data.dtype)
-    else:
-        raw_data = data
-
-    shape = get_const_tuple(kernel.shape)
-    if len(shape) == 6:
-        oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape
-        raw_kernel = tvm.placeholder(
-            (oc_chunk*oc_block, ic_chunk*ic_block, KH, KW), dtype=kernel.dtype)
-    else:
-        raw_kernel = kernel
-
-    return ('conv2d', ) + autotvm.task.task.args_to_workload(
-        [raw_data, raw_kernel, stride, padding, "NCHW", out_dtype])
-
-
-def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre_computed):
+def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype):
     """Convolution operator in NCHW[x]c layout for int8.
 
     Parameters
@@ -57,25 +33,25 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
     padding: int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
     out_dtype : str
         The output type. This is used for mixed precision.
 
-    pre_computed : str
-        Whether packed data and kernel are pre-computed
-
     Returns
     -------
     output : tvm.Tensor
         5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
     """
     assert layout in ["NCHW", "NCHW4c"]
-
     ic_block_factor = 4
     oc_block_factor = 4
 
+    pre_computed = len(kernel.shape) == 6
     if not pre_computed:
         batch, channels, height, width = get_const_tuple(data.shape)
         assert channels % ic_block_factor == 0, \
@@ -109,10 +85,15 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
         packed_kernel.shape)
 
     if isinstance(stride, int):
-        stride_h, stride_w = stride
+        stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (kernel_h, kernel_w))
     # compute graph
@@ -121,8 +102,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
     pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
 
     # compute the output shape
-    out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1
-    out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1
+    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
 
     oshape = (batch, oc_chunk, out_height, out_width, oc_block)
 
@@ -132,7 +113,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
     kw = tvm.reduce_axis((0, kernel_w), name='kw')
 
     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(pad_data[n, icc, oh*stride_h+kh, ow*stride_w+kw, icb]
+                       tvm.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \
+                               ow*stride_w+kw*dilation_w, icb]
                                .astype('int32') *
                                packed_kernel[oc_chunk, icc,
                                              kh, kw, oc_block, icb]
@@ -141,9 +123,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
 
     output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                          conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
-                         tag="conv2d_NCHWc_int8",
-                         attrs={"workload": _conv2d_NCHWc_int8_arg_to_workload(
-                             data, kernel, stride, padding, out_dtype)})
+                         tag="conv2d_NCHWc_int8")
 
     # num flop
     num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
@@ -156,7 +136,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre
 _dp4a = dp4a('shared', 'shared', 'local')
 
 
-def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
+def schedule_conv2d_NCHWc_int8(cfg, s, output):
     """Schedule conv2d int8 NCHWc template"""
     workload = output.op.attrs["workload"]
 
@@ -171,22 +151,17 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
     else:
         pad_data = packed_data
 
-    if not pre_computed:
-        kernel, = packed_kernel.op.input_tensors
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # skip this part during tuning to make recrods accurate
-            # this part will be pre-computed during NNVM's pre-compute optimization pass
-            s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
-            s[packed_kernel].pragma(
-                s[packed_kernel].op.axis[0], "debug_skip_region")
-        else:
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        # skip this part during tuning to make recrods accurate
+        # this part will be pre-computed during NNVM's pre-compute optimization pass
+        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
+        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
+    else:
+        if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
+                       packed_kernel.name == 'packed_kernel':
+            # data and kernel are not pre-computed, schedule layout transform here
             _schedule_injective(packed_data.op, s)
             _schedule_injective(packed_kernel.op, s)
-    else:
-        kernel = packed_kernel
-
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
 
     if pad_data != packed_data:
         s[pad_data].compute_inline()
@@ -310,43 +285,3 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
     s[output].pragma(kernel_scope, 'unroll_explicit', False)
 
     return s
-
-
-@conv2d_NCHWc_int8_prepacked.register(["cuda"])
-@autotvm.task.dispatcher
-def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype):
-    assert layout == 'NCHW4c'
-    return _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype)
-
-
-@conv2d_NCHWc_int8_prepacked_dispatcher.register("int8")
-def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype):
-    return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype,
-                             pre_computed=True)
-
-@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda"], ["int8"])
-def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs):
-    """TOPI schedule callback of conv2d for cuda
-
-    Parameters
-    ----------
-    cfg: ConfigEntity
-        The config for this template
-
-    outs: Array of Tensor
-        The computation graph description of conv2d
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d.
-    """
-    s = tvm.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if 'conv2d_NCHWc_int8' in op.tag:
-            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=True)
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index 6a0a126b9e4f..fb30a4f9ad2e 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -7,23 +7,10 @@
 from tvm import autotvm
 
 from .. import nn
-from ..nn import conv2d_winograd_without_weight_transform
+from ..nn import conv2d, conv2d_winograd_without_weight_transform
 from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline
 from ..generic import schedule_conv2d_winograd_without_weight_transform
 
-def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
-    """convert argument to workload"""
-    K = 3
-
-    shape = get_const_tuple(kernel.shape)
-    if shape[-2:] == (K, K):
-        raw_kernel = kernel
-    else:  # pre-transformed
-        _, _, CI, CO = shape
-        raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
-
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
 
 def _infer_tile_size(data, kernel):
     N, CI, H, W = get_const_tuple(data.shape)
@@ -32,7 +19,7 @@ def _infer_tile_size(data, kernel):
         return 4
     return 2
 
-def winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed):
+def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed):
     """Compute declaration for winograd"""
     assert layout == 'NCHW'
 
@@ -41,12 +28,20 @@ def winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_co
     N, CI, H, W = get_const_tuple(data.shape)
 
     if not pre_computed: # kernel tensor is raw tensor, do strict check
+        if isinstance(dilation, int):
+            dilation_h = dilation_w = dilation
+        else:
+            dilation_h, dilation_w = dilation
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
+
         CO, CI, KH, KW = get_const_tuple(kernel.shape)
         HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
         HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
         assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
     else:                   # kernel tensor is pre-transfomred. this op is created by
                             # alter op layout, do not check
+        # dilation is not supported
         HSTR = WSTR = 1
         HPAD = WPAD = 1
         KH = KW = 3
@@ -150,9 +145,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_co
     # output
     output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
                          inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m],
-                         name='output', tag='conv2d_nchw_winograd',
-                         attrs={"workload": _winograd_conv_arg_to_workload(
-                             data, kernel, strides, padding, layout, out_dtype)})
+                         name='output', tag='conv2d_nchw_winograd')
     cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
 
     return output
@@ -314,16 +307,11 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed):
     return s
 
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
-@conv2d_winograd_without_weight_transform.register(['cuda', 'gpu'])
-@autotvm.task.dispatcher
-def winograd_ww_config_dispatcher_cuda(data, kernel, strides, padding, layout, out_dtype,
-                                       tile_size):
-    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
-
-@winograd_ww_config_dispatcher_cuda.register(['winograd'])
-def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=True)
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform,
+                               ['cuda', 'gpu'], ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
+    return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                         pre_computed=True)
 
 
 @autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
@@ -352,36 +340,54 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
 
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
-    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
-                                                      "when alter_op_layout is enabled"
     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
+    dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int('groups')
     layout = attrs["layout"]
     out_dtype = attrs["out_dtype"]
     out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
 
+    data, kernel = tinfos[0:2]
+    N, CI, H, W = get_const_tuple(data.shape)
+    CO, _, KH, KW = get_const_tuple(kernel.shape)
+
+    dispatch_ctx = autotvm.DispatchContext.current
+
     if groups == 1:
         # query config of this workload
         workload = ('conv2d',) + autotvm.task.args_to_workload(
-            [tinfos[0], tinfos[1], strides, padding, layout, out_dtype])
-
-        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
+            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype])
+        target = tvm.target.current_target()
+        cfg = autotvm.DispatchContext.current.query(target, workload)
 
         if cfg.is_fallback:  # if is fallback, clear query cache and return None
-            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
+            autotvm.task.clear_fallback_cache(target, workload)
             return None
 
         if cfg.template_key == 'direct':
             return None
 
         if cfg.template_key == 'int8':
-            assert 'cuda' in tvm.target.current_target().keys
-            new_attrs['layout'] = 'NCHW4c'
-            new_attrs['out_layout'] = 'NCHW4c'
+            assert 'cuda' in target.keys
+            new_layout = 'NCHW4c'
+            new_attrs['layout'] = new_layout
+            new_attrs['out_layout'] = new_layout
             new_attrs['kernel_layout'] = 'OIHW4o4i'
+            ic_block_factor = oc_block_factor = 4
+            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
+                                       dtype=data.dtype)
+            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,\
+                                         oc_block_factor, ic_block_factor), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
+                conv2d
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
             return sym.conv2d(*copy_inputs, **new_attrs)
 
+        if attrs.get_int_tuple("dilation") != (1, 1):
+            return None
         # pre-compute weight transformation in winograd
         tile_size = _infer_tile_size(tinfos[0], tinfos[1])
 
@@ -390,6 +396,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         weight = sym.transpose(weight, axes=[0, 1, 3, 2])
         copy_inputs[1] = weight
         new_attrs['tile_size'] = tile_size
+
+        new_data = data
+        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
+                                     dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size],
+            conv2d_winograd_without_weight_transform
+        )
+        dispatch_ctx.update(target, new_workload, cfg)
         return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
 
     # do nothing for depthwise convolution
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 765b48d286bc..a48b85638fb1 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -121,24 +121,6 @@ def schedule_conv2d_winograd_without_weight_transform(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.generic_func
-def schedule_conv2d_NCHWc_int8_prepacked(outs):
-    """Schedule for conv2d NCHWc int8 with prepacked data and kernel
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-          The computation graph description of this operator
-          in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    return _default_schedule(outs, False)
-
-
 @tvm.target.generic_func
 def schedule_conv2d_transpose_nchw(outs):
     """Schedule for conv2d_transpose_nchw
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 390b60ba6a97..7c3b4a23cbc5 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -16,7 +16,7 @@
 
 
 @autotvm.register_topi_compute(conv2d, 'mali', ['direct'])
-def conv2d_mali(cfg, data, kernel, strides, padding, layout, out_dtype):
+def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     """TOPI compute callback for conv2d
 
     Parameters
@@ -38,6 +38,9 @@ def conv2d_mali(cfg, data, kernel, strides, padding, layout, out_dtype):
     padding : list of two ints
         [pad_height, pad_width]
 
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
@@ -49,7 +52,8 @@ def conv2d_mali(cfg, data, kernel, strides, padding, layout, out_dtype):
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=3)
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                              num_tile=3)
 
 @autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd'])
 def schedule_conv2d_nchw_mali(cfg, outs):
@@ -175,16 +179,26 @@ def _pick_tile_size(data, kernel):
         return 2
 
 @autotvm.register_topi_compute(conv2d, 'mali', ['winograd'])
-def conv2d_mali_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+def conv2d_mali_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     tile_size = _pick_tile_size(data, kernel)
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                          tile_size)
 
-def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
     N, CI, IH, IW = get_const_tuple(data.shape)
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
     if len(kernel.shape) == 4:
+
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
         pre_computed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
     else:
+        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
         pre_computed = True
         H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
         CO *= VC
@@ -428,7 +442,8 @@ def _schedule_winograd(cfg, s, op):
 @autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd'])
 def conv2d_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
     """TOPI compute callback"""
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                          tile_size)
 
 
 @autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 17b1ceb7ab13..2b88886524bd 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -6,6 +6,7 @@
 import numpy as np
 import tvm
 
+from .dilate import dilate
 from .pad import pad
 from .util import get_pad_tuple
 from ..util import simplify, const_matrix, get_const_tuple
@@ -16,7 +17,7 @@
                        'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
 
 @tvm.target.generic_func
-def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
+def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=None):
     """Conv2D operator.
 
     Parameters
@@ -33,6 +34,9 @@ def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
@@ -44,11 +48,11 @@ def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
     # search platform specific declaration first
     # default declaration
     if layout == 'NCHW':
-        return conv2d_nchw(input, filter, strides, padding, out_dtype)
+        return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
     elif layout == 'HWCN':
-        return conv2d_hwcn(input, filter, strides, padding, out_dtype)
+        return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':
-        return conv2d_nhwc(input, filter, strides, padding, out_dtype)
+        return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
@@ -85,7 +89,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
     return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
 
 
-def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
+def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Convolution operator in NCHW layout.
 
     Parameters
@@ -102,6 +106,9 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     Output : tvm.Tensor
@@ -110,12 +117,22 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     if out_dtype is None:
         out_dtype = Input.dtype
     assert isinstance(stride, int) or len(stride) == 2
-    batch, in_channel, in_height, in_width = Input.shape
-    num_filter, channel, kernel_h, kernel_w = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        Filter = dilate(Filter, (1, 1, dilation_h, dilation_w))
+
+    batch, in_channel, in_height, in_width = Input.shape
+    num_filter, channel, kernel_h, kernel_w = Filter.shape
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (kernel_h, kernel_w))
     # compute the output shape
@@ -138,7 +155,7 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
             axis=[rc, ry, rx]), tag="conv2d_nchw")
 
 
-def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
+def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Convolution operator in HWCN layout.
 
     Parameters
@@ -155,6 +172,9 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     output : tvm.Tensor
@@ -163,13 +183,23 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     if out_dtype is None:
         out_dtype = Input.dtype
     assert isinstance(stride, int) or len(stride) == 2
-    in_height, in_width, in_channel, batch = Input.shape
-    kernel_h, kernel_w, channel, num_filter = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
+
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
+
+    in_height, in_width, in_channel, batch = Input.shape
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (kernel_h, kernel_w))
     # compute the output shape
@@ -191,7 +221,7 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     return Output
 
 
-def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
+def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
     """Convolution operator in NHWC layout.
 
     Parameters
@@ -208,19 +238,32 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     assert isinstance(stride, int) or len(stride) == 2
-    batch, in_height, in_width, in_channel = Input.shape
-    kernel_h, kernel_w, channel, num_filter = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
+
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
+
+    batch, in_height, in_width, in_channel = Input.shape
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (kernel_h, kernel_w))
     # compute the output shape
@@ -243,7 +286,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
 
 
 @tvm.target.generic_func
-def conv2d_NCHWc(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
+def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for nChw[x]c layout.
 
     Parameters
@@ -262,6 +305,9 @@ def conv2d_NCHWc(data, kernel, stride, padding, layout, out_layout, out_dtype='f
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         Input data layout
 
@@ -333,7 +379,7 @@ def conv2d_winograd_weight_transform(kernel, tile_size):
 
 
 @tvm.target.generic_func
-def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
+def conv2d_winograd_without_weight_transform(input, filter, strides, padding, dilation,
                                              layout, out_dtype, tile_size):
     """Compute convolution in winograd algorithm. The filter is supposed to be transformed
     in advance.
@@ -357,37 +403,3 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
-
-
-@tvm.target.generic_func
-def conv2d_NCHWc_int8_prepacked(data, kernel, stride, padding, layout, out_dtype):
-    """Convolution operator in NCHW[x]c layout for int8. Data and kernel should be packed in
-    advance.
-
-    Parameters
-    ----------
-    data : tvm.Tensor
-        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
-
-    kernel : tvm.Tensor
-        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
-        filter_width, num_filter_block, in_channel_block]
-
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
-
-    padding: int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
-
-    layout : str
-        layout of data
-
-    out_dtype: str
-        The output type. This is used for mixed precision.
-
-    Returns
-    -------
-    output : tvm.Tensor
-        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
-    """
-    raise ValueError("missing register for topi.nn.conv2d_NCHWc_int8_prepacked")
diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py
index c7906d3a4373..78107d2bd1ce 100644
--- a/topi/python/topi/nn/depthwise_conv2d.py
+++ b/topi/python/topi/nn/depthwise_conv2d.py
@@ -10,7 +10,7 @@
 
 
 @tvm.target.generic_func
-def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
+def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Depthwise convolution nchw forward operator.
 
     Parameters
@@ -27,6 +27,9 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     out_dtype: str, optional
         Output data type
 
@@ -37,13 +40,23 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """
     out_dtype = Input.dtype if out_dtype is None else out_dtype
 
-    batch, in_channel, in_height, in_width = Input.shape
-    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        Filter = dilate(Filter, (1, 1, dilation_h, dilation_w))
+
+    batch, in_channel, in_height, in_width = Input.shape
+    # shape of dilated kernel
+    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
+
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (filter_height, filter_width))
     out_channel = simplify(in_channel * channel_multiplier)
@@ -68,7 +81,7 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
 
 
 @tvm.target.generic_func
-def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
+def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Depthwise convolution nhwc forward operator.
 
     Parameters
@@ -85,6 +98,9 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     out_dtype: str, optional
         Output data type
 
@@ -95,13 +111,23 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
     """
     out_dtype = Input.dtype if out_dtype is None else out_dtype
 
-    batch, in_height, in_width, in_channel = Input.shape
-    filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
+
+    batch, in_height, in_width, in_channel = Input.shape
+    # shape of dilated kernel
+    filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
+
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (filter_height, filter_width))
     out_channel = simplify(in_channel * channel_multiplier)
diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py
index 2d8058fb276b..b5839c0c866b 100644
--- a/topi/python/topi/rocm/conv2d.py
+++ b/topi/python/topi/rocm/conv2d.py
@@ -5,11 +5,11 @@
 from tvm.contrib import miopen
 
 from .. import nn, generic
-from ..util import get_const_int, get_const_tuple
+from ..util import get_const_tuple
 from ..cuda.conv2d import conv2d_cuda, schedule_conv2d_nchw_cuda
 
 @autotvm.register_topi_compute(nn.conv2d, 'rocm', ['direct', 'winograd'])
-def conv2d_rocm(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
+def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for rocm backend.
 
     Parameters
@@ -47,29 +47,12 @@ def conv2d_rocm(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
         # handle dilation
         stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
         pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+        dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
 
         OH = (H + 2 * pad_h - KH) // stride_h + 1
         OW = (W + 2 * pad_w - KW) // stride_w + 1
-        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
-
-        dilation_h = dilation_w = 1
-        kernel_before_dilation = kernel
-        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-            kernel_before_dilation = kernel.op.input_tensors[0]
-            if layout == 'NCHW':
-                dilation_h = (get_const_int(kernel.shape[2]) +
-                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
-                dilation_w = (get_const_int(kernel.shape[3]) +
-                              get_const_int(kernel_before_dilation.shape[3]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
-            elif layout == 'NHWC':
-                dilation_h = (get_const_int(kernel.shape[1]) +
-                              get_const_int(kernel_before_dilation.shape[1]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[1])
-                dilation_w = (get_const_int(kernel.shape[2]) +
-                              get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                             // get_const_int(kernel_before_dilation.shape[2])
+        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
+                    ((KW - 1) * dilation_w + 1))
 
         return miopen.conv2d_forward(data,
                                      kernel_before_dilation,
@@ -81,7 +64,7 @@ def conv2d_rocm(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
                                      dilation_w,
                                      conv_mode=0)
 
-    return conv2d_cuda(cfg, data, kernel, strides, padding, layout, out_dtype)
+    return conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype)
 
 
 @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'rocm', ["direct", 'winograd'])
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 3dc6d5e4bab8..afeb99e7051d 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -8,6 +8,7 @@
 from .. import nn
 from ..util import get_const_tuple
 from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload
+from ..nn.dilate import dilate
 from ..nn.pad import pad
 
 from . import conv2d_avx_1x1, conv2d_avx_common
@@ -38,7 +39,7 @@ def _get_default_config(cfg, workload):
         conv2d_avx_common._fallback_schedule(cfg, workload, fp32_vec_len)
 
 
-def _create_tuning_space(cfg, data, kernel, strides, padding, layout):
+def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout):
     """Create schedule configuration from input arguments"""
     dshape = get_const_tuple(data.shape)
     kshape = get_const_tuple(kernel.shape)
@@ -65,28 +66,39 @@ def _create_tuning_space(cfg, data, kernel, strides, padding, layout):
 
 
 @autotvm.register_topi_compute(conv2d, 'cpu', 'direct')
-def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype):
+def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     out_dtype = data.dtype if out_dtype is None else out_dtype
     padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
     strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     if layout == 'NCHW':
-        _create_tuning_space(cfg, data, kernel, strides, padding, layout)
+        _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout)
         if cfg.is_fallback:
             wkl = _get_workload(data, kernel, strides, padding, out_dtype)
             _get_default_config(cfg, wkl)
-        return _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtype)
+        return _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout,
+                                      out_dtype)
     elif layout == 'HWCN':
-        return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype)
+        return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':
-        return nn.conv2d_nhwc(data, kernel, strides, padding, out_dtype)
+        return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
 
-def _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtype):
+def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     out_dtype = data.dtype if out_dtype is None else out_dtype
     assert layout == 'NCHW', "only support NCHW convolution for AVX"
 
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(dilation, int):
+        dilation_h, dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if dilation_h != 1 or dilation_w != 1:
+        kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
+
     HPAD, WPAD = padding
     HSTR, WSTR = strides
 
@@ -251,13 +263,13 @@ def traverse(op):
 @autotvm.task.register("topi_x86_conv2d_NCHWc")
 def _topi_nn_conv2d_NCHWc(*args, **kwargs):
     assert not kwargs, "Do not support kwargs in template function call"
-    data, kernel, strides, padding, origin_layout, dtype = deserialize_args(args)
+    data, kernel, strides, padding, dilation, origin_layout, dtype = deserialize_args(args)
     raw_data_shape = get_const_tuple(data.shape)
     raw_kernel_shape = get_const_tuple(kernel.shape)
 
     # get config here
     cfg = get_config()
-    _create_tuning_space(cfg, data, kernel, strides, padding, origin_layout)
+    _create_tuning_space(cfg, data, kernel, strides, padding, dilation, origin_layout)
 
     # change shape with the value in config
     ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
@@ -271,7 +283,7 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs):
     new_data = tvm.placeholder(new_data_shape, data.dtype)
     new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
 
-    C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding,
+    C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation,
                                 data_layout, out_layout, dtype)
     s = _schedule_conv2d_NCHWc(cfg, [C])
     return s, [new_data, new_kernel, C]
@@ -326,11 +338,13 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
 
 @autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct')
 def _declaration_conv_NCHWc(cfg, data, kernel, strides,
-                            padding, layout, out_layout, out_dtype):
+                            padding, dilation, layout, out_layout, out_dtype):
     # layout and out_layout are not used here,
     # we keep them for debug convenience when dumping autotvm workload
     HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding)
     HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    assert (dh, dw) == (1, 1), "Does not support dilation"
 
     n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
     in_channel = ic_chunk * ic_bn
diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py
index bbd8dc3a6db9..1af7fa4938dd 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -13,8 +13,7 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
 
     A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A')
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
-    dW = topi.nn.dilate(W, (dilation, dilation, 1, 1))
-    B = topi.nn.conv2d_hwcn(A, dW, stride, padding)
+    B = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)
     C = topi.nn.relu(B)
     s1 = topi.cuda.schedule_conv2d_hwcn([B])
     s2 = topi.cuda.schedule_conv2d_hwcn([C])
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
index 93a0587c64ff..cbffda95d8d6 100644
--- a/topi/tests/python/test_topi_conv2d_int8.py
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -15,7 +15,7 @@
 
 
 def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
-    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
 
     in_height = in_width = in_size
 
@@ -63,8 +63,7 @@ def check_device(device):
 
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding),
+            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), (dilation, dilation),
                                layout='NCHW', out_dtype=dtype)
             if add_bias:
                 C = topi.add(C, bias)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index 45dded7953d4..abd1d61c34ed 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -11,7 +11,7 @@
 from common import get_all_backend
 
 def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
-    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
 
     in_height = in_width = in_size
 
@@ -47,9 +47,8 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding),
-                               layout='NCHW', out_dtype=dtype)
+            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
+                               (dilation, dilation), layout='NCHW', out_dtype=dtype)
             if add_bias:
                 C = topi.add(C, bias)
             if add_relu:
diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py
index ba52251c4f5b..af55f5bc172c 100644
--- a/topi/tests/python/test_topi_conv2d_nhwc.py
+++ b/topi/tests/python/test_topi_conv2d_nhwc.py
@@ -13,18 +13,17 @@ def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, p
 
     A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A')
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
-    dW = topi.nn.dilate(W, (1, dilation, dilation, 1))
-    B = topi.nn.conv2d_nhwc(A, dW, stride, padding)
+    B = topi.nn.conv2d_nhwc(A, W, stride, padding, dilation)
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
     dtype = A.dtype
 
-    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc")
+    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc.v2")
     def get_ref_data():
         a_np = np.random.uniform(size=a_shape).astype(dtype)
         w_np = np.random.uniform(size=w_shape).astype(dtype)
-        dw_np = topi.testing.dilate_python(w_np, (1, dilation, dilation, 1))
+        dw_np = topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
         b_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
         return a_np, w_np, b_np
     a_np, w_np, b_np = get_ref_data()
diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py
index 1666bc24991c..1ca7240a41b0 100644
--- a/topi/tests/python/test_topi_conv2d_winograd.py
+++ b/topi/tests/python/test_topi_conv2d_winograd.py
@@ -11,7 +11,7 @@
 
 
 def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
-    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
 
     in_height = in_width = in_size
 
@@ -47,8 +47,7 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            C = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW', out_dtype=dtype)
+            C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype)
             if add_bias:
                 C = topi.add(C, bias)
             if add_relu:
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 51f2c418c121..a5dd6d328f07 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -26,7 +26,6 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     # placeholder
     Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input')
     Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter')
-    DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
 
@@ -40,8 +39,8 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             # declare
-            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter,
-                (stride_h, stride_w), padding_args, dtype)
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter,
+                (stride_h, stride_w), padding_args, dilation, dtype)
             ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
             Relu = topi.nn.relu(ScaleShift)
             # schedule
@@ -123,7 +122,6 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     # placeholder
     Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input')
     Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter')
-    DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
 
@@ -138,8 +136,8 @@ def check_device(device):
 
         with tvm.target.create(device):
             # declare
-            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter,
-                (stride_h, stride_w), padding_args, dtype)
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter,
+                (stride_h, stride_w), padding_args, dilation, dtype)
             ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
             Relu = topi.nn.relu(ScaleShift)
             # schedule
@@ -159,11 +157,11 @@ def check_device(device):
         scale_shift_shape = get_const_tuple(ScaleShift.shape)
 
         # Use memoize, pickle the test data for next time use.
-        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc")
+        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc.v2")
         def get_ref_data():
             input_np = np.random.uniform(size=input_shape).astype(dtype)
             filter_np = np.random.uniform(size=filter_shape).astype(dtype)
-            dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation))
+            dilated_filter_np = topi.testing.dilate_python(filter_np, (dilation, dilation, 1, 1))
             scale_np = np.random.uniform(size=scale_shape).astype(dtype)
             shift_np = np.random.uniform(size=shift_shape).astype(dtype)
             # correctness with scipy
@@ -232,7 +230,8 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID")
     # dilation = 2
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
+    # disabled because it uses too large shared memory on cuda
+    # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
 
 if __name__ == "__main__":
     test_depthwise_conv2d()
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index a09c7d51869e..347aa4207c9b 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -68,7 +68,7 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
 
     data = tvm.placeholder((N, CI, H, W), name='data')
     kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32')
+    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
     s = tvm.create_schedule([conv.op])
 
     ##### space definition begin #####
diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py
index c8ecbf848792..8b8124c95e2b 100644
--- a/tutorials/topi/intro_topi.py
+++ b/tutorials/topi/intro_topi.py
@@ -117,7 +117,7 @@
 kernel = tvm.placeholder((10, 3, 5, 5))
 
 with tvm.target.create("cuda"):
-    conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
+    conv = topi.nn.conv2d(data, kernel, strides=1, padding=2, dilation=1)
     out = topi.nn.relu(conv)
     sconv = topi.generic.nn.schedule_conv2d_nchw(out)
     print(tvm.lower(sconv, [data, kernel], simple_mode=True))
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index 4bc0a8844a4b..6915ff8285ba 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -33,6 +33,7 @@ def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True):
         res_conv = topi.nn.conv2d(
             data, kernel, padding=(wl.hpad, wl.wpad),
             strides=(wl.hstride, wl.wstride),
+            dilation=(1, 1),
             out_dtype="int32")
         res = topi.right_shift(res_conv, 8)
         res = my_clip(res, 0, 127)

From c4ada6cfbeb4fdb6cbd615d2e76cf08448dca228 Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Wed, 31 Oct 2018 15:14:01 -0700
Subject: [PATCH 320/529] [NNVM/TOPI][OP] gather_nd (#2041)

---
 docs/api/python/topi.rst                      |   2 +
 docs/nnvm_top.rst                             |   2 +
 nnvm/python/nnvm/frontend/mxnet.py            |   2 +-
 nnvm/python/nnvm/top/transform.py             |   4 +
 nnvm/src/top/tensor/transform.cc              | 116 ++++++++++++++++--
 nnvm/tests/python/compiler/test_top_level1.py |  31 +++++
 .../tests/python/unittest/test_infer_shape.py |  21 ++++
 topi/include/topi/transform.h                 |  54 ++++++++
 topi/python/topi/testing/__init__.py          |   1 +
 topi/python/topi/testing/gather_nd_python.py  |  36 ++++++
 topi/python/topi/transform.py                 |  18 +++
 topi/src/topi.cc                              |   5 +
 topi/tests/python/test_topi_transform.py      |  49 ++++++++
 13 files changed, 333 insertions(+), 8 deletions(-)
 create mode 100644 topi/python/topi/testing/gather_nd_python.py

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 767dfe1ba844..886822475db9 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -30,6 +30,7 @@ List of operators
    topi.concatenate
    topi.split
    topi.take
+   topi.gather_nd
    topi.full
    topi.full_like
    topi.nn.relu
@@ -103,6 +104,7 @@ topi
 .. autofunction:: topi.concatenate
 .. autofunction:: topi.split
 .. autofunction:: topi.take
+.. autofunction:: topi.gather_nd
 .. autofunction:: topi.full
 .. autofunction:: topi.full_like
 .. autofunction:: topi.max
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index be1077f664c3..717ce985e002 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -61,6 +61,7 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.flip
    nnvm.symbol.lrn
    nnvm.symbol.where
+   nnvm.symbol.gather_nd
 
 
 **Level 2: Convolutions**
@@ -197,6 +198,7 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.flip
 .. autofunction:: nnvm.symbol.lrn
 .. autofunction:: nnvm.symbol.where
+.. autofunction:: nnvm.symbol.gather_nd
 
 .. autofunction:: nnvm.symbol.conv2d
 .. autofunction:: nnvm.symbol.conv2d_transpose
diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index d1c2f305c27d..bf55af2a36f0 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -290,7 +290,7 @@ def _zeros(_, attrs):
                   'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp',
                   'flatten', 'log', 'log_softmax', 'max', 'min', 'negative',
                   'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax',
-                  'sum', 'tanh', 'transpose', 'zeros_like']
+                  'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd']
 
 _convert_map = {
     '_copy'         : _rename('copy'),
diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py
index 594007239d4a..8fde9632a8af 100644
--- a/nnvm/python/nnvm/top/transform.py
+++ b/nnvm/python/nnvm/top/transform.py
@@ -86,3 +86,7 @@ def schedule_concatenate(_, outs, target):
 # where
 reg.register_pattern("where", OpPattern.INJECTIVE)
 reg.register_schedule("where", _fschedule_injective)
+
+# gather_nd
+reg.register_pattern("gather_nd", OpPattern.INJECTIVE)
+reg.register_schedule("gather_nd", _fschedule_injective)
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 8e35039a8085..f643f8891728 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -1003,7 +1003,7 @@ Examples::
        [ 3, 4]]
 
   flip(x) = [[ 3.,  4.],
-                  [ 1.,  2.]]
+             [ 1.,  2.]]
 
   x = [[[ 1.,  2.],
         [ 3.,  4.]],
@@ -1012,16 +1012,16 @@ Examples::
         [ 7.,  8.]]]
 
   flip(x) = [[[ 5.,  6.],
-                   [ 7.,  8.]],
+              [ 7.,  8.]],
 
-                  [[ 1.,  2.],
-                   [ 3.,  4.]]]
+             [[ 1.,  2.],
+              [ 3.,  4.]]]
 
   flip(x, axis=1) = [[[ 3.,  4.],
-                                 [ 1.,  2.]],
+                      [ 1.,  2.]],
 
-                                [[ 7.,  8.],
-                                 [ 5.,  6.]]]
+                     [[ 7.,  8.],
+                      [ 5.,  6.]]]
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "Source input")
 .add_arguments(FlipParam::__FIELDS__())
@@ -1353,5 +1353,107 @@ Examples::
 })
 .set_support_level(4);
 
+// gather_nd
+inline bool GatherNDInferShape(const nnvm::NodeAttrs& attrs,
+                               std::vector<TShape>* in_attrs,
+                               std::vector<TShape>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const TShape& data_shape = in_attrs->at(0);
+  const TShape& indices_shape = in_attrs->at(1);
+  CHECK_GT(indices_shape.ndim(), 1) << "indices must have at least 2 dimensions";
+  CHECK_LE(indices_shape[0], data_shape.ndim()) <<
+      "dim 0 of indices must be no more than rank of data";
+  std::vector<dim_t> oshape;
+  for (size_t i = 1; i < indices_shape.ndim(); ++i) {
+    oshape.push_back(indices_shape[i]);
+  }
+  for (size_t i = indices_shape[0]; i < data_shape.ndim(); ++i) {
+    oshape.push_back(data_shape[i]);
+  }
+  if (oshape.size() == 0) {
+    oshape.push_back(1);
+  }
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0,
+                           TShape(oshape.begin(), oshape.end()));
+  return true;
+}
+
+inline bool GatherNDInferType(const NodeAttrs &attrs,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, (*in_attrs)[0]);
+  return true;
+}
+
+inline bool GatherNDCorrectLayout(const NodeAttrs& attrs,
+                                  std::vector<Layout> *ilayouts,
+                                  const std::vector<Layout> *last_ilayouts,
+                                  std::vector<Layout> *olayouts) {
+  CHECK_EQ(ilayouts->size(), last_ilayouts->size());
+  CHECK_EQ(olayouts->size(), 1U);
+
+  for (size_t i = 0; i < ilayouts->size(); ++i) {
+    const Layout& input = last_ilayouts->at(i).defined() ?
+                          last_ilayouts->at(i) : ilayouts->at(i);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, i, input);
+  }
+
+  return true;
+}
+
+NNVM_REGISTER_OP(gather_nd)
+.describe(R"code(
+Gather elements or slices from ``data`` into a tensor specified by ``indices``.
+
+The shape of output tensor is inferred from ``indices``. Given ``data`` with
+shape ``(X0, X1, ..., X_{N-1})`` and ``indices`` with shape ``(Y_0, ...,
+Y_{M-1})``, the output will have shape ``(Y_1, ..., Y_{M-1}, X_{Y_0}, ...,
+X_{N-1})`` when ``Y_0 < N``, or ``(Y_1, ..., Y_{M-1})`` when ``Y_0 == N``. The
+operator is invalid when ``Y_0 > N``.
+
+The element in output is defined as follows::
+
+  output[y_1, ..., y_{M-1}, x_{Y_0}, ..., x_{N-1}] = data[indices[0, y_1, ..., y_{M-1}],
+                                                     ...,
+                                                     indices[Y_0-1, y_1, ..., y_{M-1}],
+                                                     x_{Y_0}, ..., x_{N-1}]
+
+Examples::
+
+  data = [[0, 1], [2, 3]]
+  indices = [[1], [0]]
+  gather_nd(data, indices) = [2]
+
+  data = [[0, 1], [2, 3]]
+  indices = [[1, 1, 0], [0, 1, 0]]
+  gather_nd(data, indices) = [2, 3, 0]
+
+  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+  indices = [[0, 1], [1, 0]]
+  gather_nd(data, indices) = [[3, 4], [5, 6]]
+
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("indices", "Tensor", "Indices of data")
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<FInferShape>("FInferShape", GatherNDInferShape)
+.set_attr<FInferType>("FInferType", GatherNDInferType)
+.set_attr<FCorrectLayout>("FCorrectLayout", GatherNDCorrectLayout)
+.set_attr<FTVMCompute>(
+    "FTVMCompute", [](const NodeAttrs& attrs,
+                      const Array<Tensor>& inputs,
+                      const Array<Tensor>& out_info) {
+      return Array<Tensor>{
+        topi::gather_nd(inputs[0], inputs[1]) };
+  })
+.set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "indices"};
+})
+.set_support_level(3);
+
 }  // namespace top
 }  // namespace nnvm
diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py
index 089ae84cd2b8..d89bf359f2ac 100644
--- a/nnvm/tests/python/compiler/test_top_level1.py
+++ b/nnvm/tests/python/compiler/test_top_level1.py
@@ -533,6 +533,36 @@ def test_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1,))
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2))
 
+def verify_gather_nd(src_shape, indices_src):
+    src_dtype = "float32"
+    indices_dtype = "int32"
+    indices_src = np.array(indices_src, dtype=indices_dtype)
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
+    y = sym.gather_nd(a, indices)
+
+    def forward(a, indices):
+        return topi.testing.gather_nd_python(a, indices)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src, 'indices': indices_src})
+
+def test_gather_nd():
+    verify_gather_nd((4,), [[1]])
+    verify_gather_nd((4,), [[1, 3, 2]])
+    verify_gather_nd((2, 3), [[1]])
+    verify_gather_nd((2, 3), [[1], [0]])
+    verify_gather_nd((2, 3), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]])
+    verify_gather_nd((2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]],
+                                 [[3, 1], [0, 2]]])
+    verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]])
+
 if __name__ == "__main__":
     test_check_function()
     test_split()
@@ -556,3 +586,4 @@ def test_l2_normalize():
     test_lrn()
     test_l2_normalize()
     test_strided_slice()
+    test_gather_nd()
diff --git a/nnvm/tests/python/unittest/test_infer_shape.py b/nnvm/tests/python/unittest/test_infer_shape.py
index eee8c3bdcacb..bbd92cea7b5f 100644
--- a/nnvm/tests/python/unittest/test_infer_shape.py
+++ b/nnvm/tests/python/unittest/test_infer_shape.py
@@ -356,6 +356,26 @@ def check(in_shape, out_shape, **kwargs):
     check((4, 5, 10), (1, 5, 1), axis=(0, 2), keepdims=True)
 
 
+def test_gather_nd():
+    def check(data_shape, indices_shape, out_shape):
+        x = sym.Variable("x", shape=data_shape)
+        indices = sym.Variable("indices", shape=indices_shape)
+        y = sym.gather_nd(x, indices, name="y")
+        sdict = infer_shape(y)
+        assert(tuple(sdict["y"][0]) == tuple(out_shape))
+
+    check((4,), (1, 1), (1,))
+    check((4,), (1, 3), (3,))
+    check((2, 3), (1, 1), (1, 3))
+    check((2, 3), (2, 1), (1,))
+    check((2, 3), (2, 5, 6), (5, 6))
+    check((2, 3, 4), (1, 1), (1, 3, 4))
+    check((2, 3, 4), (2, 1), (1, 4))
+    check((2, 3, 4), (2, 5), (5, 4))
+    check((2, 3, 4), (2, 5, 6), (5, 6, 4))
+    check((2, 3, 4, 5), (2, 6, 7), (6, 7, 4, 5))
+
+
 if __name__ == "__main__":
     test_conv2d_packed()
     test_expand_dims()
@@ -376,3 +396,4 @@ def check(in_shape, out_shape, **kwargs):
     test_transpose()
     test_prelu()
     test_squeeze()
+    test_gather_nd()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 756aa2ec3b49..7fc408c2c79c 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -639,6 +639,60 @@ inline Tensor where(const Tensor& condition,
   return out;
 }
 
+/*!
+* \brief Gather elements from a n-dimension array.
+*
+* \param data The source array.
+* \param indices The indices of the values to extract.
+* \param name The name of the operation.
+* \param tag The tag to mark the operation.
+*
+* \return A Tensor whose op member is the gather_nd operation
+*/
+inline Tensor gather_nd(const Tensor& data,
+                        const Tensor& indices,
+                        std::string name = "tensor",
+                        std::string tag = kInjective) {
+  size_t ndim_d = data->shape.size();
+  size_t ndim_i = indices->shape.size();
+  CHECK_GT(ndim_i, 1) << "indices tensor must have at least 2 dimensions";
+  size_t indices_dim0 = static_cast<size_t>(GetConstInt(indices->shape[0]));
+  CHECK_LE(indices_dim0, ndim_d) << "dim 0 of indices tensor must be no more "
+                                 << "than dimensions of data tensor";
+  Array<Expr> out_shape;
+  for (size_t i = 1; i < ndim_i; ++i) {
+    out_shape.push_back(indices->shape[i]);
+  }
+  for (size_t i = indices_dim0; i < ndim_d; ++i) {
+    out_shape.push_back(data->shape[i]);
+  }
+  if (out_shape.size() == 0) {
+    out_shape.push_back(make_const(Int(32), 1));
+  }
+  return compute(
+        out_shape, [&](const Array<Var>& out_index) {
+          Array<Expr> indices_position;
+          indices_position.push_back(0);
+          for (size_t i = 0; i < ndim_i - 1; ++i) {
+            indices_position.push_back(out_index[i]);
+          }
+          Array<Expr> real_indices;
+          for (size_t i = 0; i < indices_dim0; ++i) {
+            indices_position.Set(0, make_const(Int(32), i));
+            if (indices->dtype.is_int()) {
+              real_indices.push_back(indices(indices_position));
+            } else {
+              real_indices.push_back(
+                  tvm::cast(tvm::Int(32), indices(indices_position)));
+            }
+          }
+          for (size_t i = ndim_i - 1; i < out_index.size(); ++i) {
+            real_indices.push_back(out_index[i]);
+          }
+          return data(real_indices);
+        }, name, tag);
+}
+
 /*!
  * \brief Creates an operation that calculates a matrix multiplication
  *  (row-major notation):
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index c91eea7958ea..8a3269ba83ae 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -18,3 +18,4 @@
 from .shortcut_python import shortcut_python
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
+from .gather_nd_python import gather_nd_python
diff --git a/topi/python/topi/testing/gather_nd_python.py b/topi/python/topi/testing/gather_nd_python.py
new file mode 100644
index 000000000000..e2d74cfee1fd
--- /dev/null
+++ b/topi/python/topi/testing/gather_nd_python.py
@@ -0,0 +1,36 @@
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
+"""gather_nd in python"""
+import numpy as np
+
+def gather_nd_python(a_np, indices_np):
+    """ Python version of GatherND operator
+
+    Parameters
+    ----------
+    a_np : numpy.ndarray
+        Numpy array
+
+    indices_np : numpy.ndarray
+        Numpy array
+
+    Returns
+    -------
+    b_np : numpy.ndarray
+        Numpy array
+    """
+    a_shape = a_np.shape
+    indices_np = indices_np.astype('int32')
+    indices_shape = indices_np.shape
+    assert len(indices_shape) > 1
+    assert indices_shape[0] <= len(a_shape)
+    b_shape = list(indices_shape[1:])
+    for i in range(indices_shape[0], len(a_shape)):
+        b_shape.append(a_shape[i])
+    b_np = np.zeros(b_shape)
+    for idx in np.ndindex(*indices_shape[1:]):
+        a_idx = []
+        for i in range(indices_shape[0]):
+            indices_pos = tuple([i] + list(idx))
+            a_idx.append(indices_np[indices_pos])
+        b_np[idx] = a_np[tuple(a_idx)]
+    return b_np
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index 311b0facabdb..46929c6eb1f8 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -240,6 +240,24 @@ def take(a, indices, axis=None):
     return cpp.take(a, indices, int(axis))
 
 
+def gather_nd(a, indices):
+    """Gather elements from a n-dimension array..
+
+    Parameters
+    ----------
+    a : tvm.Tensor
+        The source array.
+
+    indices : tvm.Tensor
+        The indices of the values to extract.
+
+    Returns
+    -------
+    ret : tvm.Tensor
+    """
+    return cpp.gather_nd(a, indices)
+
+
 def matmul(a, b, transp_a=False, transp_b=False):
     """
     Creates an operation that calculates a matrix multiplication (row-major notation):
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 2d9f2fd6c6b2..b47ba1165eb9 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -291,6 +291,11 @@ TVM_REGISTER_GLOBAL("topi.where")
   *rv = where(args[0], args[1], args[2]);
 });
 
+TVM_REGISTER_GLOBAL("topi.gather_nd")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = gather_nd(args[0], args[1]);
+});
+
 TVM_REGISTER_GLOBAL("topi.matmul")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   switch ( args.size() ) {
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 5c810f85e4c6..75e4d3b675b0 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -2,6 +2,7 @@
 import numpy as np
 import tvm
 import topi
+import topi.testing
 
 from common import get_all_backend
 
@@ -275,6 +276,38 @@ def check_device(device):
     for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
+def verify_gather_nd(src_shape, indices_src, indices_dtype):
+    src_dtype = "float32"
+    indices_src = np.array(indices_src, dtype=indices_dtype)
+    A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A")
+    indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
+    out_tensor = topi.gather_nd(a=A, indices=indices)
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            s = topi.generic.schedule_injective(out_tensor)
+
+        func = tvm.build(s, [A, indices, out_tensor] , device, name="take")
+        shape_size = 1
+        for i in range(len(src_shape)):
+            shape_size = shape_size * src_shape[i]
+        data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
+        out_npys = topi.testing.gather_nd_python(data_npy, indices_src)
+        
+        data_nd = tvm.nd.array(data_npy, ctx)
+        indices_nd = tvm.nd.array(indices_src, ctx)
+        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        func(data_nd, indices_nd, out_nd)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+
+    for device in get_all_backend():
+        check_device(device)
+
 def test_strided_slice():
     verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
     verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
@@ -363,6 +396,21 @@ def test_take():
     verify_take((2,2), [[[1,0],[0,1]]], 1)
     verify_take((4,3,5,6), [[2,1,0,0]], -2)
 
+def test_gather_nd():
+    for indices_dtype in ['int32', 'float32']:
+        verify_gather_nd((4,), [[1.8]], indices_dtype)
+        verify_gather_nd((4,), [[1, 3, 2]], indices_dtype)
+        verify_gather_nd((2, 3), [[1]], indices_dtype)
+        verify_gather_nd((2, 3), [[1], [0]], indices_dtype)
+        verify_gather_nd((2, 3), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]],
+                                     [[3, 1], [0, 2]]], indices_dtype)
+        verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]],
+                         indices_dtype)
+
 if __name__ == "__main__":
     test_concatenate()
     test_tranpose()
@@ -374,3 +422,4 @@ def test_take():
     test_expand_like()
     test_take()
     test_strided_slice()
+    test_gather_nd()

From 801ab88dd51727fe891d319c047aa545387e39d2 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Wed, 31 Oct 2018 19:33:59 -0700
Subject: [PATCH 321/529] [Cleanliness] [Easy] Make TVM leak-sanitizer and
 Wnon-virtual-dtor clean. (#2046)

---
 src/codegen/codegen_source_base.h  |  1 +
 src/codegen/llvm/codegen_amdgpu.cc |  4 ++--
 src/codegen/llvm/codegen_nvptx.cc  |  4 ++--
 src/codegen/llvm/llvm_common.cc    |  4 ++--
 src/codegen/llvm/llvm_common.h     |  2 +-
 src/codegen/llvm/llvm_module.cc    | 12 ++++++------
 src/runtime/dsl_api.h              |  1 +
 src/runtime/registry.cc            |  7 +++++--
 8 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/codegen/codegen_source_base.h b/src/codegen/codegen_source_base.h
index 89c5bbc05ce4..d2f80a538a33 100644
--- a/src/codegen/codegen_source_base.h
+++ b/src/codegen/codegen_source_base.h
@@ -23,6 +23,7 @@ namespace codegen {
  */
 class CodeGenSourceBase {
  public:
+  virtual ~CodeGenSourceBase() = default;
   /*!
    * \brief Register constant value appeared in expresion tree
    *  This avoid generated a ssa id for each appearance of the value
diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index dd2cf6714251..9cccdf4466fd 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -160,10 +160,10 @@ runtime::Module BuildAMDGPU(Array<LoweredFunc> funcs, std::string target) {
   config << "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx"
          << DetectROCMComputeVersion(target)
          << target.substr(4, target.length() - 4);
-  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
+  std::unique_ptr<llvm::TargetMachine> tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenAMDGPU> cg(new CodeGenAMDGPU());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
-  cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
+  cg->Init(funcs[0]->name, tm.get(), ctx.get(), false, false);
   for (LoweredFunc f :  funcs) {
     cg->AddFunction(f);
   }
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index fc5ad99119ae..6bc6ccaff582 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -171,10 +171,10 @@ runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
   config << "-mtriple=nvptx64-nvidia-cuda -mcpu=sm_"
          << compute_ver
          << target.substr(5, target.length() - 5);
-  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
+  std::unique_ptr<llvm::TargetMachine> tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenNVPTX> cg(new CodeGenNVPTX());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
-  cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
+  cg->Init(funcs[0]->name, tm.get(), ctx.get(), false, false);
   for (LoweredFunc f :  funcs) {
     cg->AddFunction(f);
   }
diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc
index 9d1ba6b1068f..48c3e788a7f2 100644
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -114,7 +114,7 @@ void ParseLLVMTargetOptions(const std::string& target_str,
 }
 
 
-llvm::TargetMachine*
+std::unique_ptr<llvm::TargetMachine>
 GetLLVMTargetMachine(const std::string& target_str,
                      bool allow_null) {
   std::string target_triple, mcpu, mattr;
@@ -143,7 +143,7 @@ GetLLVMTargetMachine(const std::string& target_str,
   }
   llvm::TargetMachine* tm = target->createTargetMachine(
       target_triple, mcpu, mattr, opt, llvm::Reloc::PIC_);
-  return tm;
+  return std::unique_ptr<llvm::TargetMachine>(tm);
 }
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index d5d27bf83d71..9f162ee5c6d7 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -78,7 +78,7 @@ void ParseLLVMTargetOptions(const std::string& target_str,
  * \param allow_null Whether allow null to be returned.
  * \return target machine
  */
-llvm::TargetMachine*
+std::unique_ptr<llvm::TargetMachine>
 GetLLVMTargetMachine(const std::string& target_str, bool allow_null = false);
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 54f986d628d3..495f9982022a 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -160,9 +160,9 @@ class LLVMModuleNode final : public runtime::ModuleNode {
     bool system_lib = (target.find("-system-lib") != std::string::npos);
     CHECK_NE(funcs.size(), 0U);
     ctx_ = std::make_shared<llvm::LLVMContext>();
-    std::unique_ptr<CodeGenLLVM> cg = CodeGenLLVM::Create(tm_);
+    std::unique_ptr<CodeGenLLVM> cg = CodeGenLLVM::Create(tm_.get());
     entry_func_ = funcs[0]->name;
-    cg->Init(funcs[0]->name, tm_, ctx_.get(), system_lib, system_lib);
+    cg->Init(funcs[0]->name, tm_.get(), ctx_.get(), system_lib, system_lib);
     for (LoweredFunc f :  funcs) {
       cg->AddFunction(f);
     }
@@ -218,8 +218,8 @@ class LLVMModuleNode final : public runtime::ModuleNode {
       builder.setMAttrs(mattrs);
     }
     builder.setTargetOptions(opt);
-    llvm::TargetMachine *tm = builder.selectTarget();
-    llvm::TargetMachine *tm_sys = GetLLVMTargetMachine("llvm");
+    auto tm = std::unique_ptr<llvm::TargetMachine>(builder.selectTarget());
+    std::unique_ptr<llvm::TargetMachine> tm_sys = GetLLVMTargetMachine("llvm");
     if (tm_sys->getTargetTriple().getArch() != tm->getTargetTriple().getArch()) {
       LOG(FATAL) << "Cannot run module, architecture mismatch "
                  << " module=" << tm->getTargetTriple().str()
@@ -231,7 +231,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
         << mptr_->getDataLayout().getStringRepresentation() << ")"
         << " and ExecutionEngine ("
         << layout.getStringRepresentation() << ")";
-    ee_ = builder.create(tm);
+    ee_ = builder.create(tm.release());
     CHECK(ee_ != nullptr)
         << "Failed to initialize git engine for " << mptr_->getTargetTriple();
     ee_->runStaticConstructorsDestructors(false);
@@ -275,7 +275,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
   // The raw pointer to the module.
   llvm::Module* mptr_{nullptr};
   // The target machine
-  llvm::TargetMachine* tm_{nullptr};
+  std::unique_ptr<llvm::TargetMachine> tm_{nullptr};
   // The module, can be moved to ee if JIT is enabled.
   std::unique_ptr<llvm::Module> module_;
   // the context.
diff --git a/src/runtime/dsl_api.h b/src/runtime/dsl_api.h
index a1d6e48ceb2f..3e1299bd8c96 100644
--- a/src/runtime/dsl_api.h
+++ b/src/runtime/dsl_api.h
@@ -16,6 +16,7 @@ namespace runtime {
  */
 class DSLAPI {
  public:
+  virtual ~DSLAPI() = default;
   virtual void NodeFree(NodeHandle handle) const = 0;
 
   virtual void NodeTypeKey2Index(const char* type_key,
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index d7bbc3ce9996..3c792fdb9063 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -34,8 +34,11 @@ struct Registry::Manager {
   }
 
   static Manager* Global() {
-    static Manager inst;
-    return &inst;
+    // We deliberately leak the Manager instance, to avoid leak sanitizers
+    // complaining about the entries in Manager::fmap being leaked at program
+    // exit.
+    static Manager* inst = new Manager();
+    return inst;
   }
 };
 

From 3fd770b68651b234a7f206ae134b49e84a4863e0 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Wed, 31 Oct 2018 19:41:47 -0700
Subject: [PATCH 322/529] [RELAY][RUNTIME] Refactor interpreter and
 graph_runtime into consistent interface. (#2042)

---
 python/tvm/relay/__init__.py              |   2 +
 python/tvm/relay/build_module.py          |  46 ++++++
 python/tvm/relay/expr.py                  | 112 +++++++++++++
 python/tvm/relay/graph_runtime_codegen.py | 189 +---------------------
 python/tvm/relay/interpreter.py           | 154 +++++++++++++-----
 tests/python/relay/test_graph_runtime.py  |  21 ++-
 tests/python/relay/test_interpreter.py    |  12 +-
 7 files changed, 293 insertions(+), 243 deletions(-)
 create mode 100644 python/tvm/relay/build_module.py

diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index d3b60c1174fa..b0a1fcec509e 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -7,6 +7,8 @@
 from . import expr
 from . import env
 from . import ir_pass
+from .build_module import build
+from .interpreter import create_executor
 
 # Root operators
 from .op import Op
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
new file mode 100644
index 000000000000..6b60fd3f17fe
--- /dev/null
+++ b/python/tvm/relay/build_module.py
@@ -0,0 +1,46 @@
+"""
+Construct the necessary state for the TVM graph runtime
+from a Relay expression.
+"""
+from ..build_module import build as tvm_build_module
+from . graph_runtime_codegen import GraphRuntimeCodegen
+from . import ir_pass
+from .env import Environment
+
+def build(func, params=None, target=None, env=None):
+    """
+    Compile a single function to the components needed by the
+    TVM RTS.
+
+    Parameters
+    ----------
+    func: relay.Expr
+        The function to build.
+
+    target: optional str
+        The target platform.
+
+    Returns
+    -------
+    (graph_json, mod, params): tuple of (str, tvm.Module, dict)
+        The outputs of building a Relay function for the TVM runtime.
+
+    """
+    if target is None:
+        target = 'llvm'
+
+    if env is None:
+        env = Environment({})
+
+    comp = GraphRuntimeCodegen(env)
+    # NB(@jroesch) This creates lowered functions, and generates names for them
+    #
+    # We need these names to emit the correct graph as these are names of the
+    # functions contained in the module.
+    lowered_ops = ir_pass.lower_ops(env, func)
+    mod = tvm_build_module([lf.lowered_func for lf in lowered_ops], target)
+
+    # Therefore the call to compile must come after.
+    comp.codegen(func)
+    graph_json = comp.to_json()
+    return graph_json, mod, params
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 43ec46d35a82..dd9477aa9580 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -319,6 +319,118 @@ def __init__(self, tuple_value, index):
         self.__init_handle_by_constructor__(
             _make.TupleGetItem, tuple_value, index)
 
+class ExprFunctor(object):
+    """
+    An abstract visitor defined over Expr.
+
+    A Python version of the class defined in `expr_functor.h`.
+
+    Defines the default dispatch over expressions, and
+    implements memoization.
+    """
+    def __init__(self):
+        self.memo_map = {}
+
+    # pylint: disable=no-else-return
+    def visit(self, expr):
+        """Apply the visitor to an expression."""
+        found = self.memo_map.get(expr)
+        if found:
+            return found
+
+        if isinstance(expr, Function):
+            res = self.visit_function(expr)
+        elif isinstance(expr, Call):
+            res = self.visit_call(expr)
+        elif isinstance(expr, Let):
+            res = self.visit_let(expr)
+        elif isinstance(expr, Var):
+            res = self.visit_var(expr)
+        elif isinstance(expr, GlobalVar):
+            res = self.visit_global_var(expr)
+        elif isinstance(expr, If):
+            res = self.visit_if(expr)
+        elif isinstance(expr, Tuple):
+            res = self.visit_tuple(expr)
+        elif isinstance(expr, Constant):
+            res = self.visit_constant(expr)
+        else:
+            raise Exception("warning unhandled case: {0}".format(type(expr)))
+
+        self.memo_map[expr] = res
+        return res
+
+    def visit_function(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_let(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_call(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_var(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_type(self, typ):
+        return typ
+
+    def visit_if(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_tuple(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_constant(self, _):
+        raise Exception("Abstract method please implement me.")
+
+    def visit_global_var(self, _):
+        raise Exception("Abstract method please implement me.")
+
+
+class ExprMutator(ExprFunctor):
+    """
+    A functional visitor over Expr.
+
+    The default behavior recursively traverses the AST
+    and reconstructs the AST.
+    """
+
+    def visit_function(self, fn):
+        new_body = self.visit(fn.body)
+        return Function(
+            list(fn.params),
+            fn.ret_type, new_body,
+            fn.type_params)
+
+    def visit_let(self, let):
+        new_var = self.visit(let.var)
+        new_val = self.visit(let.value)
+        new_body = self.visit(let.body)
+        return Let(new_var, new_val, new_body)
+
+    def visit_call(self, call):
+        new_fn = self.visit(call.op)
+        new_args = [self.visit(arg) for arg in call.args]
+        return Call(new_fn, new_args, call.attrs)
+
+    def visit_var(self, rvar):
+        return rvar
+
+    def visit_global_id(self, global_var):
+        return global_var
+
+    def visit_if(self, ite):
+        return If(
+            self.visit(ite.guard),
+            self.visit(ite.true_b),
+            self.visit(ite.false_b))
+
+    def visit_tuple(self, tup):
+        return Tuple([self.visit(field) for field in tup.fields])
+
+    def visit_constant(self, rconst):
+        return rconst
 
 class TupleWrapper(object):
     """TupleWrapper.
diff --git a/python/tvm/relay/graph_runtime_codegen.py b/python/tvm/relay/graph_runtime_codegen.py
index d0ce239fa7fd..3fd408a58f0d 100644
--- a/python/tvm/relay/graph_runtime_codegen.py
+++ b/python/tvm/relay/graph_runtime_codegen.py
@@ -25,113 +25,7 @@
 import attr
 from . import ir_pass
 from .op import Op
-from .expr import Var, Function, Call, If, GlobalVar, Constant, Let, Tuple
-from ..build_module import build as tvm_build_module
-from .. contrib import graph_runtime
-from .ir_pass import infer_type
-from .. import cpu
-
-class AbstractExprVisitor(object):
-    """A visitor over Expr in Python."""
-
-    def __init__(self):
-        self.memo_map = {}
-
-    # pylint: disable=no-else-return
-    def visit(self, expr):
-        """Apply the visitor to an expression."""
-        found = self.memo_map.get(expr)
-        if found:
-            return found
-
-        if isinstance(expr, Function):
-            res = self.visit_function(expr)
-        elif isinstance(expr, Call):
-            res = self.visit_call(expr)
-        elif isinstance(expr, Let):
-            res = self.visit_let(expr)
-        elif isinstance(expr, Var):
-            res = self.visit_var(expr)
-        elif isinstance(expr, GlobalVar):
-            res = self.visit_global_var(expr)
-        elif isinstance(expr, If):
-            res = self.visit_if(expr)
-        elif isinstance(expr, Tuple):
-            res = self.visit_tuple(expr)
-        elif isinstance(expr, Constant):
-            res = self.visit_constant(expr)
-        else:
-            raise Exception("warning unhandled case: {0}".format(type(expr)))
-
-        self.memo_map[expr] = res
-        return res
-
-    def visit_function(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_let(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_call(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_var(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_type(self, typ):
-        return typ
-
-    def visit_if(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_tuple(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_constant(self, _):
-        raise Exception("Abstract method please implement me.")
-
-    def visit_global_var(self, _):
-        raise Exception("Abstract method please implement me.")
-
-
-class ExprMutator(AbstractExprVisitor):
-    """A functional visitor over Expr in Python."""
-
-    def visit_function(self, fn):
-        new_body = self.visit(fn.body)
-        return Function(
-            list(fn.params),
-            fn.ret_type, new_body,
-            fn.type_params)
-
-    def visit_let(self, let):
-        new_var = self.visit(let.var)
-        new_val = self.visit(let.value)
-        new_body = self.visit(let.body)
-        return Let(new_var, new_val, new_body)
-
-    def visit_call(self, call):
-        new_fn = self.visit(call.op)
-        new_args = [self.visit(arg) for arg in call.args]
-        return Call(new_fn, new_args, call.attrs)
-
-    def visit_var(self, var):
-        return var
-
-    def visit_global_id(self, global_var):
-        return global_var
-
-    def visit_if(self, ite):
-        return If(
-            self.visit(ite.guard),
-            self.visit(ite.true_b),
-            self.visit(ite.false_b))
-
-    def visit_tuple(self, tup):
-        return Tuple([self.visit(field) for field in tup.fields])
-
-    def visit_constant(self, const):
-        return const
+from .expr import Function, GlobalVar, ExprMutator
 
 
 @attr.s
@@ -359,8 +253,8 @@ def visit_let(self, let):
         self.add_binding(ident, val_ref)
         return self.visit(body)
 
-    def visit_var(self, var):
-        return self.lookup(var)
+    def visit_var(self, rvar):
+        return self.lookup(rvar)
 
     def visit_call(self, call):
         """Transform a ::tvm.relay.Call into an operator in the TVM graph."""
@@ -472,80 +366,3 @@ def compute_node_row_ptr(nodes):
         }
 
         return json.dumps(json_dict)
-
-
-def build(env, func, target=None):
-    """
-    Compile a single function to the components needed by the
-    TVM RTS.
-
-    Parameters
-    ----------
-    func: relay.Expr
-        The function to build.
-
-    target: optional str
-        The target platform.
-
-    Returns
-    -------
-    (graph_json, mod, params): tuple of (str, tvm.Module, dict)
-        The outputs of building a Relay function for the TVM runtime.
-
-    """
-    if target is None:
-        target = 'llvm'
-
-    comp = GraphRuntimeCodegen(env)
-    # NB(@jroesch) This creates lowered functions, and generates names for them
-    #
-    # We need these names to emit the correct graph as these are names of the
-    # functions contained in the module.
-    lowered_ops = ir_pass.lower_ops(env, func)
-    mod = tvm_build_module([lf.lowered_func for lf in lowered_ops], target)
-
-    # Therefore the call to compile must come after.
-    comp.codegen(func)
-    graph_json = comp.to_json()
-    return graph_json, mod, None  # params currently isn't supported by API
-
-
-def graph_evaluate(env, func, *args):
-    """
-    Corresponding function to tvm.relay.eval.evaluate.
-
-    This function evaluates a Relay expression on the
-    TVM graph_runtime.
-
-    Parameters
-    ----------
-    env: tvm.relay.Environment
-        The global environment used.
-
-    expr: tvm.relay.Expr
-        The expression to evaluate.
-
-    args: list of tvm.relay.Expr
-        The arguments to apply to the expression, only works
-        if the expression has a function type.
-
-    Returns
-    -------
-    value: tvm.NDArray
-        The output Tensor produced by evaluating the expression.
-    """
-    func = infer_type(func, env)
-    func = ir_pass.fuse_ops(env, func)
-    func = infer_type(func, env)
-    graph_json, mod, params = build(env, func)
-    assert params is None
-    gmodule = graph_runtime.create(graph_json, mod, cpu(0))
-    # Create map of inputs.
-    inputs = {}
-    for i, arg in enumerate(args):
-        inputs[func.params[i].name_hint] = arg
-    # Set the inputs here.
-    gmodule.set_input(**inputs)
-    # Run the module, and fetch the output.
-    gmodule.run()
-    return gmodule.get_output(0)
diff --git a/python/tvm/relay/interpreter.py b/python/tvm/relay/interpreter.py
index 06dc3c79fba4..d95943c130dc 100644
--- a/python/tvm/relay/interpreter.py
+++ b/python/tvm/relay/interpreter.py
@@ -4,12 +4,16 @@
 import numpy as np
 from .. import register_func, nd
 from .base import NodeBase, register_relay_node
+from . import build_module
 from . import _make
 from . import _interpreter
 from . import ir_pass
-from .expr import Call, Constant, GlobalVar
-from . import const
+from .env import Environment
+from .expr import Call, Constant, GlobalVar, Function, const
+from .scope_builder import ScopeBuilder
 from .._ffi.base import integer_types
+from ..contrib import graph_runtime as tvm_runtime
+from .. import cpu
 
 class Value(NodeBase):
     """Base class of all values.
@@ -83,48 +87,122 @@ def _arg_to_ast(arg):
     else:
         return const(arg)
 
+class Executor(object):
+    """An abstract interface for executing Relay programs."""
 
-def apply_passes(expr, env=None):
-    ck_expr = ir_pass.infer_type(expr, env=env)
-    fused_expr = ir_pass.fuse_ops(env, ck_expr)
-    return fused_expr
+    def __init__(self, env=None):
+        """
+        Parameters
+        ----------
+        env: relay.Environment
+            The environment.
+        """
+        if env is None:
+            self.env = Environment({})
+        else:
+            self.env = env
 
 
-def evaluate(env, expr, *args):
-    """
-    Evaluate a Relay expression on the interpreter.
+    def optimize(self, expr):
+        # TODO: We need to move this optimization code into the optimizer/pass manager
+        ck_expr = ir_pass.infer_type(expr, env=self.env)
+        fused_expr = ir_pass.fuse_ops(self.env, ck_expr)
+        ck_fused = ir_pass.infer_type(fused_expr, env=self.env)
+        return ck_fused
+
+    def _make_executor(self, _):
+        """
+        Construct a Python function that implements the evaluation
+        of expression.
+
+        Parameters
+        ----------
+        expr: relay.Expr
+            The Relay expression to execute.
+
+        Returns
+        -------
+        executor: function
+            A Python function which implements the behavior of `expr`.
+        """
+        raise Exception("abstract method: please implement me.")
 
-    Parameters
-    ----------
-    env: tvm.relay.Environment
-        The global environment used.
+    def evaluate(self, expr, params=None):
+        """
+        Evaluate a Relay expression on the interpreter.
+
+        Parameters
+        ----------
+        expr: tvm.relay.Expr
+            The expression to evaluate.
+        """
+        if params:
+            scope_builder = ScopeBuilder()
+            for key, value in params:
+                scope_builder.let(key, value)
+            scope_builder.ret(expr)
+            expr = scope_builder.get()
 
-    expr: tvm.relay.Expr
-        The expression to evaluate.
+        if isinstance(expr, Function):
+            assert not ir_pass.free_vars(expr)
 
-    args: list of tvm.relay.Expr
-        The arguments to apply to the expression, only works
-        if the expression has a function type.
+        return self._make_executor(expr)
 
-    Returns
-    -------
-    value: tvm.relay.eval.Value
-        The value produced by evaluating the expression.
+
+class Interpreter(Executor):
+    """
+    A wrapper around the Relay interpreter, implements the excecutor interface.
     """
-    # assert len(args) == 0
-    relay_args = []
-    for arg in args:
-        relay_args.append(_arg_to_ast(arg))
-
-    # TODO: We need to move this optimization code into the optimizer/pass manager
-    if isinstance(expr, GlobalVar):
-        func = env[expr]
-        func = apply_passes(func, env)
-        env._add(expr, func, True)
-        opt_expr = Call(expr, relay_args)
-        # import pdb; pdb.set_trace()
-        return _interpreter.evaluate(env, opt_expr)
+    def __init__(self, env=None):
+        Executor.__init__(self, env)
+
+    def _make_executor(self, expr):
+        def _interp_wrapper(*args):
+            relay_args = []
+            for arg in args:
+                relay_args.append(_arg_to_ast(arg))
+
+            if isinstance(expr, GlobalVar):
+                func = self.env[expr]
+                func = self.optimize(func)
+                self.env._add(expr, func, True)
+                opt_expr = Call(expr, relay_args)
+                return _interpreter.evaluate(self.env, opt_expr)
+            else:
+                call = Call(expr, relay_args)
+                opt_expr = self.optimize(call)
+                return _interpreter.evaluate(self.env, opt_expr)
+
+        return _interp_wrapper
+
+
+class GraphRuntime(Executor):
+    """A wrapper around the TVM graph runtime, implements the Executor interface."""
+    def __init__(self, env=None):
+        Executor.__init__(self, env)
+
+    def _make_executor(self, expr):
+        def _graph_wrapper(*args):
+            func = self.optimize(expr)
+            graph_json, mod, params = build_module.build(func, env=self.env)
+            assert params is None
+            gmodule = tvm_runtime.create(graph_json, mod, cpu(0))
+            # Create map of inputs.
+            inputs = {}
+            for i, arg in enumerate(args):
+                inputs[func.params[i].name_hint] = arg
+            # Set the inputs here.
+            gmodule.set_input(**inputs)
+            # Run the module, and fetch the output.
+            gmodule.run()
+            return gmodule.get_output(0)
+
+        return _graph_wrapper
+
+def create_executor(mode='debug', env=None):
+    if mode == 'debug':
+        return Interpreter(env)
+    elif mode == 'graph':
+        return GraphRuntime(env)
     else:
-        expr = Call(expr, relay_args)
-        opt_expr = apply_passes(expr, env)
-        return _interpreter.evaluate(env, opt_expr)
+        raise Exception("unknown mode {0}".format(mode))
diff --git a/tests/python/relay/test_graph_runtime.py b/tests/python/relay/test_graph_runtime.py
index 1e55f890e514..38acc5df08d8 100644
--- a/tests/python/relay/test_graph_runtime.py
+++ b/tests/python/relay/test_graph_runtime.py
@@ -1,15 +1,15 @@
 import numpy as np
 
 from tvm import relay
+from tvm.relay import create_executor
 from tvm.relay.ir_pass import infer_type
-from tvm.relay.interpreter import evaluate
-from tvm.relay.graph_runtime_codegen import graph_evaluate
+from tvm.relay.interpreter import Interpreter
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay.op import add
 from tvm.relay.env import Environment
 
 # @tq, @jr should we put this in testing ns?
-def check_rts(env, expr, args, expected_result):
+def check_rts(expr, args, expected_result, env=None):
     """
     Check that evaluating `expr` applied to the arguments produces
     `result` on both the evaluator and TVM runtime.
@@ -25,8 +25,10 @@ def check_rts(env, expr, args, expected_result):
     expected_result:
         The expected result of running the expression.
     """
-    eval_result = evaluate(env, expr, *args)
-    rts_result = graph_evaluate(env, expr, *args)
+    intrp = create_executor('graph', env=env)
+    graph = create_executor('graph', env=env)
+    eval_result = intrp.evaluate(expr)(*args)
+    rts_result = graph.evaluate(expr)(*args)
     np.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
 
 def test_add_op_scalar():
@@ -36,13 +38,12 @@ def test_add_op_scalar():
             return x + y;
         }
     """
-    env = Environment()
     x = relay.var('x', shape=())
     y = relay.var('y', shape=())
     func = relay.Function([x, y], add(x, y))
     x_data = np.array(10.0, dtype='float32')
     y_data = np.array(1.0, dtype='float32')
-    check_rts(env, func, [x_data, y_data], x_data + y_data)
+    check_rts(func, [x_data, y_data], x_data + y_data)
 
 def test_add_op_tensor():
     """
@@ -51,13 +52,12 @@ def test_add_op_tensor():
             return x + y;
         }
     """
-    env = Environment()
     x = relay.var('x', shape=(10, 5))
     y = relay.var('y', shape=(10, 5))
     func = relay.Function([x, y], add(x, y))
     x_data = np.random.rand(10, 5).astype('float32')
     y_data = np.random.rand(10, 5).astype('float32')
-    check_rts(env, func, [x_data, y_data], x_data + y_data)
+    check_rts(func, [x_data, y_data], x_data + y_data)
 
 def test_add_op_broadcast():
     """
@@ -66,13 +66,12 @@ def test_add_op_broadcast():
             return x + y;
         }
     """
-    env = Environment()
     x = relay.var('x', shape=(10, 5))
     y = relay.var('y', shape=(1, 5))
     func = relay.Function([x, y], add(x, y))
     x_data = np.random.rand(10, 5).astype('float32')
     y_data = np.random.rand(1, 5).astype('float32')
-    check_rts(env, func, [x_data, y_data], x_data + y_data)
+    check_rts(func, [x_data, y_data], x_data + y_data)
 
 if __name__ == "__main__":
     test_add_op_scalar()
diff --git a/tests/python/relay/test_interpreter.py b/tests/python/relay/test_interpreter.py
index 9a431b4c9524..f2eaa3d02dec 100644
--- a/tests/python/relay/test_interpreter.py
+++ b/tests/python/relay/test_interpreter.py
@@ -1,17 +1,15 @@
 import numpy as np
 import tvm
 from tvm import relay
-from tvm.relay.interpreter import Value, TupleValue, evaluate
+from tvm.relay.interpreter import Value, TupleValue
 from tvm.relay import op
 from tvm.relay.scope_builder import ScopeBuilder
-from tvm.relay import testing
+from tvm.relay import testing, create_executor
 
 
 def check_eval(expr, args, expected_result, env=None, rtol=1e-07):
-    if env is None:
-        env = relay.env.Environment({})
-
-    result = evaluate(env, expr, *args)
+    intrp = create_executor(env=env)
+    result = intrp.evaluate(expr)(*args)
     np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
 
 
@@ -32,8 +30,6 @@ def test_tuple_value():
 def test_id():
     x = relay.var('x', 'float32')
     ident = relay.Function([x], x)
-    env = relay.env.Environment({})
-    res = evaluate(env, ident, 1.0)
     check_eval(ident, [1.0], 1.0)
 
 

From dff4bb6dfcc8fb99928f1953380aa287339e4f6c Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Fri, 2 Nov 2018 00:25:55 +0800
Subject: [PATCH 323/529] Refine CMakeLists.txt (#2049)

---
 CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66c690314a42..8bfca8020c3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,8 +80,12 @@ if(MSVC)
 else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11"    SUPPORT_CXX11)
-  set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
-  set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    add_compile_options(-Wall -fPIC -std=c++11)
+  else()
+    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+  endif ()
   if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")

From 1baac573559197c9fec6a61fb82782261b93e7dc Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 1 Nov 2018 09:26:54 -0700
Subject: [PATCH 324/529] [TOPI] Fix adding dilation arguments (#2047)

---
 topi/python/topi/arm_cpu/conv2d.py       | 14 ++++++++++----
 topi/python/topi/cuda/conv2d_winograd.py |  5 +++++
 topi/python/topi/mali/conv2d.py          |  2 +-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index c30ad496b24d..cfd423b584cf 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -2,6 +2,8 @@
 """Conv2D schedule for ARM CPU"""
 from __future__ import absolute_import as _abs
 
+import warnings
+
 import numpy as np
 
 import tvm
@@ -522,7 +524,10 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     out_dtype = attrs["out_dtype"]
     out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
 
-    if layout != 'NCHW' or groups != 1 or dilation != (1, 1):
+    if layout != 'NCHW' or groups != 1:
+        return None
+    if dilation != (1, 1):
+        warnings.warn("Does not support weight pre-transform for dilated convolution.")
         return None
 
     data, kernel = tinfos[0:2]
@@ -531,7 +536,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
 
     # query config of this workload
     workload = autotvm.task.args_to_workload(
-        [data, kernel, strides, padding, layout, out_dtype], conv2d)
+        [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
     target = tvm.target.current_target()
     dispatch_ctx = autotvm.DispatchContext.current
     cfg = dispatch_ctx.query(target, workload)
@@ -548,7 +553,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
         new_data = data
         new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, 'NCHW', out_dtype], conv2d)
+            [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
         dispatch_ctx.update(target, new_workload, cfg)
 
         return sym.conv2d(*copy_inputs, **new_attrs)
@@ -574,7 +579,8 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
         new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC),
                                      kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, new_attrs['layout'], out_dtype, tile_size],
+            [new_data, new_weight, strides, padding, dilation,
+             new_attrs['layout'], out_dtype, tile_size],
             conv2d_winograd_without_weight_transform)
         dispatch_ctx.update(target, new_workload, cfg)
 
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index fb30a4f9ad2e..1f2112979ee7 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -375,6 +375,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
             new_attrs['out_layout'] = new_layout
             new_attrs['kernel_layout'] = 'OIHW4o4i'
             ic_block_factor = oc_block_factor = 4
+
+            # Store the same config for the altered operator (workload)
             new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                        dtype=data.dtype)
             new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,\
@@ -387,7 +389,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
             return sym.conv2d(*copy_inputs, **new_attrs)
 
         if attrs.get_int_tuple("dilation") != (1, 1):
+            warnings.warn("Does not support weight pre-transform for dilated convolution.")
             return None
+
         # pre-compute weight transformation in winograd
         tile_size = _infer_tile_size(tinfos[0], tinfos[1])
 
@@ -397,6 +401,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         copy_inputs[1] = weight
         new_attrs['tile_size'] = tile_size
 
+        # Store the same config for the altered operator (workload)
         new_data = data
         new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
                                      dtype=kernel.dtype)
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 7c3b4a23cbc5..1ad58038abb1 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -440,7 +440,7 @@ def _schedule_winograd(cfg, s, op):
 
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd'])
-def conv2d_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
     """TOPI compute callback"""
     return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
                           tile_size)

From 75bbf443e1447dfefed45968ef70035e7370d9f7 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Fri, 2 Nov 2018 01:00:01 +0530
Subject: [PATCH 325/529] [FRONTEND][TENSORFLOW] Enhancements. (#1923)

* [FRONTEND][TENSORFLOW] Enhancements.
	* Generalize the shape with explicite argument.
	* Supported entire range of mobilenet_v2 models.
	* Cast op updated to latest tensorflow.
	* Documentation updates.
	* CheckNumerics op handling without exception.
	* Test data from tensorflow official releases.

* 	* CI error.

* 	* self review

* 	* Enhanced reshape handling.

* 	* docs.

* 	* tutorials

* 	* review comments.

* 	* review.
---
 docs/frontend/tensorflow.md                   | 36 +++++++++
 nnvm/python/nnvm/frontend/tensorflow.py       | 81 ++++++++++++-------
 nnvm/python/nnvm/testing/tf.py                | 79 +++++++++++++-----
 .../frontend/tensorflow/test_forward.py       | 13 ++-
 tutorials/nnvm/from_tensorflow.py             | 12 ++-
 5 files changed, 169 insertions(+), 52 deletions(-)
 create mode 100644 docs/frontend/tensorflow.md

diff --git a/docs/frontend/tensorflow.md b/docs/frontend/tensorflow.md
new file mode 100644
index 000000000000..acafbb5bb93e
--- /dev/null
+++ b/docs/frontend/tensorflow.md
@@ -0,0 +1,36 @@
+# Tensorflow Frontend
+Tensorflow frontend helps in importing tensorflow released model into TVM.
+
+This document helps few steps while importing various different models from
+[tensorflow research/slim](https://github.com/tensorflow/models/tree/master/research/slim).
+
+Current frontend is tested with all versions of below models
+- Inception (V1/V2/V3/V4)
+- Resnet (All)
+- Mobilenet (V1/V2 All)
+- Vgg (16/19)
+
+Tensorflow frontend expects a freezed protobuf format as input.
+
+Not all models are released as freezed protobuf. Some of them are checkpoints (.ckpt).
+Please refer to [export](https://github.com/tensorflow/models/tree/master/research/slim#exporting-the-inference-graph) 
+and [freeze](https://github.com/tensorflow/models/tree/master/research/slim#freezing-the-exported-graph) 
+instructions to generate protobuf from checkpoint.
+
+## General Instructions
+
+### Add Shapes:
+While freezing of protobuf add additional option ```add_shapes=True``` to embed output shapes of each node into graph.
+You may use ```nnvm.testing.tf.AddShapesToGraphDef``` from nnvm for the same.
+Please refer to [tensorflow tutorial](https://github.com/dmlc/tvm/blob/master/tutorials/nnvm/from_tensorflow.py).
+
+### Explicit Shape:
+There might be situations where the add_shapes=True may not provide sufficient information about shape.
+You may pass explicit dictionary of input shapes argument for ```from_tensorflow```.
+Please refer to [test cases](https://github.com/dmlc/tvm/blob/master/nnvm/tests/python/frontend/tensorflow/test_forward.py#L36).
+
+### GPU:
+Most of these tensorflow models are released for CPU with NHWC layout.
+To compile for GPU we need to pass extra argument ```layout='NCHW'``` for from_tensorflow.
+This option will do a layout conversion before and after for neural network ops.
+Remaining nnvm build options for GPU compilation remain as it is.
diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 9cd07cca3cc6..e7282eb9afd6 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -9,7 +9,7 @@
 import tvm
 from .. import symbol as _sym
 from .. import graph as _graph
-from .. compiler import graph_util
+from .. compiler import graph_util, build_module
 from .common import get_nnvm_op, AttrConverter as AttrConvert
 
 __all__ = ['from_tensorflow']
@@ -380,7 +380,7 @@ def _pack():
     def _impl(inputs, attr, params):
         axis = int(attr["axis"])
         inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs]
-        return _sym.concatenate(*inputs_reshaped, axis=axis)
+        return _sym.concatenate(*inputs_reshaped, axis=axis, name=attr["_node_name"])
 
     return _impl
 
@@ -396,9 +396,19 @@ def _impl(inputs, attr, params):
                 extras={'shape':tuple(shape_arg.asnumpy())},
                 ignores=['Tshape'])(inputs, attr)
         except KeyError:
-            return AttrCvt(
-                op_name="reshape_like",
-                ignores=['Tshape'])(inputs, attr)
+            # Shape operator is already pruned, hence
+            # try to infer shape by precompute prune if possible.
+            if all(in_node in params for in_node in inputs[1].list_input_names()):
+                graph = _graph.create(_sym.Group(inputs[1]))
+                params_pre = {k: params[k] for k in inputs[1].list_input_names()}
+                params_new = build_module._run_graph(graph, params_pre)
+                inputs.pop(1)
+                return AttrCvt(
+                    op_name="reshape",
+                    extras={'shape':tuple(params_new[0].asnumpy().flatten())},
+                    ignores=['Tshape'])(inputs, attr)
+            else:
+                raise RuntimeError("Reshape with dynamic shape input not supported yet.")
     return _impl
 
 def _bias_add():
@@ -470,9 +480,7 @@ def _impl(inputs, attr, params):
 
 def _shape():
     def _impl(inputs, attr, params):
-        # Result of this operator is prominently used by reshape operator.
-        # Just pass the input as it is so that reshape_like can be used there.
-        return inputs[0]
+        return np.array(attr['_input_shapes'][inputs[0]][0], dtype='int32')
     return _impl
 
 def _fill():
@@ -1031,28 +1039,33 @@ def __init__(self):
         self._num_param = 0
         self._num_rnn_layer = False
 
-    def from_tensorflow(self, graph, layout="NHWC"):
+    def from_tensorflow(self, graph, layout="NHWC", shape=None):
         """Construct nnvm nodes from tensorflow  graph definition - GraphDef.
 
         Follow the tensorflow graph definition to parse and convert it to NNVM.
         Some of the assumptions listed below.
 
-            -> First Placeholder or Const node will be considered as graph input.
-            -> Rest all Const nodes are params.
+            -> All Placeholders are considered as graph input.
+            -> All Const nodes are params.
             -> Last node is assumed as graph output.
-            -> _output_shapes : Attribute should present in the tenserflow forzen graph.
+            -> _output_shapes : Graph should be frozen with add_shapes=True.
+                                Or user can pass input shape dictionaly optionally.
             -> DecodeJpeg, ResizeBilinear: These are dummy operators.
                                            Hence user should handle preprocessing outside.
             -> CheckNumerics: No implementation as of now for this.
                               Just copies input to output.
 
-        TODO: Change algorithm to stop treating first 'Const' in a special way.
-
         Parameters
         ----------
         graph : tensorflow graph definition object
             The loaded tensorflow GraphDef
 
+        layout : target layout to be used (Optional)
+            NCHW only supported now to enable NHWC models on GPU.
+
+        shape : Dictionary of input dimensions (Optional)
+            Graph level input shape dictionary.
+
         Returns
         -------
         sym : nnvm.sym.Symbol
@@ -1079,7 +1092,6 @@ def from_tensorflow(self, graph, layout="NHWC"):
             # Operator name 'Const' is treated as a parameter to build NNVM params dict.
 
             input_shapes = {}
-
             attr = self._parse_attr(node.attr)
 
             #Variable converted to Const will not have only value attr
@@ -1092,6 +1104,10 @@ def from_tensorflow(self, graph, layout="NHWC"):
                 self._output_shapes[node.name] = \
                     [tensor_util.TensorShapeProtoToList(shape) \
                     for shape in attr['_output_shapes']]
+            elif shape:
+                # Keep the list indexable to avoid key error.
+                # Actual value will be filled after node creation.
+                self._output_shapes[node.name] = [None]
             else:
                 raise NotImplementedError( \
                     "Please freeze the graph with add_shapes=True")
@@ -1100,7 +1116,6 @@ def from_tensorflow(self, graph, layout="NHWC"):
                 self._nodes[node.name] = _sym.Variable(name=node.name,
                                                        shape=self._output_shapes[node.name][0])
 
-                #input_shapes[self._nodes[node.name]] = self._output_shapes[node.name]
             elif node.op == "Const":
                 # All Const nodes are Param nodes, lets parse
                 self._num_param += 1
@@ -1132,21 +1147,33 @@ def from_tensorflow(self, graph, layout="NHWC"):
                     node.input[0] = in_name
 
                 # Fill shapes for all inputs in a list
-                try:
-                    inputs = [self._nodes[i] for i in node.input]
-                    for i in node.input:
+                inputs = []
+                for i in node.input:
+                    if i in self._nodes:
+                        inputs.append(self._nodes[i])
                         input_shapes[self._nodes[i]] = self._output_shapes[i]
-                    attr['_input_shapes'] = input_shapes
-                except KeyError:
-                    # TODO: Need to find clean way to handle '^CheckNumerics'
-                    pass
+                attr['_input_shapes'] = input_shapes
 
                 inputs = self._fix_extranodes(node.op, attr, inputs)
-
                 op = self._convert_operator(node.op, inputs, attr, graph)
+
+                # Check is op is converted to param
+                if isinstance(op, np.ndarray):
+                    self._params[node.name] = tvm.nd.array(op)
+                    op = _sym.Variable(name=node.name,
+                                       shape=self._params[node.name].shape)
+
                 # Assuming only one output.
                 self._nodes[node.name] = op
-                node_output = op
+
+            # Infer shapes if passed explicitely
+            node_output = self._nodes[node.name]
+            if shape:
+                g = _graph.create(node_output)
+                shape_dict = {k: v.shape for k, v in self._params.items()}
+                shape_dict.update(shape)
+                _, out_shapes = graph_util.infer_shape(g, **shape_dict)
+                self._output_shapes[node.name] = out_shapes
 
         # Assume the final node is the output node
         out = node_output
@@ -1351,7 +1378,7 @@ def _fix_extranodes(self, op_name, attr, inputs):
 
         return inputs
 
-def from_tensorflow(graph, layout="NHWC"):
+def from_tensorflow(graph, layout="NHWC", shape=None):
     """  Load tensorflow graph which is a python tensorflow graph object into nnvm graph.
     The companion parameters will be handled automatically.
 
@@ -1369,5 +1396,5 @@ def from_tensorflow(graph, layout="NHWC"):
         Dict of converted parameters stored in tvm.ndarray format
     """
     g = GraphProto()
-    sym, params = g.from_tensorflow(graph, layout)
+    sym, params = g.from_tensorflow(graph, layout, shape)
     return sym, params
diff --git a/nnvm/python/nnvm/testing/tf.py b/nnvm/python/nnvm/testing/tf.py
index d89ac497a46f..effe19808a59 100644
--- a/nnvm/python/nnvm/testing/tf.py
+++ b/nnvm/python/nnvm/testing/tf.py
@@ -46,13 +46,15 @@ def ProcessGraphDefParam(graph_def):
     return graph_def
 
 
-def AddShapesToGraphDef(out_node):
+def AddShapesToGraphDef(session, out_node):
     """ Add shapes attribute to nodes of the graph.
         Input graph here is the default graph in context.
 
     Parameters
     ----------
-    out_node: String
+    session : tf.Session
+        Tensorflow session
+    out_node : String
         Final output node of the graph.
 
     Returns
@@ -62,13 +64,12 @@ def AddShapesToGraphDef(out_node):
 
     """
 
-    with tf.Session() as sess:
-        graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            [out_node],
-            )
-        return graph_def
+    graph_def = tf.graph_util.convert_variables_to_constants(
+        session,
+        session.graph.as_graph_def(add_shapes=True),
+        [out_node],
+        )
+    return graph_def
 
 class NodeLookup(object):
     """Converts integer node ID's to human readable labels."""
@@ -135,13 +136,19 @@ def id_to_string(self, node_id):
             return ''
         return self.node_lookup[node_id]
 
-def get_workload(model_path):
-    """ Import workload from frozen protobuf
+def get_workload_official(model_url, model_sub_path, temp_dir):
+    """ Import workload from tensorflow official
 
     Parameters
     ----------
-    model_path: str
-        model_path on remote repository to download from.
+    model_url: str
+        URL from where it will be downloaded.
+
+    model_sub_path:
+        Sub path in extracted tar for the ftozen protobuf file.
+
+    temp_dir: TempDirectory
+        The temporary directory object to download the content.
 
     Returns
     -------
@@ -150,16 +157,52 @@ def get_workload(model_path):
 
     """
 
-    repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/'
-    model_name = os.path.basename(model_path)
-    model_url = os.path.join(repo_base, model_path)
+    model_tar_name = os.path.basename(model_url)
 
     from mxnet.gluon.utils import download
+    temp_path = temp_dir.relpath("./")
+    path_model = temp_path + model_tar_name
+
+    download(model_url, path_model)
+
+    import tarfile
+    if path_model.endswith("tgz") or path_model.endswith("gz"):
+        tar = tarfile.open(path_model)
+        tar.extractall(path=temp_path)
+        tar.close()
+    else:
+        raise RuntimeError('Could not decompress the file: ' + path_model)
+    return temp_path + model_sub_path
+
+def get_workload(model_path, model_sub_path=None):
+    """ Import workload from frozen protobuf
+
+    Parameters
+    ----------
+    model_path: str
+        model_path on remote repository to download from.
+
+    model_sub_path: str
+        Model path in the compressed archive.
+
+    Returns
+    -------
+    graph_def: graphdef
+        graph_def is the tensorflow workload for mobilenet.
+
+    """
 
     temp = util.tempdir()
-    path_model = temp.relpath(model_name)
+    if model_sub_path:
+        path_model = get_workload_official(model_path, model_sub_path, temp)
+    else:
+        repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/'
+        model_name = os.path.basename(model_path)
+        model_url = os.path.join(repo_base, model_path)
 
-    download(model_url, path_model)
+        from mxnet.gluon.utils import download
+        path_model = temp.relpath(model_name)
+        download(model_url, path_model)
 
     # Creates graph from saved graph_def.pb.
     with tf.gfile.FastGFile(path_model, 'rb') as f:
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 2ebc7b671ba5..62d3577ba10a 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -32,9 +32,8 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'
     layout = None
     if target == "cuda":
         layout = "NCHW"
-
-    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout)
     target_host = 'llvm'
+
     if isinstance(input_data, list):
         shape_dict = {}
         dtype_dict = {}
@@ -45,6 +44,7 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'
         shape_dict = {input_node: input_data.shape}
         dtype_dict = {input_node: input_data.dtype}
 
+    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict)
     graph, lib, params = nnvm.compiler.build(sym, target=target, target_host=target_host, shape=shape_dict,
                                              dtype=dtype_dict, params=params)
 
@@ -696,15 +696,20 @@ def test_forward_inception_v1():
 # ---------
 def test_forward_mobilenet():
     '''test mobilenet model'''
+    # MobilenetV2
     with tf.Graph().as_default():
-        graph_def = nnvm.testing.tf.get_workload("MobilenetV1/mobilenet_v1_1.0_224_frozen-with-shapes.pb")
+        graph_def = nnvm.testing.tf.get_workload(
+            "https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz",
+            "mobilenet_v2_1.4_224_frozen.pb")
         # Call the utility to import the graph definition into default graph.
         graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
 
         data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32')
-        out_node = 'MobilenetV1/Predictions/Reshape_1'
+        out_node = 'MobilenetV2/Predictions/Reshape_1'
 
         with tf.Session() as sess:
+            # Add shapes to the graph.
+            graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, out_node)
             tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0')
             tvm_output = run_tvm_graph(graph_def, data, 'input')
             tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
diff --git a/tutorials/nnvm/from_tensorflow.py b/tutorials/nnvm/from_tensorflow.py
index 7cd7e784e7c4..92c287e4ade7 100644
--- a/tutorials/nnvm/from_tensorflow.py
+++ b/tutorials/nnvm/from_tensorflow.py
@@ -32,13 +32,18 @@
 img_name = 'elephant-299.jpg'
 image_url = os.path.join(repo_base, img_name)
 
-# InceptionV1 model protobuf
+######################################################################
+# Tutorials
+# ---------
 # .. note::
 #
 #   protobuf should be exported with :any:`add_shapes=True` option.
 #   Could use https://github.com/dmlc/web-data/tree/master/tensorflow/scripts/tf-to-nnvm.py
 #   to add shapes for existing models.
 #
+# Please refer docs/frontend/tensorflow.md for more details for various models
+# from tensorflow.
+
 model_name = 'classify_image_graph_def-with_shapes.pb'
 model_url = os.path.join(repo_base, model_name)
 
@@ -84,14 +89,15 @@
     # Call the utility to import the graph definition into default graph.
     graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
     # Add shapes to the graph.
-    graph_def = nnvm.testing.tf.AddShapesToGraphDef('softmax')
+    with tf.Session() as sess:
+        graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, 'softmax')
 
 ######################################################################
 # Decode image
 # ------------
 # .. note::
 #
-#   tensorflow frontend import doesn't support preprocessing ops like JpegDecode
+#   tensorflow frontend import doesn't support preprocessing ops like JpegDecode.
 #   JpegDecode is bypassed (just return source node).
 #   Hence we supply decoded frame to TVM instead.
 #

From 657ec0c2302f92627450a67b8e90d82039abe126 Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 1 Nov 2018 20:28:03 -0700
Subject: [PATCH 326/529] [NNVM][OP] Allow two input tensors with different
 type in reshape_like op  (#2052)

---
 nnvm/python/nnvm/frontend/mxnet.py |  3 ++-
 nnvm/src/top/tensor/transform.cc   | 11 ++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index bf55af2a36f0..1be76c46fe82 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -290,7 +290,8 @@ def _zeros(_, attrs):
                   'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp',
                   'flatten', 'log', 'log_softmax', 'max', 'min', 'negative',
                   'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax',
-                  'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd']
+                  'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd',
+                  'reshape_like']
 
 _convert_map = {
     '_copy'         : _rename('copy'),
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index f643f8891728..4d08bf761326 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -631,6 +631,15 @@ The significance of each is explained below:
 })
 .set_support_level(3);
 
+inline bool ReshapeLikeInferType(const NodeAttrs &attrs,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, (*in_attrs)[0]);
+  return true;
+}
+
 NNVM_REGISTER_OP(reshape_like)
   .describe(R"code(Reshapes the input array by the size of another array.
 For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
@@ -651,7 +660,7 @@ the input array into an output array with the same shape as the second input arr
     NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, in_attrs->at(1));
     return true;
 })
-.set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)
+.set_attr<FInferType>("FInferType", ReshapeLikeInferType)
 // never transform layout of the second input array.
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_attr<FGradient>(

From e23116f42e251a7bd23727a4240a7f4c3d51c709 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Fri, 2 Nov 2018 14:27:34 -0700
Subject: [PATCH 327/529] Rename relay::Environment to relay::Module (#2054)

---
 include/tvm/relay/base.h                      |  2 +-
 include/tvm/relay/build_module.h              |  6 +-
 include/tvm/relay/expr.h                      |  2 +-
 include/tvm/relay/interpreter.h               |  6 +-
 include/tvm/relay/{environment.h => module.h} | 40 ++++-----
 include/tvm/relay/pass.h                      | 18 ++--
 include/tvm/relay/type.h                      |  4 +-
 python/tvm/relay/__init__.py                  |  4 +-
 python/tvm/relay/_ir_pass.pyi                 |  8 +-
 python/tvm/relay/{_env.py => _module.py}      |  4 +-
 python/tvm/relay/{_env.pyi => _module.pyi}    |  2 +-
 python/tvm/relay/build_module.py              | 12 +--
 python/tvm/relay/expr.py                      |  2 +-
 python/tvm/relay/interpreter.py               | 44 +++++-----
 python/tvm/relay/ir_pass.py                   | 28 +++----
 python/tvm/relay/{env.py => module.py}        | 32 ++++----
 src/relay/interpreter.cc                      | 32 ++++----
 src/relay/ir/{environment.cc => module.cc}    | 82 +++++++++----------
 src/relay/ir/text_printer.cc                  | 10 +--
 src/relay/pass/fuse_ops.cc                    | 10 +--
 src/relay/pass/kind_check.cc                  |  4 +-
 src/relay/pass/lower_ops.cc                   | 30 +++----
 src/relay/pass/type_infer.cc                  | 22 ++---
 tests/cpp/relay_pass_type_infer_test.cc       |  2 +-
 tests/python/relay/test_graph_runtime.py      |  8 +-
 tests/python/relay/test_interpreter.py        | 16 ++--
 tests/python/relay/test_ir_text_printer.py    |  2 +-
 tests/python/relay/test_type_infer.py         | 16 ++--
 28 files changed, 224 insertions(+), 224 deletions(-)
 rename include/tvm/relay/{environment.h => module.h} (76%)
 rename python/tvm/relay/{_env.py => _module.py} (56%)
 rename python/tvm/relay/{_env.pyi => _module.pyi} (84%)
 rename python/tvm/relay/{env.py => module.py} (72%)
 rename src/relay/ir/{environment.cc => module.cc} (55%)

diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index b7621e20cf6a..49e276b07c59 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -165,7 +165,7 @@ class RelayNode : public Node {
   TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
 };
 
-struct Environment;
+struct Module;
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/include/tvm/relay/build_module.h b/include/tvm/relay/build_module.h
index ed889eba0bd0..35402d655507 100644
--- a/include/tvm/relay/build_module.h
+++ b/include/tvm/relay/build_module.h
@@ -8,7 +8,7 @@
 #define TVM_RELAY_BUILD_MODULE_H_
 
 #include <tvm/lowered_func.h>
-#include <tvm/relay/environment.h>
+#include <tvm/relay/module.h>
 #include <tvm/relay/expr.h>
 #include <string>
 
@@ -61,13 +61,13 @@ RELAY_DEFINE_NODE_REF(LoweredOp, LoweredOpNode, NodeRef);
  * \note This will do a reachability analysis and lower all definitions
  * reachable from the provided expression.
  *
- * \param env  The environment.
+ * \param mod  The module.
  * \param expr The expression with operations to be lowered.
  * \param target The target to lower the functions to.
  *
  * \return The set of lowered operations.
  */
-Array<LoweredOp> LowerOps(const Environment& env, const Expr& expr,
+Array<LoweredOp> LowerOps(const Module& mod, const Expr& expr,
                           const std::string& target = "llvm");
 
 }  // namespace relay
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 029470c067ce..1a5470489ce2 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -160,7 +160,7 @@ class VarNode : public ExprNode {
 RELAY_DEFINE_NODE_REF(Var, VarNode, Expr);
 
 /*!
- * \brief Global variable that leaves in the top-level environment.
+ * \brief Global variable that leaves in the top-level module.
  * This is used to enable recursive calls between function.
  *
  * \note A GlobalVar may only point to functions.
diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h
index 1c382faaef04..403dd50ad778 100644
--- a/include/tvm/relay/interpreter.h
+++ b/include/tvm/relay/interpreter.h
@@ -4,7 +4,7 @@
  * \brief An interpreter for Relay.
  *
  * This file implements a simple reference interpreter for Relay programs.
- * Given a Relay environment, and a Relay expression it produces a value.
+ * Given a Relay module, and a Relay expression it produces a value.
  *
  * The interpreter's values are a naive representation of the values that
  * can be produced by a Relay program and are exposed via tvm::Node's
@@ -16,7 +16,7 @@
 #ifndef TVM_RELAY_INTERPRETER_H_
 #define TVM_RELAY_INTERPRETER_H_
 
-#include <tvm/relay/environment.h>
+#include <tvm/relay/module.h>
 #include <tvm/relay/expr.h>
 
 namespace tvm {
@@ -39,7 +39,7 @@ class Value;
  * Our intent is that this will never be the most efficient implementation of
  * Relay's semantics, but a readable and clear one.
  */
-Value Evaluate(Environment env, Expr e);
+Value Evaluate(Module mod, Expr e);
 
 /*! \brief The base container type of Relay values. */
 class ValueNode : public RelayNode {
diff --git a/include/tvm/relay/environment.h b/include/tvm/relay/module.h
similarity index 76%
rename from include/tvm/relay/environment.h
rename to include/tvm/relay/module.h
index 2ed389571ad6..b04d6fec20c5 100644
--- a/include/tvm/relay/environment.h
+++ b/include/tvm/relay/module.h
@@ -1,11 +1,11 @@
 /*!
  *  Copyright (c) 2018 by Contributors
- * \file tvm/relay/environment.h
+ * \file tvm/relay/module.h
  * \brief The global environment: contains information needed to
  * compile & optimize Relay programs.
  */
-#ifndef TVM_RELAY_ENVIRONMENT_H_
-#define TVM_RELAY_ENVIRONMENT_H_
+#ifndef TVM_RELAY_MODULE_H_
+#define TVM_RELAY_MODULE_H_
 
 #include <tvm/relay/error.h>
 #include <tvm/relay/expr.h>
@@ -17,7 +17,7 @@
 namespace tvm {
 namespace relay {
 
-struct Environment;
+struct Module;
 
 /*! \brief The global environment of Relay programs.
  *
@@ -28,29 +28,29 @@ struct Environment;
  *  options.
  *
  *  Many operations require access to the global
- *  Environment. We pass the Environment by value
+ *  Module. We pass the Module by value
  *  in a functional style as an explicit argument,
- *  but we mutate the Environment while optimizing
+ *  but we mutate the Module while optimizing
  *  Relay programs.
  *
  *  The functional style allows users to construct custom
  *  environments easily, for example each thread can store
- *  an Environment while auto-tuning.
+ *  an Module while auto-tuning.
  * */
 
-class EnvironmentNode : public RelayNode {
+class ModuleNode : public RelayNode {
  public:
   /*! \brief A map from ids to all global functions. */
   tvm::Map<GlobalVar, Function> functions;
 
-  EnvironmentNode() {}
+  ModuleNode() {}
 
   void VisitAttrs(tvm::AttrVisitor* v) final {
     v->Visit("functions", &functions);
     v->Visit("global_var_map_", &global_var_map_);
   }
 
-  TVM_DLL static Environment make(tvm::Map<GlobalVar, Function> global_funcs);
+  TVM_DLL static Module make(tvm::Map<GlobalVar, Function> global_funcs);
 
   /*!
    * \brief Add a function to the global environment.
@@ -100,10 +100,10 @@ class EnvironmentNode : public RelayNode {
    *        functions in another environment.
    * \param other The other environment.
    */
-  void Update(const Environment& other);
+  void Update(const Module& other);
 
-  static constexpr const char* _type_key = "relay.Environment";
-  TVM_DECLARE_NODE_TYPE_INFO(EnvironmentNode, Node);
+  static constexpr const char* _type_key = "relay.Module";
+  TVM_DECLARE_NODE_TYPE_INFO(ModuleNode, Node);
 
  private:
   /*! \brief A map from string names to global variables that
@@ -112,18 +112,18 @@ class EnvironmentNode : public RelayNode {
   tvm::Map<std::string, GlobalVar> global_var_map_;
 };
 
-struct Environment : public NodeRef {
-  Environment() {}
-  explicit Environment(NodePtr<tvm::Node> p) : NodeRef(p) {}
+struct Module : public NodeRef {
+  Module() {}
+  explicit Module(NodePtr<tvm::Node> p) : NodeRef(p) {}
 
-  inline EnvironmentNode* operator->() const {
-    return static_cast<EnvironmentNode*>(node_.get());
+  inline ModuleNode* operator->() const {
+    return static_cast<ModuleNode*>(node_.get());
   }
 
-  using ContainerType = EnvironmentNode;
+  using ContainerType = ModuleNode;
 };
 
 }  // namespace relay
 }  // namespace tvm
 
-#endif  // TVM_RELAY_ENVIRONMENT_H_
+#endif  // TVM_RELAY_MODULE_H_
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index b29678106d21..5ff60c7035d3 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -6,7 +6,7 @@
 #ifndef TVM_RELAY_PASS_H_
 #define TVM_RELAY_PASS_H_
 
-#include <tvm/relay/environment.h>
+#include <tvm/relay/module.h>
 #include <tvm/relay/expr.h>
 #include <string>
 
@@ -21,23 +21,23 @@ namespace relay {
  * populated with the result type.
  *
  * \param expr The expression to type check.
- * \param env The environment used for referencing global functions, can be
+ * \param mod The module used for referencing global functions, can be
  * None.
  *
  * \return A type checked expression with its checked_type field populated.
  */
-Expr InferType(const Expr& expr, const Environment& env);
+Expr InferType(const Expr& expr, const Module& mod);
 /*!
- * \brief Infer the type of a function as if it is mapped to var in the env.
+ * \brief Infer the type of a function as if it is mapped to var in the mod.
  *
  * \param f the function.
- * \param env The environment used for referencing global functions.
+ * \param mod The module used for referencing global functions.
  * \param var The global variable corresponding to the function.
  *
  * \return A type checked Function with its checked_type field populated.
- * \note this function mutates env and is not thread-safe.
+ * \note this function mutates mod and is not thread-safe.
  */
-Function InferType(const Function& f, const Environment& env,
+Function InferType(const Function& f, const Module& mod,
                    const GlobalVar& var);
 
 /*!
@@ -52,11 +52,11 @@ Function InferType(const Function& f, const Environment& env,
  * a data type such as `int`, `float`, `uint`.
  *
  * \param t The type to check.
- * \param env The global environment.
+ * \param mod The global module.
  *
  * \return true if the rules are satisified otherwise false
  */
-bool KindCheck(const Type& t, const Environment& env);
+bool KindCheck(const Type& t, const Module& mod);
 
 /*! \brief Compare two expressions for structural equivalence.
  *
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
index c8ccb6035ae9..69a8a4fb0bd7 100644
--- a/include/tvm/relay/type.h
+++ b/include/tvm/relay/type.h
@@ -349,14 +349,14 @@ class TypeRelation;
 /*!
  * \brief TypeRelation container.
  * \note This node is not directly serializable.
- * The type function need to be lookedup in the environment.
+ * The type function need to be lookedup in the module.
  */
 class TypeRelationNode : public TypeConstraintNode {
  public:
   /*!
    * \brief The function on input and output variables which
    *  this is not directly serializable,
-   *  need to be looked-up in the environment.
+   *  need to be looked-up in the module.
    */
   TypeRelationFn func;
   /*! \brief The type arguments to the type function. */
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index b0a1fcec509e..f474eb449c0c 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -5,7 +5,7 @@
 from . import base
 from . import ty
 from . import expr
-from . import env
+from . import module
 from . import ir_pass
 from .build_module import build
 from .interpreter import create_executor
@@ -26,7 +26,7 @@
 Span = base.Span
 
 # Env
-Environment = env.Environment
+Module = module.Module
 
 # Type
 Type = ty.Type
diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi
index f1432803e9e2..6bf4e2dac871 100644
--- a/python/tvm/relay/_ir_pass.pyi
+++ b/python/tvm/relay/_ir_pass.pyi
@@ -1,8 +1,8 @@
-from .env import Environment
+from .env import Module
 from . import ir
 
-def check_expr(env: Environment, expr: ir.Expr) -> ir.Type: ...
-def generalize(env: Environment, expr: ir.Expr) -> ir.Expr: ...
+def check_expr(env: Module, expr: ir.Expr) -> ir.Type: ...
+def generalize(env: Module, expr: ir.Expr) -> ir.Expr: ...
 def _get_checked_type(expr: ir.Expr) -> ir.Type: ...
 def well_formed(expr: ir.Expr) -> bool: ...
-def dead_code_elimination(expr: ir.Expr) -> ir.Expr: ...
\ No newline at end of file
+def dead_code_elimination(expr: ir.Expr) -> ir.Expr: ...
diff --git a/python/tvm/relay/_env.py b/python/tvm/relay/_module.py
similarity index 56%
rename from python/tvm/relay/_env.py
rename to python/tvm/relay/_module.py
index 25b8715a7816..b6e74c451915 100644
--- a/python/tvm/relay/_env.py
+++ b/python/tvm/relay/_module.py
@@ -1,5 +1,5 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
-"""The interface to the Environment exposed from C++."""
+"""The interface to the Module exposed from C++."""
 from tvm._ffi.function import _init_api
 
-_init_api("relay._env", __name__)
+_init_api("relay._module", __name__)
diff --git a/python/tvm/relay/_env.pyi b/python/tvm/relay/_module.pyi
similarity index 84%
rename from python/tvm/relay/_env.pyi
rename to python/tvm/relay/_module.pyi
index c6b5d0f6c4bd..de3aabefba4c 100644
--- a/python/tvm/relay/_env.pyi
+++ b/python/tvm/relay/_module.pyi
@@ -2,4 +2,4 @@ from typing import Union, Tuple, Dict, List
 from relay.ir import GlobalId, OperatorId, Item, NodeBase, Span, FileId
 from relay.ir import ShapeExtension, Operator, Defn
 
-class Environment(NodeBase): ...
\ No newline at end of file
+class Module(NodeBase): ...
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 6b60fd3f17fe..e71571e63a99 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -5,9 +5,9 @@
 from ..build_module import build as tvm_build_module
 from . graph_runtime_codegen import GraphRuntimeCodegen
 from . import ir_pass
-from .env import Environment
+from .module import Module
 
-def build(func, params=None, target=None, env=None):
+def build(func, params=None, target=None, mod=None):
     """
     Compile a single function to the components needed by the
     TVM RTS.
@@ -29,15 +29,15 @@ def build(func, params=None, target=None, env=None):
     if target is None:
         target = 'llvm'
 
-    if env is None:
-        env = Environment({})
+    if mod is None:
+        mod = Module({})
 
-    comp = GraphRuntimeCodegen(env)
+    comp = GraphRuntimeCodegen(mod)
     # NB(@jroesch) This creates lowered functions, and generates names for them
     #
     # We need these names to emit the correct graph as these are names of the
     # functions contained in the module.
-    lowered_ops = ir_pass.lower_ops(env, func)
+    lowered_ops = ir_pass.lower_ops(mod, func)
     mod = tvm_build_module([lf.lowered_func for lf in lowered_ops], target)
 
     # Therefore the call to compile must come after.
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index dd9477aa9580..d789f281d25a 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -172,7 +172,7 @@ class GlobalVar(Expr):
     """A global variable in Tvm.Relay.
 
     GlobalVar is used to refer to the global functions
-    stored in the environment.
+    stored in the module.
 
     Parameters
     ----------
diff --git a/python/tvm/relay/interpreter.py b/python/tvm/relay/interpreter.py
index d95943c130dc..4dfe3e02989e 100644
--- a/python/tvm/relay/interpreter.py
+++ b/python/tvm/relay/interpreter.py
@@ -8,7 +8,7 @@
 from . import _make
 from . import _interpreter
 from . import ir_pass
-from .env import Environment
+from .module import Module
 from .expr import Call, Constant, GlobalVar, Function, const
 from .scope_builder import ScopeBuilder
 from .._ffi.base import integer_types
@@ -90,24 +90,24 @@ def _arg_to_ast(arg):
 class Executor(object):
     """An abstract interface for executing Relay programs."""
 
-    def __init__(self, env=None):
+    def __init__(self, mod=None):
         """
         Parameters
         ----------
-        env: relay.Environment
-            The environment.
+        mod: relay.Module
+            The module.
         """
-        if env is None:
-            self.env = Environment({})
+        if mod is None:
+            self.mod = Module({})
         else:
-            self.env = env
+            self.mod = mod
 
 
     def optimize(self, expr):
         # TODO: We need to move this optimization code into the optimizer/pass manager
-        ck_expr = ir_pass.infer_type(expr, env=self.env)
-        fused_expr = ir_pass.fuse_ops(self.env, ck_expr)
-        ck_fused = ir_pass.infer_type(fused_expr, env=self.env)
+        ck_expr = ir_pass.infer_type(expr, mod=self.mod)
+        fused_expr = ir_pass.fuse_ops(self.mod, ck_expr)
+        ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod)
         return ck_fused
 
     def _make_executor(self, _):
@@ -153,8 +153,8 @@ class Interpreter(Executor):
     """
     A wrapper around the Relay interpreter, implements the excecutor interface.
     """
-    def __init__(self, env=None):
-        Executor.__init__(self, env)
+    def __init__(self, mod=None):
+        Executor.__init__(self, mod)
 
     def _make_executor(self, expr):
         def _interp_wrapper(*args):
@@ -163,28 +163,28 @@ def _interp_wrapper(*args):
                 relay_args.append(_arg_to_ast(arg))
 
             if isinstance(expr, GlobalVar):
-                func = self.env[expr]
+                func = self.mod[expr]
                 func = self.optimize(func)
-                self.env._add(expr, func, True)
+                self.mod._add(expr, func, True)
                 opt_expr = Call(expr, relay_args)
-                return _interpreter.evaluate(self.env, opt_expr)
+                return _interpreter.evaluate(self.mod, opt_expr)
             else:
                 call = Call(expr, relay_args)
                 opt_expr = self.optimize(call)
-                return _interpreter.evaluate(self.env, opt_expr)
+                return _interpreter.evaluate(self.mod, opt_expr)
 
         return _interp_wrapper
 
 
 class GraphRuntime(Executor):
     """A wrapper around the TVM graph runtime, implements the Executor interface."""
-    def __init__(self, env=None):
-        Executor.__init__(self, env)
+    def __init__(self, mod=None):
+        Executor.__init__(self, mod)
 
     def _make_executor(self, expr):
         def _graph_wrapper(*args):
             func = self.optimize(expr)
-            graph_json, mod, params = build_module.build(func, env=self.env)
+            graph_json, mod, params = build_module.build(func, mod=self.mod)
             assert params is None
             gmodule = tvm_runtime.create(graph_json, mod, cpu(0))
             # Create map of inputs.
@@ -199,10 +199,10 @@ def _graph_wrapper(*args):
 
         return _graph_wrapper
 
-def create_executor(mode='debug', env=None):
+def create_executor(mode='debug', mod=None):
     if mode == 'debug':
-        return Interpreter(env)
+        return Interpreter(mod)
     elif mode == 'graph':
-        return GraphRuntime(env)
+        return GraphRuntime(mod)
     else:
         raise Exception("unknown mode {0}".format(mode))
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index f3950fffc45f..989e5ad7622f 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -11,16 +11,16 @@
 from .ty import Type
 
 
-def infer_type(expr, env=None):
-    """Infer the type of expr under the context of env.
+def infer_type(expr, mod=None):
+    """Infer the type of expr under the context of mod.
 
     Parameters
     ----------
     expr: tvm.relay.Expr
         The input expression.
 
-    env: Optional[tvm.relay.Environment]
-        The global environment.
+    mod: Optional[tvm.relay.Module]
+        The global module.
 
 
     Returns
@@ -28,7 +28,7 @@ def infer_type(expr, env=None):
     checked_expr : tvm.relay.Expr
         The checked expression.
     """
-    return _ir_pass.infer_type(expr, env)
+    return _ir_pass.infer_type(expr, mod)
 
 
 def backward_fold_scale_axis(expr):
@@ -93,7 +93,7 @@ def well_formed(expr):
     return _ir_pass.well_formed(expr)
 
 
-def check_kind(t, env=None):
+def check_kind(t, mod=None):
     """Check that the type is well kinded.
     For example, this mean type cannot has tensor of tensor, or is a tuple type of 2 shapes.
 
@@ -102,8 +102,8 @@ def check_kind(t, env=None):
     t: tvm.relay.Type
         The type to check
 
-    env: tvm.relay.Environment, optional
-        The global environment
+    mod: tvm.relay.Module, optional
+        The global module
 
     Returns
     -------
@@ -117,8 +117,8 @@ def check_kind(t, env=None):
         assert not check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Shape)]))
         assert check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Type)]))
     """
-    if env is not None:
-        return _ir_pass.check_kind(t, env)
+    if mod is not None:
+        return _ir_pass.check_kind(t, mod)
     else:
         return _ir_pass.check_kind(t)
 
@@ -256,8 +256,8 @@ def structural_hash(value):
                "relay.Expr or relay.Type").format(type(value))
         raise TypeError(msg)
 
-def fuse_ops(expr, env):
-    return _ir_pass.FuseOps(env, expr)
+def fuse_ops(expr, mod):
+    return _ir_pass.FuseOps(mod, expr)
 
-def lower_ops(env, expr, target='llvm'):
-    return _ir_pass.LowerOps(env, expr, target)
+def lower_ops(mod, expr, target='llvm'):
+    return _ir_pass.LowerOps(mod, expr, target)
diff --git a/python/tvm/relay/env.py b/python/tvm/relay/module.py
similarity index 72%
rename from python/tvm/relay/env.py
rename to python/tvm/relay/module.py
index 37e0999dce9e..024c6baf7012 100644
--- a/python/tvm/relay/env.py
+++ b/python/tvm/relay/module.py
@@ -1,18 +1,18 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, wildcard-import
-"""A global environment storing everything needed to interpret or compile a Relay program."""
+"""A global module storing everything needed to interpret or compile a Relay program."""
 from .base import register_relay_node, RelayNode
 from .._ffi import base as _base
 from . import _make
-from . import _env
+from . import _module
 from . import expr as _expr
 
 
 @register_relay_node
-class Environment(RelayNode):
-    """The global Relay environment containing collection of functions.
+class Module(RelayNode):
+    """The global Relay module containing collection of functions.
 
     Each global function is identified by an unique tvm.relay.GlobalVar.
-    tvm.relay.GlobalVar and Environment is necessary in order to enable
+    tvm.relay.GlobalVar and Module is necessary in order to enable
     recursions in function to avoid cyclic reference in the function.x
 
     Parameters
@@ -32,10 +32,10 @@ def __init__(self, functions=None):
                     raise TypeError("Expect functions to be Dict[GlobalVar, Function]")
                 mapped_funcs[k] = v
             functions = mapped_funcs
-        self.__init_handle_by_constructor__(_make.Environment, functions)
+        self.__init_handle_by_constructor__(_make.Module, functions)
 
     def __setitem__(self, var, func):
-        """Add a function to the environment.
+        """Add a function to the module.
 
         Parameters
         ---------
@@ -50,7 +50,7 @@ def __setitem__(self, var, func):
     def _add(self, var, func, update=False):
         if isinstance(var, _base.string_types):
             var = _expr.GlobalVar(var)
-        return _env.Environment_Add(self, var, func, update)
+        return _module.Module_Add(self, var, func, update)
 
     def __getitem__(self, var):
         """Lookup a global function by name or by variable.
@@ -66,21 +66,21 @@ def __getitem__(self, var):
                 The function referenced by :code:`var`.
         """
         if isinstance(var, _base.string_types):
-            return _env.Environment_Lookup_str(self, var)
+            return _module.Module_Lookup_str(self, var)
         else:
-            return _env.Environment_Lookup(self, var)
+            return _module.Module_Lookup(self, var)
 
     def update(self, other):
-        """Insert functions in another Environment to current one.
+        """Insert functions in another Module to current one.
 
         Parameters
         ----------
-        other: Environment
-            The environment to merge into the current Environment.
+        other: Module
+            The module to merge into the current Module.
         """
         if isinstance(other, dict):
-            other = Environment(other)
-        return _env.Environment_Update(self, other)
+            other = Module(other)
+        return _module.Module_Update(self, other)
 
     def get_global_var(self, name):
         """Get a global variable in the function by name.
@@ -99,4 +99,4 @@ def get_global_var(self, name):
         ------
         tvm.TVMError if we cannot find corresponding global var.
         """
-        return _env.Environment_GetGlobalVar(self, name)
+        return _module.Module_GetGlobalVar(self, name)
diff --git a/src/relay/interpreter.cc b/src/relay/interpreter.cc
index 534a2a980e4a..5db7b66ebe83 100644
--- a/src/relay/interpreter.cc
+++ b/src/relay/interpreter.cc
@@ -183,7 +183,7 @@ struct ExprEqual {
 };
 
 struct Interpreter : ExprFunctor<Value(const Expr& n)> {
-  Environment env;
+  Module mod;
   Stack stack;
   using JitKey = Function;
 
@@ -197,8 +197,8 @@ struct Interpreter : ExprFunctor<Value(const Expr& n)> {
     return f();
   }
 
-  Interpreter(Environment env) : env(env), operator_map_() {}
-  Interpreter(Environment env, OpMap operator_map) : env(env), operator_map_(operator_map) {}
+  Interpreter(Module mod) : mod(mod), operator_map_() {}
+  Interpreter(Module mod, OpMap operator_map) : mod(mod), operator_map_(operator_map) {}
 
   void extend(const Var& id, Value v) {
     this->stack.current_frame().locals.Set(id, v);
@@ -223,7 +223,7 @@ struct Interpreter : ExprFunctor<Value(const Expr& n)> {
   }
 
   Value VisitExpr_(const GlobalVarNode* op) override {
-    return Eval(this->env->Lookup(GetRef<GlobalVar>(op)));
+    return Eval(this->mod->Lookup(GetRef<GlobalVar>(op)));
   }
 
   Value VisitExpr_(const OpNode* id) override {
@@ -251,14 +251,14 @@ struct Interpreter : ExprFunctor<Value(const Expr& n)> {
 
   Value VisitExpr_(const FunctionNode* func_node) override {
     auto func = GetRef<Function>(func_node);
-    tvm::Map<Var, Value> captured_env;
+    tvm::Map<Var, Value> captured_mod;
     Array<Var> free_vars = FreeVars(func);
 
     for (const auto& var : free_vars) {
-      captured_env.Set(var, Eval(var));
+      captured_mod.Set(var, Eval(var));
     }
 
-    return ClosureNode::make(captured_env, func);
+    return ClosureNode::make(captured_mod, func);
   }
 
   inline Value InvokeCompiledOp(PackedFunc func, const Array<Value>& args,
@@ -315,7 +315,7 @@ struct Interpreter : ExprFunctor<Value(const Expr& n)> {
       locals.Set(func->params[i], args[i]);
     }
 
-    // Add the var to value mappings from the Closure's environment.
+    // Add the var to value mappings from the Closure's modironment.
     for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
       CHECK_EQ(locals.count((*it).first), 0);
       locals.Set((*it).first, (*it).second);
@@ -384,9 +384,9 @@ struct Interpreter : ExprFunctor<Value(const Expr& n)> {
   }
 };
 
-Interpreter::OpMap CompileOperators(const Environment& env, const Expr& e) {
+Interpreter::OpMap CompileOperators(const Module& mod, const Expr& e) {
   Interpreter::OpMap op_map;
-  auto lowered_ops = LowerOps(env, e);
+  auto lowered_ops = LowerOps(mod, e);
   RELAY_LOG(INFO) << "LoweredFuncs: " << lowered_ops << std::endl;
   if (lowered_ops.size()) {
     const PackedFunc* fbuild_ptr = Registry::Get("relay.op.compiler._build");
@@ -399,7 +399,7 @@ Interpreter::OpMap CompileOperators(const Environment& env, const Expr& e) {
       lowered_funcs.push_back(lop->lowered_func);
     }
 
-    Module module = fbuild(lowered_funcs);
+    runtime::Module module = fbuild(lowered_funcs);
 
     // Loop over the lowered operations to map them into the operator map.
     for (auto lop : lowered_ops) {
@@ -415,17 +415,17 @@ Interpreter::OpMap CompileOperators(const Environment& env, const Expr& e) {
   return op_map;
 }
 
-Value Evaluate(Environment env, Expr e) {
-  auto op_map = CompileOperators(env, e);
-  Interpreter interp(env, op_map);
+Value Evaluate(Module mod, Expr e) {
+  auto op_map = CompileOperators(mod, e);
+  Interpreter interp(mod, op_map);
   return interp.Eval(e);
 }
 
 TVM_REGISTER_API("relay._interpreter.evaluate")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Environment env = args[0];
+      Module mod = args[0];
       Expr expr = args[1];
-      *ret = Evaluate(env, expr);
+      *ret = Evaluate(mod, expr);
     });
 
 }  // namespace relay
diff --git a/src/relay/ir/environment.cc b/src/relay/ir/module.cc
similarity index 55%
rename from src/relay/ir/environment.cc
rename to src/relay/ir/module.cc
index 262758ba0478..4443ed50783e 100644
--- a/src/relay/ir/environment.cc
+++ b/src/relay/ir/module.cc
@@ -1,9 +1,9 @@
 /*!
  *  Copyright (c) 2018 by Contributors
- * \file  environment.cc
- * \brief The global environment in Relay.
+ * \file  module.cc
+ * \brief The global module in Relay.
  */
-#include <tvm/relay/environment.h>
+#include <tvm/relay/module.h>
 #include <tvm/relay/pass.h>
 #include <sstream>
 
@@ -13,8 +13,8 @@ namespace relay {
 using tvm::IRPrinter;
 using namespace runtime;
 
-Environment EnvironmentNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
-  auto n = make_node<EnvironmentNode>();
+Module ModuleNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
+  auto n = make_node<ModuleNode>();
   n->functions = std::move(global_funcs);
 
   for (const auto& kv : n->functions) {
@@ -23,22 +23,22 @@ Environment EnvironmentNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
         << "Duplicate global function name " << kv.first->name_hint;
     n->global_var_map_.Set(kv.first->name_hint, kv.first);
   }
-  return Environment(n);
+  return Module(n);
 }
 
-GlobalVar EnvironmentNode::GetGlobalVar(const std::string& name) {
+GlobalVar ModuleNode::GetGlobalVar(const std::string& name) {
   auto it = global_var_map_.find(name);
   CHECK(it != global_var_map_.end())
-      << "Cannot find global var " << name << " in the Environment";
+      << "Cannot find global var " << name << " in the Module";
   return (*it).second;
 }
 
-void EnvironmentNode::Add(const GlobalVar& var,
+void ModuleNode::Add(const GlobalVar& var,
                           const Function& func,
                           bool update) {
-  // Type check the item before we add it to the environment.
-  auto env = GetRef<Environment>(this);
-  Function checked_func = InferType(func, env, var);
+  // Type check the item before we add it to the modironment.
+  auto mod = GetRef<Module>(this);
+  Function checked_func = InferType(func, mod, var);
   auto type = checked_func->checked_type();
   CHECK(type.as<IncompleteTypeNode>() == nullptr);
   if (functions.find(var) != functions.end()) {
@@ -46,7 +46,7 @@ void EnvironmentNode::Add(const GlobalVar& var,
         << "Already have definition for " << var->name_hint;
     auto old_type = functions[var].as<FunctionNode>()->checked_type();
     CHECK(AlphaEqual(type, old_type))
-        << "Environment#update changes type, not possible in this mode.";
+        << "Module#update changes type, not possible in this mode.";
   }
   this->functions.Set(var, checked_func);
 
@@ -62,79 +62,79 @@ void EnvironmentNode::Add(const GlobalVar& var,
   global_var_map_.Set(var->name_hint, var);
 }
 
-void EnvironmentNode::Update(const GlobalVar& var, const Function& func) {
+void ModuleNode::Update(const GlobalVar& var, const Function& func) {
   this->Add(var, func, true);
 }
 
-void EnvironmentNode::Remove(const GlobalVar& var) {
+void ModuleNode::Remove(const GlobalVar& var) {
   auto functions_node = this->functions.CopyOnWrite();
   functions_node->data.erase(var.node_);
   auto gvar_node = global_var_map_.CopyOnWrite();
   gvar_node->data.erase(var->name_hint);
 }
 
-Function EnvironmentNode::Lookup(const GlobalVar& var) {
+Function ModuleNode::Lookup(const GlobalVar& var) {
   auto it = functions.find(var);
   CHECK(it != functions.end())
       << "There is no definition of " << var->name_hint;
   return (*it).second;
 }
 
-Function EnvironmentNode::Lookup(const std::string& name) {
+Function ModuleNode::Lookup(const std::string& name) {
   GlobalVar id = this->GetGlobalVar(name);
   return this->Lookup(id);
 }
 
-void EnvironmentNode::Update(const Environment& env) {
-  for (auto pair : env->functions) {
+void ModuleNode::Update(const Module& mod) {
+  for (auto pair : mod->functions) {
     this->Update(pair.first, pair.second);
   }
 }
 
-TVM_REGISTER_NODE_TYPE(EnvironmentNode);
+TVM_REGISTER_NODE_TYPE(ModuleNode);
 
-TVM_REGISTER_API("relay._make.Environment")
+TVM_REGISTER_API("relay._make.Module")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = EnvironmentNode::make(args[0]);
+    *ret = ModuleNode::make(args[0]);
   });
 
-TVM_REGISTER_API("relay._env.Environment_Add")
+TVM_REGISTER_API("relay._module.Module_Add")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    Environment env = args[0];
-    env->Add(args[1], args[2], args[3]);
+    Module mod = args[0];
+    mod->Add(args[1], args[2], args[3]);
   });
 
-TVM_REGISTER_API("relay._env.Environment_GetGlobalVar")
+TVM_REGISTER_API("relay._module.Module_GetGlobalVar")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    Environment env = args[0];
-    *ret = env->GetGlobalVar(args[1]);
+    Module mod = args[0];
+    *ret = mod->GetGlobalVar(args[1]);
   });
 
-TVM_REGISTER_API("relay._env.Environment_Lookup")
+TVM_REGISTER_API("relay._module.Module_Lookup")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    Environment env = args[0];
+    Module mod = args[0];
     GlobalVar var = args[1];
-    *ret = env->Lookup(var);
+    *ret = mod->Lookup(var);
   });
 
-TVM_REGISTER_API("relay._env.Environment_Lookup_str")
+TVM_REGISTER_API("relay._module.Module_Lookup_str")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    Environment env = args[0];
+    Module mod = args[0];
     std::string var_name = args[1];
-    auto var = env->GetGlobalVar(var_name);
-    *ret = env->Lookup(var);
+    auto var = mod->GetGlobalVar(var_name);
+    *ret = mod->Lookup(var);
   });
 
-TVM_REGISTER_API("relay._env.Environment_Update")
+TVM_REGISTER_API("relay._module.Module_Update")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    Environment env = args[0];
-    env->Update(args[1]);
+    Module mod = args[0];
+    mod->Update(args[1]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-.set_dispatch<EnvironmentNode>(
-    [](const EnvironmentNode *node, tvm::IRPrinter *p) {
-      p->stream << "EnvironmentNode( " << node->functions << ")";
+.set_dispatch<ModuleNode>(
+    [](const ModuleNode *node, tvm::IRPrinter *p) {
+      p->stream << "ModuleNode( " << node->functions << ")";
     });
 
 }  // namespace relay
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 8056adc9a8b8..04f51a14ac5f 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -3,7 +3,7 @@
  * \file text_printer.cc
  * \brief Text printer to print relay in text form.
  */
-#include <tvm/relay/environment.h>
+#include <tvm/relay/module.h>
 #include <tvm/relay/expr_functor.h>
 #include <sstream>
 #include "type_functor.h"
@@ -133,8 +133,8 @@ class TextPrinter :
   std::string Print(const NodeRef& node) {
     if (node.as<FunctionNode>()) {
       this->PrintFunc(Downcast<Function>(node));
-    } else if (node.as<EnvironmentNode>()) {
-      this->PrintEnv(Downcast<Environment>(node));
+    } else if (node.as<ModuleNode>()) {
+      this->PrintEnv(Downcast<Module>(node));
     } else if (node.as_derived<TypeNode>()) {
       this->PrintType(Downcast<Type>(node), stream_);
     } else if (node.as_derived<ExprNode>()) {
@@ -158,9 +158,9 @@ class TextPrinter :
     stream_ << "\n";
   }
 
-  void PrintEnv(const Environment& env) {
+  void PrintEnv(const Module& mod) {
     int counter = 0;
-    for (const auto& kv : env->functions) {
+    for (const auto& kv : mod->functions) {
       std::ostringstream os;
       if (counter++ != 0) {
         stream_ << "\n";
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 3aea12931649..f5538331a778 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -20,12 +20,12 @@ namespace relay {
 using namespace runtime;
 
 struct AbstractFusableOps : ExprMutator {
-  Environment env;
+  Module mod;
   Array<GlobalVar> fusable_funcs;
   int counter = 0;
   size_t expr_hash;
 
-  AbstractFusableOps(Environment env, size_t expr_hash) : env(env), expr_hash(expr_hash) {}
+  AbstractFusableOps(Module mod, size_t expr_hash) : mod(mod), expr_hash(expr_hash) {}
 
   Expr VisitExpr_(const CallNode* call) {
     if (auto op_node = call->op.as<OpNode>()) {
@@ -55,7 +55,7 @@ struct AbstractFusableOps : ExprMutator {
       func_name += "_";
       func_name += std::to_string(expr_hash);
       auto gv = GlobalVarNode::make(func_name);
-      env->Add(gv, func);
+      mod->Add(gv, func);
       fusable_funcs.push_back(gv);
       return CallNode::make(gv, args, Attrs());
     } else {
@@ -64,12 +64,12 @@ struct AbstractFusableOps : ExprMutator {
   }
 };
 
-Expr FuseOps(const Environment& env, const Expr& e) {
+Expr FuseOps(const Module& mod, const Expr& e) {
   // First we convert all chains of fusable ops into
   // abstracted functions which we mark as primtive
   // then we convert these primtive functions into
   // new operators.
-  auto abstract = AbstractFusableOps(env, StructuralHash()(e));
+  auto abstract = AbstractFusableOps(mod, StructuralHash()(e));
   auto abstracted_e = abstract.VisitExpr(e);
   RELAY_LOG(INFO) << "FuseOps: before=" << e
                   << "Fuse: after=" << abstracted_e;
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
index 81e72c6d7df8..7253a600dabf 100644
--- a/src/relay/pass/kind_check.cc
+++ b/src/relay/pass/kind_check.cc
@@ -99,7 +99,7 @@ struct KindChecker : TypeVisitor {
   }
 };
 
-bool KindCheck(const Type& t, const Environment& env) {
+bool KindCheck(const Type& t, const Module& mod) {
   KindChecker kc;
   return kc.Check(t);
 }
@@ -107,7 +107,7 @@ bool KindCheck(const Type& t, const Environment& env) {
 TVM_REGISTER_API("relay._ir_pass.check_kind")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
     if (args.size() == 1) {
-      *ret = KindCheck(args[0], EnvironmentNode::make({}));
+      *ret = KindCheck(args[0], ModuleNode::make({}));
     } else {
       *ret = KindCheck(args[0], args[1]);
     }
diff --git a/src/relay/pass/lower_ops.cc b/src/relay/pass/lower_ops.cc
index 6bab9a924269..f2c8ceba866d 100644
--- a/src/relay/pass/lower_ops.cc
+++ b/src/relay/pass/lower_ops.cc
@@ -28,12 +28,12 @@ LoweredOp LoweredOpNode::make(Function func, LoweredFunc lowered_func) {
 }
 
 struct AbstractLocalFunctions : ExprMutator {
-  Environment env;
+  Module mod;
   size_t expr_hash;
   int counter = 0;
   std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
-  explicit AbstractLocalFunctions(Environment env)
-      : env(env), expr_hash(0), counter(0), visited_funcs() {}
+  explicit AbstractLocalFunctions(Module mod)
+      : mod(mod), expr_hash(0), counter(0), visited_funcs() {}
 
   Expr Abstract(const Expr& e) {
     expr_hash = StructuralHash()(e);
@@ -44,7 +44,7 @@ struct AbstractLocalFunctions : ExprMutator {
     auto gvar = GetRef<GlobalVar>(gvar_node);
     auto it = visited_funcs.find(gvar);
     if (it == visited_funcs.end()) {
-      auto func = env->Lookup(gvar);
+      auto func = mod->Lookup(gvar);
       visited_funcs.insert(gvar);
       auto new_func = FunctionNode::make(
         func->params,
@@ -52,7 +52,7 @@ struct AbstractLocalFunctions : ExprMutator {
         func->ret_type,
         func->type_params,
         func->attrs);
-      env->Update(gvar, new_func);
+      mod->Update(gvar, new_func);
     }
     return gvar;
   }
@@ -70,7 +70,7 @@ struct AbstractLocalFunctions : ExprMutator {
     abs_func += std::to_string(expr_hash);
     auto gv = GlobalVarNode::make(abs_func);
     auto lifted_func = FunctionNode::make(params, func, Type(), {}, {});
-    env->Add(gv, lifted_func);
+    mod->Add(gv, lifted_func);
     Array<Expr> args;
     for (auto free_var : free_vars) {
       args.push_back(free_var);
@@ -80,8 +80,8 @@ struct AbstractLocalFunctions : ExprMutator {
 };
 
 struct LiveFunctions : ExprVisitor {
-  Environment env;
-  explicit LiveFunctions(Environment env) : env(env), global_funcs() {}
+  Module mod;
+  explicit LiveFunctions(Module mod) : mod(mod), global_funcs() {}
 
   std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
   std::unordered_set<GlobalVar, NodeHash, NodeEqual> global_funcs;
@@ -100,7 +100,7 @@ struct LiveFunctions : ExprVisitor {
     GlobalVar var = GetRef<GlobalVar>(var_node);
     auto it = visited_funcs.find(var);
     if (it == visited_funcs.end()) {
-      auto func = env->Lookup(var);
+      auto func = mod->Lookup(var);
       visited_funcs.insert(var);
       // The last pass has trasnformed functions of the form:
       //
@@ -134,7 +134,7 @@ struct LiveFunctions : ExprVisitor {
     RELAY_LOG(INFO) << "LiveOps: CallNode=" << GetRef<Call>(call);
     if (auto gv_node = call->op.as<GlobalVarNode>()) {
       GlobalVar gvar = GetRef<GlobalVar>(gv_node);
-      Function func = env->Lookup(gvar);
+      Function func = mod->Lookup(gvar);
 
       auto attr = FunctionGetAttr(func, "Primitive");
 
@@ -159,15 +159,15 @@ using FCompute = TypedPackedFunc<Array<Tensor>(
 using FSchedule = TypedPackedFunc<Schedule(const Array<Tensor>&, std::string)>;
 
 /*! \brief Return the set of operators in their TVM format. */
-Array<LoweredOp> LowerOps(const Environment& env, const Expr& e,
+Array<LoweredOp> LowerOps(const Module& mod, const Expr& e,
                           const std::string& target) {
   RELAY_LOG(INFO) << "LowerOps: e=" << e;
   auto flower_ptr = Registry::Get("relay.op.compiler._lower");
   CHECK(flower_ptr);
   PackedFunc flower = *flower_ptr;
 
-  auto abstracted_e = AbstractLocalFunctions(env).Abstract(e);
-  auto live_funcs = LiveFunctions(env);
+  auto abstracted_e = AbstractLocalFunctions(mod).Abstract(e);
+  auto live_funcs = LiveFunctions(mod);
   live_funcs.VisitExpr(abstracted_e);
 
   auto schedule_reg = Op::GetAttr<FSchedule>("FTVMSchedule");
@@ -176,7 +176,7 @@ Array<LoweredOp> LowerOps(const Environment& env, const Expr& e,
   Array<LoweredOp> lowered_funcs;
 
   for (auto func_name : live_funcs.global_funcs) {
-    auto func = env->Lookup(func_name);
+    auto func = mod->Lookup(func_name);
     auto call = Downcast<Call>(func->body);
     auto op_node = call->op.as<OpNode>();
     CHECK(op_node) << "violated invariant that primtiive calls contain a single op call";
@@ -205,7 +205,7 @@ Array<LoweredOp> LowerOps(const Environment& env, const Expr& e,
     LoweredFunc lf =
         flower(op->name + std::to_string(hash), schedule, inputs, outputs);
     func = FunctionSetAttr(func, "LoweredFunc", lf);
-    env->Add(func_name, func, true);
+    mod->Add(func_name, func, true);
     lowered_funcs.push_back(LoweredOpNode::make(func, lf));
   }
 
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 864b7ad78abd..b224a099aee1 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -104,8 +104,8 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   // constructors
   TypeInferencer() {
   }
-  explicit TypeInferencer(Environment env)
-      : env_(env) {
+  explicit TypeInferencer(Module mod)
+      : mod_(mod) {
   }
 
   // inference the type of expr.
@@ -115,7 +115,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
   // type resolver that maps back to type
   class Resolver;
   // internal environment
-  Environment env_;
+  Module mod_;
   // map from expression to checked type
   // type inferencer will populate it up
   std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual> type_map_;
@@ -164,9 +164,9 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
 
   Type VisitExpr_(const GlobalVarNode* op) final {
     GlobalVar var = GetRef<GlobalVar>(op);
-    CHECK(env_.defined())
+    CHECK(mod_.defined())
         << "Cannot do type inference without a global variable";
-    Expr e = env_->Lookup(var);
+    Expr e = mod_->Lookup(var);
     return e->checked_type();
   }
 
@@ -511,20 +511,20 @@ Expr TypeInferencer::Infer(Expr expr) {
 }
 
 
-Expr InferType(const Expr& expr, const Environment& env) {
-  auto e = TypeInferencer(env).Infer(expr);
+Expr InferType(const Expr& expr, const Module& mod) {
+  auto e = TypeInferencer(mod).Infer(expr);
   CHECK(WellFormed(e));
   return e;
 }
 
 Function InferType(const Function& func,
-                   const Environment& env,
+                   const Module& mod,
                    const GlobalVar& var) {
   Function func_copy = Function(make_node<FunctionNode>(*func.operator->()));
   func_copy->checked_type_ = func_copy->func_type_annotation();
-  env->functions.Set(var, func_copy);
-  Expr func_ret = TypeInferencer(env).Infer(func_copy);
-  auto map_node = env->functions.CopyOnWrite();
+  mod->functions.Set(var, func_copy);
+  Expr func_ret = TypeInferencer(mod).Infer(func_copy);
+  auto map_node = mod->functions.CopyOnWrite();
   map_node->data.erase(var.node_);
   CHECK(WellFormed(func_ret));
   return Downcast<Function>(func_ret);
diff --git a/tests/cpp/relay_pass_type_infer_test.cc b/tests/cpp/relay_pass_type_infer_test.cc
index e1a81d3c0535..385bde974014 100644
--- a/tests/cpp/relay_pass_type_infer_test.cc
+++ b/tests/cpp/relay_pass_type_infer_test.cc
@@ -11,7 +11,7 @@ TEST(Relay, SelfReference) {
   auto x = relay::VarNode::make("x", type_a);
   auto f = relay::FunctionNode::make(tvm::Array<relay::Var>{ x }, x, type_b, Array<relay::TypeVar>{});
   auto fx = relay::CallNode::make(f, Array<relay::Expr>{ x });
-  auto type_fx = relay::InferType(fx, relay::EnvironmentNode::make(Map<relay::GlobalVar, relay::Function>{}));
+  auto type_fx = relay::InferType(fx, relay::ModuleNode::make(Map<relay::GlobalVar, relay::Function>{}));
   CHECK_EQ(type_fx->checked_type(), type_a);
 }
 
diff --git a/tests/python/relay/test_graph_runtime.py b/tests/python/relay/test_graph_runtime.py
index 38acc5df08d8..7b89831dbfce 100644
--- a/tests/python/relay/test_graph_runtime.py
+++ b/tests/python/relay/test_graph_runtime.py
@@ -6,10 +6,10 @@
 from tvm.relay.interpreter import Interpreter
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay.op import add
-from tvm.relay.env import Environment
+from tvm.relay.module import Module
 
 # @tq, @jr should we put this in testing ns?
-def check_rts(expr, args, expected_result, env=None):
+def check_rts(expr, args, expected_result, mod=None):
     """
     Check that evaluating `expr` applied to the arguments produces
     `result` on both the evaluator and TVM runtime.
@@ -25,8 +25,8 @@ def check_rts(expr, args, expected_result, env=None):
     expected_result:
         The expected result of running the expression.
     """
-    intrp = create_executor('graph', env=env)
-    graph = create_executor('graph', env=env)
+    intrp = create_executor('graph', mod=mod)
+    graph = create_executor('graph', mod=mod)
     eval_result = intrp.evaluate(expr)(*args)
     rts_result = graph.evaluate(expr)(*args)
     np.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
diff --git a/tests/python/relay/test_interpreter.py b/tests/python/relay/test_interpreter.py
index f2eaa3d02dec..b7214965db22 100644
--- a/tests/python/relay/test_interpreter.py
+++ b/tests/python/relay/test_interpreter.py
@@ -7,8 +7,8 @@
 from tvm.relay import testing, create_executor
 
 
-def check_eval(expr, args, expected_result, env=None, rtol=1e-07):
-    intrp = create_executor(env=env)
+def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
+    intrp = create_executor(mod=mod)
     result = intrp.evaluate(expr)(*args)
     np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
 
@@ -87,7 +87,7 @@ def test_subtract():
     check_eval(func, [i_data], 0)
 
 def test_simple_loop():
-    env = relay.env.Environment({})
+    mod = relay.module.Module({})
     sum_up = relay.GlobalVar('sum_up')
     i = relay.var('i', shape=[], dtype='int32')
     sb = ScopeBuilder()
@@ -98,12 +98,12 @@ def test_simple_loop():
         rec_call = relay.Call(sum_up, [one_less])
         sb.ret(op.add(rec_call, i))
     func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
-    env[sum_up] = func
+    mod[sum_up] = func
     i_data = np.array(10, dtype='int32')
-    check_eval(sum_up, [i_data], sum(range(1, 11)), env=env)
+    check_eval(sum_up, [i_data], sum(range(1, 11)), mod=mod)
 
 def test_loop():
-    env = relay.env.Environment({})
+    mod = relay.module.Module({})
     sum_up = relay.GlobalVar('sum_up')
     i = relay.var('i', shape=[], dtype='int32')
     accum = relay.var('accum', shape=[], dtype='int32')
@@ -115,10 +115,10 @@ def test_loop():
         new_accum = op.add(accum, i)
         sb.ret(relay.Call(sum_up, [one_less, new_accum]))
     func = relay.Function([i, accum], sb.get())
-    env[sum_up] = func
+    mod[sum_up] = func
     i_data = np.array(10, dtype='int32')
     accum_data = np.array(0, dtype='int32')
-    check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), env=env)
+    check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), mod=mod)
 
 def test_mlp():
     pass
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index aa944bc217c2..dd790a6d7d87 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -28,7 +28,7 @@ def test_env():
     z = relay.add(x, y)
     z = relay.add(z, z)
     f = relay.Function([x, y], z)
-    env = relay.Environment()
+    env = relay.Module()
     env["myf"] = f
     text = env.astext()
     assert "def @myf" in text
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index 31d350dc7ff7..c1f06ccc763a 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -9,8 +9,8 @@
 from tvm.relay.scope_builder import ScopeBuilder
 
 
-def assert_has_type(expr, typ, env=relay.env.Environment({})):
-    checked_expr = infer_type(expr, env)
+def assert_has_type(expr, typ, mod=relay.module.Module({})):
+    checked_expr = infer_type(expr, mod)
     checked_type = checked_expr.checked_type
     if checked_type != typ:
         raise RuntimeError("Type mismatch %s vs %s" % (
@@ -105,10 +105,10 @@ def f(n: i32, data: f32) -> f32 {
         sb.ret(data)
     with sb.else_scope():
         sb.ret(f(relay.subtract(n, relay.const(1, ti32)), relay.log(data)))
-    env = relay.Environment()
-    env[f] = relay.Function([n, data], sb.get())
-    assert "%3 = @f(%1, %2)" in env.astext()
-    assert env[f].checked_type == relay.FuncType([ti32, tf32], tf32)
+    mod = relay.Module()
+    mod[f] = relay.Function([n, data], sb.get())
+    assert "%3 = @f(%1, %2)" in mod.astext()
+    assert mod[f].checked_type == relay.FuncType([ti32, tf32], tf32)
 
 # This currently fails and should pass under the type system.
 #
@@ -179,12 +179,12 @@ def f(x) {
 
 
 def test_global_var_cow_issue():
-    env = relay.env.Environment({})
+    mod = relay.Module({})
     gv = relay.GlobalVar("foo")
     x = relay.var('x', shape=[])
     func = relay.Function([x], relay.Call(gv, [x]),
                           relay.TensorType([], 'float32'))
-    env[gv] = func
+    mod[gv] = func
 
 
 def test_equal():

From 7f01770c2304efa8d62b113f91d3d8de64e05b68 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Sun, 4 Nov 2018 17:46:21 -0800
Subject: [PATCH 328/529] [RELAY][RUNTIME] Add compute and schedule attributes
 for all ops in relay/op/tensor.py (#2050)

---
 include/tvm/attrs.h                  |  16 +-
 include/tvm/build_module.h           |   2 +-
 python/tvm/relay/interpreter.py      |  21 ++-
 python/tvm/relay/op/__init__.py      |   2 +-
 python/tvm/relay/op/_tensor.py       | 269 ++++++++++++++++++++++++---
 python/tvm/relay/op/op.py            |   5 +
 python/tvm/relay/op/tensor.py        |  33 ++--
 src/relay/pass/lower_ops.cc          |  14 +-
 tests/python/relay/test_op_level1.py |  83 +++++++--
 tests/python/relay/test_op_level3.py |  47 +++--
 tests/python/relay/test_op_level4.py |  66 +++++--
 11 files changed, 453 insertions(+), 105 deletions(-)

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 51d916ca488d..cc1abe6e57de 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -735,12 +735,12 @@ template<typename DerivedType>
 class AttrsNode : public BaseAttrsNode {
  public:
   void VisitAttrs(AttrVisitor* v) final {
-    detail::AttrNormalVisitor vis(v);
+    ::tvm::detail::AttrNormalVisitor vis(v);
     self()->__VisitAttrs__(vis);
   }
 
   void VisitNonDefaultAttrs(AttrVisitor* v) final {
-    detail::AttrNonDefaultVisitor vis(v);
+    ::tvm::detail::AttrNonDefaultVisitor vis(v);
     self()->__VisitAttrs__(vis);
   }
 
@@ -761,7 +761,7 @@ class AttrsNode : public BaseAttrsNode {
         }
         return false;
       };
-      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
       self()->__VisitAttrs__(vis);
       hit_count = vis.hit_count_;
     } else {
@@ -779,14 +779,14 @@ class AttrsNode : public BaseAttrsNode {
         }
         return false;
       };
-      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
       self()->__VisitAttrs__(vis);
       hit_count = vis.hit_count_;
     }
     // error handling, slow path
     if (hit_count * 2 != args.size() && !allow_unknown) {
       for (int i = 0; i < args.size(); i += 2) {
-        detail::AttrExistVisitor visitor;
+        ::tvm::detail::AttrExistVisitor visitor;
         visitor.key_ = args[i].operator std::string();
         self()->__VisitAttrs__(visitor);
         if (!visitor.exist_) {
@@ -803,7 +803,7 @@ class AttrsNode : public BaseAttrsNode {
   }
 
   Array<AttrFieldInfo> ListFieldInfo() const final {
-    detail::AttrDocVisitor visitor;
+    ::tvm::detail::AttrDocVisitor visitor;
     self()->__VisitAttrs__(visitor);
     return visitor.fields_;
   }
@@ -813,13 +813,13 @@ class AttrsNode : public BaseAttrsNode {
     if (pself == other) return true;
     if (other == nullptr) return false;
     if (pself->type_index() != other->type_index()) return false;
-    detail::AttrsEqualVisitor visitor(pself, other, equal);
+    ::tvm::detail::AttrsEqualVisitor visitor(pself, other, equal);
     self()->__VisitAttrs__(visitor);
     return visitor.result_;
   }
 
   size_t ContentHash(AttrsHash hasher) const final {
-    detail::AttrsHashVisitor visitor(hasher);
+    ::tvm::detail::AttrsHashVisitor visitor(hasher);
     visitor.result_ = std::hash<std::string>()(this->type_key());
     self()->__VisitAttrs__(visitor);
     return visitor.result_;
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 7aafad4216e1..ddd54f604a68 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -417,7 +417,7 @@ inline TVMRetValue GenericFunc::operator()(Args&& ...args) const {
   const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
   TVMValue values[kArraySize];
   int type_codes[kArraySize];
-  detail::for_each(TVMArgsSetter(values, type_codes),
+  runtime::detail::for_each(TVMArgsSetter(values, type_codes),
     std::forward<Args>(args)...);
   TVMRetValue rv;
   CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv);
diff --git a/python/tvm/relay/interpreter.py b/python/tvm/relay/interpreter.py
index 4dfe3e02989e..bd8ef0d14415 100644
--- a/python/tvm/relay/interpreter.py
+++ b/python/tvm/relay/interpreter.py
@@ -138,7 +138,8 @@ def evaluate(self, expr, params=None):
         """
         if params:
             scope_builder = ScopeBuilder()
-            for key, value in params:
+            for key in params:
+                value = params[key]
                 scope_builder.let(key, value)
             scope_builder.ret(expr)
             expr = scope_builder.get()
@@ -146,7 +147,17 @@ def evaluate(self, expr, params=None):
         if isinstance(expr, Function):
             assert not ir_pass.free_vars(expr)
 
-        return self._make_executor(expr)
+        executor = self._make_executor(expr)
+
+        # If we are evaluating a function or top-level defintion
+        # the user must call the function themselves.
+        #
+        # If we are evaluating an open term with parameters we will
+        # just return them the result.
+        if isinstance(expr, (Function, GlobalVar)):
+            return executor
+        else:
+            return executor()
 
 
 class Interpreter(Executor):
@@ -168,10 +179,14 @@ def _interp_wrapper(*args):
                 self.mod._add(expr, func, True)
                 opt_expr = Call(expr, relay_args)
                 return _interpreter.evaluate(self.mod, opt_expr)
-            else:
+            elif isinstance(expr, Function):
                 call = Call(expr, relay_args)
                 opt_expr = self.optimize(call)
                 return _interpreter.evaluate(self.mod, opt_expr)
+            else:
+                assert not args
+                opt_expr = self.optimize(expr)
+                return _interpreter.evaluate(self.mod, opt_expr)
 
         return _interp_wrapper
 
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 7b61fd10f5b0..9b581486608b 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -1,7 +1,7 @@
 #pylint: disable=wildcard-import, redefined-builtin
 """Relay core operators."""
 # operator defs
-from .op import get, register, Op
+from .op import get, register, register_schedule, register_compute, Op
 
 # Operators
 from .reduce import *
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 6ccb394ef8db..5841d278378a 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -1,49 +1,272 @@
 #pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
+from __future__ import absolute_import
 import tvm
 import topi
-from . import register
+import topi.cuda
+from . import register_schedule, register_compute
 
+def schedule_injective(outputs, target):
+    """Generic schedule for binary broadcast."""
+    with tvm.target.create(target):
+        return topi.generic.schedule_injective(outputs)
+
+schedule_broadcast = schedule_injective
+schedule_elemwise = schedule_injective
+
+# log
+def log_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.log(inputs[0])]
+
+register_compute("log", log_compute)
+register_schedule("log", schedule_broadcast)
+
+# exp
+def exp_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.exp(inputs[0])]
+
+register_compute("exp", exp_compute)
+register_schedule("exp", schedule_broadcast)
+
+# sqrt
+def sqrt_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.sqrt(inputs[0])]
+
+register_compute("sqrt", sqrt_compute)
+register_schedule("sqrt", schedule_broadcast)
+
+# sigmoid
+def sigmoid_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.sigmoid(inputs[0])]
+
+register_compute("sigmoid", sigmoid_compute)
+register_schedule("sigmoid", schedule_broadcast)
+
+# floor
+def floor_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.floor(inputs[0])]
+
+register_compute("floor", floor_compute)
+register_schedule("floor", schedule_broadcast)
+
+# ceil
+def ceil_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.ceil(inputs[0])]
+
+register_compute("ceil", ceil_compute)
+register_schedule("ceil", schedule_broadcast)
+
+# trunc
+def trunc_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.trunc(inputs[0])]
+
+register_compute("trunc", trunc_compute)
+register_schedule("trunc", schedule_broadcast)
+
+# round
+def round_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.round(inputs[0])]
+
+register_compute("round", round_compute)
+register_schedule("round", schedule_broadcast)
+
+# abs
+def abs_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.abs(inputs[0])]
+
+register_compute("abs", abs_compute)
+register_schedule("abs", schedule_broadcast)
+
+# tanh
+def tanh_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.tanh(inputs[0])]
+
+register_compute("tanh", tanh_compute)
+register_schedule("tanh", schedule_broadcast)
+
+# negative
+def negative_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.negative(inputs[0])]
+
+register_compute("negative", negative_compute)
+register_schedule("negative", schedule_broadcast)
+
+# add
 def add_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.add(inputs[0], inputs[1])]
 
-def add_schedule(outputs, target):
-    assert len(outputs) == 1
-    return tvm.create_schedule(outputs[0].op)
-
-register("add", "FTVMCompute", add_compute)
-register("add", "FTVMSchedule", add_schedule)
+register_compute("add", add_compute)
+register_schedule("add", schedule_injective)
 
+# subtract
 def subtract_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.subtract(inputs[0], inputs[1])]
 
-def subtract_schedule(outputs, target):
-    assert len(outputs) == 1
-    return tvm.create_schedule(outputs[0].op)
-
-register("subtract", "FTVMCompute", subtract_compute)
-register("subtract", "FTVMSchedule", subtract_schedule)
+register_compute("subtract", subtract_compute)
+register_schedule("subtract", schedule_broadcast)
 
+# multiply
 def multiply_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.multiply(inputs[0], inputs[1])]
 
-def multiply_schedule(outputs, target):
-    assert len(outputs) == 1
-    return tvm.create_schedule(outputs[0].op)
+register_compute("multiply", multiply_compute)
+register_schedule("multiply", schedule_broadcast)
+
+# divide
+def divide_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.divide(inputs[0], inputs[1])]
+
+register_compute("divide", divide_compute)
+register_schedule("divide", schedule_broadcast)
 
-register("multiply", "FTVMCompute", multiply_compute)
-register("multiply", "FTVMSchedule", multiply_schedule)
+# pow
+def pow_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.power(inputs[0], inputs[1])]
 
+register_compute("pow", pow_compute)
+register_schedule("pow", schedule_injective)
+
+# mod
+def mod_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.mod(inputs[0], inputs[1])]
+
+register_compute("mod", mod_compute)
+register_schedule("mod", schedule_broadcast)
+
+# equal
 def equal_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.equal(inputs[0], inputs[1])]
 
-def equal_schedule(outputs, target):
-    assert len(outputs) == 1
-    return tvm.create_schedule(outputs[0].op)
+register_compute("equal", equal_compute)
+register_schedule("equal", schedule_broadcast)
+
+# not_equal
+def not_equal_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.not_equal(inputs[0], inputs[1])]
+
+register_compute("not_equal", not_equal_compute)
+register_schedule("not_equal", schedule_broadcast)
+
+# less
+def less_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.less(inputs[0], inputs[1])]
+
+register_compute("less", less_compute)
+register_schedule("less", schedule_broadcast)
+
+# less equal
+def less_equal_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.less_equal(inputs[0], inputs[1])]
+
+register_compute("less_equal", less_equal_compute)
+register_schedule("less_equal", schedule_broadcast)
+
+# greater
+def greater_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.greater(inputs[0], inputs[1])]
+
+register_compute("greater", greater_compute)
+register_schedule("greater", schedule_broadcast)
+
+# greater equal
+def greater_equal_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.greater_equal(inputs[0], inputs[1])]
+
+register_compute("greater_equal", greater_equal_compute)
+register_schedule("greater_equal", schedule_broadcast)
+
+# maximum
+def maximum_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.maximum(inputs[0], inputs[1])]
+
+register_compute("maximum_compute", maximum_compute)
+register_schedule("maximum_compute", schedule_injective)
+
+# minimum
+def minimum_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.minimum(inputs[0], inputs[1])]
+
+register_compute("minimum", minimum_compute)
+register_schedule("minimum", schedule_injective)
+
+# right shift
+def right_shift_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.right_shift(inputs[0], inputs[1])]
+
+register_compute("right_shift", right_shift_compute)
+register_schedule("right_shift", schedule_injective)
+
+# lift shift
+def left_shift_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 2
+    return [topi.left_shift(inputs[0], inputs[1])]
+
+register_compute("left_shift", left_shift_compute)
+register_schedule("left_shift", schedule_injective)
+
+# zeros
+def zeros_compute(attrs, inputs, output_type, target):
+    assert not inputs
+    return [topi.full(output_type.shape, output_type.dtype, 0.0)]
+
+register_compute("zeros", zeros_compute)
+register_schedule("zeros", schedule_injective)
+
+# zeros_like
+def zeros_like_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.full_like(inputs[0], 0.0)]
+
+register_compute("zeros_like", zeros_like_compute)
+register_schedule("zeros_like", schedule_injective)
+
+# ones
+def ones_compute(attrs, inputs, output_type, target):
+    assert not inputs
+    return [topi.full(output_type.shape, output_type.dtype, 1.0)]
+
+register_compute("ones", ones_compute)
+register_schedule("ones", schedule_injective)
+
+# ones_like
+def ones_like(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.full_like(inputs[0], 1.0)]
+
+register_compute("ones_like", ones_like)
+register_schedule("ones_like", schedule_injective)
+
+# clip
+def clip_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)]
+
 
-register("equal", "FTVMCompute", equal_compute)
-register("equal", "FTVMSchedule", equal_schedule)
+register_compute("clip", clip_compute)
+register_schedule("clip", schedule_injective)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index 0c09f39a3c83..91523f65f6b7 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -74,6 +74,11 @@ def _register(v):
         return v
     return _register(value) if value else _register
 
+def register_schedule(op_name, schedule):
+    register(op_name, "FTVMSchedule", schedule)
+
+def register_compute(op_name, compute):
+    register(op_name, "FTVMCompute", compute)
 
 _init_api("relay.op", __name__)
 
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 3c432b58092d..2505da8f1dfd 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -213,9 +213,8 @@ def add(lhs, rhs):
     """
     return _make.add(lhs, rhs)
 
-
-def multiply(lhs, rhs):
-    """Multiplication with numpy-style broadcasting.
+def subtract(lhs, rhs):
+    """Subtraction with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -229,11 +228,10 @@ def multiply(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.multiply(lhs, rhs)
-
+    return _make.subtract(lhs, rhs)
 
-def divide(lhs, rhs):
-    """Division with numpy-style broadcasting.
+def multiply(lhs, rhs):
+    """Multiplication with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -247,11 +245,11 @@ def divide(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.divide(lhs, rhs)
+    return _make.multiply(lhs, rhs)
 
 
-def pow(lhs, rhs):
-    """Power with numpy-style broadcasting.
+def divide(lhs, rhs):
+    """Division with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -265,11 +263,11 @@ def pow(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.pow(lhs, rhs)
+    return _make.divide(lhs, rhs)
 
 
-def mod(lhs, rhs):
-    """Mod with numpy-style broadcasting.
+def pow(lhs, rhs):
+    """Power with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -283,11 +281,11 @@ def mod(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.mod(lhs, rhs)
+    return _make.pow(lhs, rhs)
 
 
-def subtract(lhs, rhs):
-    """Subtraction with numpy-style broadcasting.
+def mod(lhs, rhs):
+    """Mod with numpy-style broadcasting.
 
     Parameters
     ----------
@@ -301,7 +299,7 @@ def subtract(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.subtract(lhs, rhs)
+    return _make.mod(lhs, rhs)
 
 
 def equal(lhs, rhs):
@@ -553,7 +551,6 @@ def ones_like(data):
     """
     return _make.ones_like(data)
 
-
 def clip(a, a_min, a_max):
     """Clip the elements in `a` between `a_min` and `a_max`.
     `a_min` and `a_max` are cast to `a`'s dtype.
diff --git a/src/relay/pass/lower_ops.cc b/src/relay/pass/lower_ops.cc
index f2c8ceba866d..55102fe5cf67 100644
--- a/src/relay/pass/lower_ops.cc
+++ b/src/relay/pass/lower_ops.cc
@@ -8,6 +8,7 @@
  */
 #include <tvm/lowered_func.h>
 #include <tvm/operation.h>
+#include <tvm/build_module.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/logging.h>
 #include <tvm/relay/pass.h>
@@ -155,8 +156,8 @@ struct LiveFunctions : ExprVisitor {
 };
 
 using FCompute = TypedPackedFunc<Array<Tensor>(
-    const Attrs&, const Array<Tensor>&, Type, std::string)>;
-using FSchedule = TypedPackedFunc<Schedule(const Array<Tensor>&, std::string)>;
+    const Attrs&, const Array<Tensor>&, Type, tvm::Target)>;
+using FSchedule = TypedPackedFunc<Schedule(const Array<Tensor>&, tvm::Target)>;
 
 /*! \brief Return the set of operators in their TVM format. */
 Array<LoweredOp> LowerOps(const Module& mod, const Expr& e,
@@ -179,7 +180,7 @@ Array<LoweredOp> LowerOps(const Module& mod, const Expr& e,
     auto func = mod->Lookup(func_name);
     auto call = Downcast<Call>(func->body);
     auto op_node = call->op.as<OpNode>();
-    CHECK(op_node) << "violated invariant that primtiive calls contain a single op call";
+    CHECK(op_node) << "violated invariant that primtive calls contain a single op call";
     auto op = GetRef<Op>(op_node);
     RELAY_LOG(INFO) << "LowerOps: Lowering " << op->name;
 
@@ -197,10 +198,11 @@ Array<LoweredOp> LowerOps(const Module& mod, const Expr& e,
       i++;
     }
 
-    auto output_tt = op->op_type->ret_type;
+    auto output_tt = call->checked_type();
+    auto target_node = Target::create(target);
     Array<Tensor> outputs =
-        compute_reg[op](call->attrs, inputs, output_tt, target);
-    auto schedule = schedule_reg[op](outputs, target);
+        compute_reg[op](call->attrs, inputs, output_tt, target_node);
+    auto schedule = schedule_reg[op](outputs, target_node);
     size_t hash = StructuralHash()(func);
     LoweredFunc lf =
         flower(op->name + std::to_string(hash), schedule, inputs, outputs);
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index a622dfc2cbd4..7ab13409cc43 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -1,11 +1,23 @@
+import math
 import tvm
 import numpy as np
 from tvm import relay
+from tvm.relay.interpreter import create_executor
 
+def sigmoid(x):
+    one = np.ones_like(x)
+    return one / (one + np.exp(-x))
+
+def relu(x):
+    x_copy = np.copy(x)
+    np.maximum(x_copy, 0, x_copy)
+    return x_copy
 
 def test_unary_op():
-    def check_single_op(opfunc):
-        tp = relay.TensorType((10, 4), "float32")
+    def check_single_op(opfunc, ref):
+        shape = (10, 4)
+        dtype = 'float32'
+        tp = relay.TensorType(shape, dtype)
         x = relay.var("x", tp)
         y = opfunc(x)
         # test printer
@@ -13,20 +25,33 @@ def check_single_op(opfunc):
         # test type inference
         assert relay.ir_pass.infer_type(y).checked_type == tp
 
-    for opfunc in [tvm.relay.log,
-                   tvm.relay.exp,
-                   tvm.relay.sqrt,
-                   tvm.relay.sigmoid,
-                   tvm.relay.tanh,
-                   relay.nn.relu]:
-        check_single_op(opfunc)
+        if ref is not None:
+            data = np.random.rand(*shape).astype(dtype)
+            intrp = create_executor()
+            op_res = intrp.evaluate(y, { x: relay.const(data) })
+            ref_res = ref(data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    for opfunc, ref in [(tvm.relay.log, np.log),
+                   (tvm.relay.exp, np.exp),
+                   (tvm.relay.sqrt, np.sqrt),
+                   (tvm.relay.sigmoid, sigmoid),
+                   (tvm.relay.tanh, np.tanh),
+                   (relay.nn.relu, None)]: # Just add RELU here after registering.
+        check_single_op(opfunc, ref)
 
 
 def test_binary_op():
-    def check_binary_op(opfunc):
+    def inst(vars, sh):
+        return [vars.get(s, s) for s in sh]
+
+    def check_binary_op(opfunc, ref):
+        # TODO(@jroesch): this piece of code improperly uses type variables.
         n = tvm.var("n")
-        t1 = relay.TensorType((5, n, 5))
-        t2 = relay.TensorType((n, 1))
+        s1 = (5, n, 5)
+        s2 = (n, 1)
+        t1 = relay.TensorType(s1)
+        t2 = relay.TensorType(s2)
         x = relay.var("x", t1)
         y = relay.var("y", t2)
         z = opfunc(x, y)
@@ -34,12 +59,25 @@ def check_binary_op(opfunc):
         assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
         assert relay.ir_pass.infer_type(z).checked_type == t1
 
-    for opfunc in [relay.add,
-                   relay.subtract,
-                   relay.mod,
-                   relay.multiply,
-                   relay.divide]:
-        check_binary_op(opfunc)
+        if ref is not None:
+            t1 = relay.TensorType((5, 10, 5))
+            t2 = relay.TensorType((5, 10, 5))
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = opfunc(x, y)
+            x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
+            y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
+            intrp = create_executor()
+            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+            ref_res = ref(x_data, y_data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    for opfunc, ref in [(relay.add, np.add),
+                   (relay.subtract, np.subtract),
+                   (relay.mod, np.mod),
+                   (relay.multiply, np.multiply),
+                   (relay.divide, np.divide)]:
+        check_binary_op(opfunc, ref)
 
 
 def test_bias_add():
@@ -96,6 +134,15 @@ def test_concatenate_infer_type():
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.TensorType((n, t + t, 100))
 
+    # x = relay.var("x", shape=(10, 5))
+    # y = relay.var("y", shape=(10, 5))
+    # z = relay.concatenate((x, y), axis=1)
+    # intrp = create_executor()
+    # x_data = np.random.rand(10, 5).astype('float32')
+    # y_data = np.random.rand(10, 5).astype('float32')
+    # op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+    # ref_res = np.concatenate(x_data, y_data, axis=1)
+    # np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 def test_dropout():
     n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 6f06c8698e3f..26eccf991d0e 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -3,29 +3,40 @@
 import tvm
 import numpy as np
 from tvm import relay
+from tvm.relay import create_executor
 from nose.tools import raises
 
 def test_zeros_ones():
-    for op in [relay.zeros, relay.ones]:
+    for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
         y = op(shape=(124, 50), dtype="float64")
         yy = relay.ir_pass.infer_type(y)
         assert yy.checked_type == relay.TensorType((124, 50), "float64")
+        intrp = create_executor()
+        intrp_res = intrp.evaluate(y).asnumpy()
+        np.testing.assert_allclose(intrp_res, ref((124, 50), 'float64'))
 
 def test_unary_identity():
-    for op in [relay.zeros_like,
-               relay.ones_like,
-               relay.ceil,
-               relay.floor,
-               relay.trunc,
-               relay.round,
-               relay.abs,
-               relay.copy,
-               relay.negative]:
-        x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+    for op, ref in [(relay.zeros_like, np.zeros_like),
+               (relay.ones_like, np.ones_like),
+               (relay.ceil, np.ceil),
+               (relay.floor, np.floor),
+               (relay.trunc, np.trunc),
+               (relay.round, np.round),
+               (relay.abs, np.abs),
+               (relay.copy, None), # np.copy
+               (relay.negative, np.negative)]:
+        shape = (8, 9, 4)
+        x = relay.var("x", relay.TensorType(shape, "float32"))
         y = op(x)
         yy = relay.ir_pass.infer_type(y)
-        assert yy.checked_type == relay.TensorType((8, 9, 4), "float32")
+        assert yy.checked_type == relay.TensorType(shape, "float32")
 
+        if ref is not None:
+            data = np.random.rand(*shape).astype('float32')
+            intrp = create_executor()
+            op_res = intrp.evaluate(y, { x: relay.const(data) })
+            ref_res = ref(data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 def test_cast():
     x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
@@ -35,12 +46,20 @@ def test_cast():
     assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
 
 
-def test_clip_type():
+def test_clip():
     a = relay.var("a", relay.TensorType((10, 4), "float32"))
     y = relay.clip(a, 1., 4.)
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((10, 4), "float32")
 
+    data = np.random.rand(10, 4).astype('float32')
+    intrp = create_executor()
+    op_res = intrp.evaluate(y, { a: relay.const(data) })
+    ref_res = np.clip(data, 1., 4.)
+    np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+
+
 
 def test_transpose_infer_type():
     n, t, d = tvm.var("n"), tvm.var("t"), 100
@@ -226,7 +245,7 @@ def test_infer_type_prelu():
     test_cast()
     test_zeros_ones()
     test_unary_identity()
-    test_clip_type()
+    test_clip()
     test_transpose_infer_type()
     test_reshape_infer_type()
     test_reshape_like()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 2dc643cfd7e4..d20997010b4c 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -1,10 +1,11 @@
 import tvm
 import numpy as np
 from tvm import relay
+from tvm.relay import create_executor
 
 
 def test_binary_op():
-    def check_binary_op(opfunc):
+    def check_binary_op(opfunc, ref):
         n = tvm.var("n")
         t1 = relay.TensorType((5, n, 5))
         t2 = relay.TensorType((n, 1))
@@ -15,17 +16,30 @@ def check_binary_op(opfunc):
         assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
         assert relay.ir_pass.infer_type(z).checked_type == t1
 
-    for opfunc in [relay.pow]:
-        check_binary_op(opfunc)
+        if ref is not None:
+            t1 = relay.TensorType((5, 10, 5))
+            t2 = relay.TensorType((5, 10, 5))
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = opfunc(x, y)
+            x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
+            y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
+            intrp = create_executor()
+            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+            ref_res = ref(x_data, y_data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    for opfunc, ref in [(relay.pow, np.power)]:
+        check_binary_op(opfunc, ref)
 
 
 def test_cmp_type():
-    for op in (relay.greater,
-               relay.greater_equal,
-               relay.less,
-               relay.less_equal,
-               relay.equal,
-               relay.not_equal):
+    for op, ref in ((relay.greater, np.greater),
+               (relay.greater_equal, np.greater_equal),
+               (relay.less, np.less),
+               (relay.less_equal, np.less_equal),
+               (relay.equal, np.equal),
+               (relay.not_equal, np.not_equal)):
         x = relay.var("x", relay.TensorType((10, 4), "float32"))
         y = relay.var("y", relay.TensorType((5, 10, 1), "float32"))
         z = op(x, y)
@@ -33,18 +47,44 @@ def test_cmp_type():
         zz = relay.ir_pass.infer_type(z)
         assert zz.checked_type == relay.TensorType((5, 10, 4), "bool")
 
+        if ref is not None:
+            x_shape = (10, 4)
+            y_shape = (5, 10, 1)
+            t1 = relay.TensorType(x_shape)
+            t2 = relay.TensorType(y_shape)
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = op(x, y)
+            x_data = np.random.rand(*x_shape).astype(t1.dtype)
+            y_data = np.random.rand(*y_shape).astype(t2.dtype)
+            intrp = create_executor()
+            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+            ref_res = ref(x_data, y_data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
 
 def test_binary_int_broadcast():
-    for op in [relay.right_shift,
-               relay.left_shift,
-               relay.maximum,
-               relay.minimum]:
+    for op, ref in [(relay.right_shift, np.right_shift),
+               (relay.left_shift, np.left_shift),
+               (relay.maximum, np.maximum),
+               (relay.minimum, np.minimum)]:
         x = relay.var("x", relay.TensorType((10, 4), "int32"))
         y = relay.var("y", relay.TensorType((5, 10, 1), "int32"))
         z = op(x, y)
         zz = relay.ir_pass.infer_type(z)
         assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
 
+    if ref is not None:
+        x_shape = (10, 4)
+        y_shape = (5, 10, 1)
+        t1 = relay.TensorType(x_shape, 'int32')
+        t2 = relay.TensorType(y_shape, 'int32')
+        x_data = np.random.rand(*x_shape).astype(t1.dtype)
+        y_data = np.random.rand(*y_shape).astype(t2.dtype)
+        intrp = create_executor()
+        op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+        ref_res = ref(x_data, y_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 def test_where():
     cond = relay.var("cond", relay.TensorType((3, 4), "float32"))

From 8617947e0018214574ee2c980dd13e2ae933fda6 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 4 Nov 2018 22:02:27 -0800
Subject: [PATCH 329/529] [RELAY][BACKEND] CompileEngine refactor. (#2059)

---
 include/tvm/build_module.h                    |  84 ++--
 include/tvm/relay/build_module.h              |  76 ---
 include/tvm/relay/interpreter.h               |  16 +-
 include/tvm/relay/op_attr_types.h             |  79 ++++
 include/tvm/runtime/device_api.h              |  34 ++
 include/tvm/runtime/packed_func.h             |   1 +
 python/tvm/contrib/graph_runtime.py           |   2 +-
 python/tvm/relay/__init__.py                  |  12 +-
 python/tvm/relay/_interpreter.py              |   4 -
 python/tvm/relay/backend/__init__.py          |   2 +
 python/tvm/relay/backend/_backend.py          |  84 ++++
 python/tvm/relay/backend/compile_engine.py    | 142 ++++++
 .../relay/backend/graph_runtime_codegen.py    | 335 ++++++++++++++
 python/tvm/relay/backend/interpreter.py       | 182 ++++++++
 python/tvm/relay/build_module.py              | 266 +++++++++--
 python/tvm/relay/expr.py                      |  35 +-
 python/tvm/relay/graph_runtime_codegen.py     | 368 ---------------
 python/tvm/relay/interpreter.py               | 223 ---------
 python/tvm/relay/ir_pass.py                   |  20 +-
 python/tvm/relay/op/_tensor.py                |  97 ++--
 python/tvm/relay/op/op.py                     |  77 +++-
 python/tvm/relay/op/tensor.py                 |   4 +-
 python/tvm/relay/testing/__init__.py          |   1 +
 python/tvm/relay/testing/config.py            |  14 +
 src/codegen/build_module.cc                   |   6 +-
 src/relay/backend/compile_engine.cc           | 351 ++++++++++++++
 src/relay/backend/compile_engine.h            | 206 +++++++++
 src/relay/backend/interpreter.cc              | 426 +++++++++++++++++
 src/relay/interpreter.cc                      | 432 ------------------
 src/relay/ir/expr.cc                          |   6 +-
 src/relay/ir/op.cc                            |  50 +-
 src/relay/ir/text_printer.cc                  |   9 +-
 src/relay/op/op_common.h                      |   7 +-
 src/relay/op/tensor/binary.cc                 |   5 +-
 src/relay/pass/fuse_ops.cc                    |  69 ++-
 src/relay/pass/lower_ops.cc                   | 224 ---------
 src/runtime/c_runtime_api.cc                  |  21 -
 src/runtime/graph/graph_runtime.cc            |   4 +-
 .../relay/test_backend_compile_engine.py      |  38 ++
 ...ntime.py => test_backend_graph_runtime.py} |   6 +-
 ...rpreter.py => test_backend_interpreter.py} |  62 ++-
 tests/python/relay/test_op_level1.py          |  54 ++-
 tests/python/relay/test_op_level4.py          |  33 +-
 tests/python/relay/test_pass_fuse_ops.py      |  17 +
 topi/python/topi/transform.py                 |   9 +-
 45 files changed, 2551 insertions(+), 1642 deletions(-)
 delete mode 100644 include/tvm/relay/build_module.h
 create mode 100644 include/tvm/relay/op_attr_types.h
 delete mode 100644 python/tvm/relay/_interpreter.py
 create mode 100644 python/tvm/relay/backend/__init__.py
 create mode 100644 python/tvm/relay/backend/_backend.py
 create mode 100644 python/tvm/relay/backend/compile_engine.py
 create mode 100644 python/tvm/relay/backend/graph_runtime_codegen.py
 create mode 100644 python/tvm/relay/backend/interpreter.py
 delete mode 100644 python/tvm/relay/graph_runtime_codegen.py
 delete mode 100644 python/tvm/relay/interpreter.py
 create mode 100644 python/tvm/relay/testing/config.py
 create mode 100644 src/relay/backend/compile_engine.cc
 create mode 100644 src/relay/backend/compile_engine.h
 create mode 100644 src/relay/backend/interpreter.cc
 delete mode 100644 src/relay/interpreter.cc
 delete mode 100644 src/relay/pass/lower_ops.cc
 create mode 100644 tests/python/relay/test_backend_compile_engine.py
 rename tests/python/relay/{test_graph_runtime.py => test_backend_graph_runtime.py} (92%)
 rename tests/python/relay/{test_interpreter.py => test_backend_interpreter.py} (73%)
 create mode 100644 tests/python/relay/test_pass_fuse_ops.py

diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index ddd54f604a68..ba340166339b 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -14,7 +14,6 @@
 #include "lowered_func.h"
 
 namespace tvm {
-using namespace tvm::runtime;
 
 /*!
 * \brief Container for target device information.
@@ -40,7 +39,7 @@ class TargetNode : public Node {
   Array<Expr> libs_array;
 
   /*! \return the full device string to pass to codegen::Build */
-  EXPORT std::string str() const;
+  TVM_DLL const std::string& str() const;
 
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("target_name", &target_name);
@@ -54,16 +53,20 @@ class TargetNode : public Node {
   }
 
   /*! \brief Get the keys for this target as a vector of string */
-  EXPORT std::vector<std::string> keys() const;
+  TVM_DLL std::vector<std::string> keys() const;
 
   /*! \brief Get the options for this target as a vector of string */
-  EXPORT std::vector<std::string> options() const;
+  TVM_DLL std::vector<std::string> options() const;
 
   /*! \brief Get the keys for this target as an unordered_set of string */
-  EXPORT std::unordered_set<std::string> libs() const;
+  TVM_DLL std::unordered_set<std::string> libs() const;
 
   static constexpr const char* _type_key = "Target";
   TVM_DECLARE_NODE_TYPE_INFO(TargetNode, Node);
+
+ private:
+  /*! \brief Internal string repr. */
+  mutable std::string str_repr_;
 };
 
 class Target : public NodeRef {
@@ -75,20 +78,20 @@ class Target : public NodeRef {
   * \brief Create a Target given a string
   * \param target_str the string to parse
   */
-  EXPORT static Target create(const std::string& target_str);
+  TVM_DLL static Target create(const std::string& target_str);
 
   /*!
   * \brief Push a new target context onto the thread local stack. The Target on top of
   * the stack is used to determine which specialization to use when invoking a GenericFunc.
   * \param target The target to set as the current context.
   */
-  EXPORT static void EnterTargetScope(const tvm::Target& target);
+  TVM_DLL static void EnterTargetScope(const tvm::Target& target);
 
   /*!
   * \brief Pop a target off the thread local context stack, restoring the previous target
   * as the current context.
   */
-  EXPORT static void ExitTargetScope();
+  TVM_DLL static void ExitTargetScope();
 
   /*!
   * \brief Get the current target context from thread local storage.
@@ -98,7 +101,7 @@ class Target : public NodeRef {
   * \return The target that is the current context. The target may not be defined if
   * allow_not_defined is true.
   */
-  EXPORT static tvm::Target current_target(bool allow_not_defined = true);
+  TVM_DLL static tvm::Target current_target(bool allow_not_defined = true);
 
   inline const TargetNode* operator->() const {
       return static_cast<const TargetNode*>(node_.get());
@@ -130,39 +133,39 @@ struct TargetContext {
 /*! \brief This namespace provides functions to construct Target instances */
 namespace target {
 /*! \return A target for LLVM */
-EXPORT Target llvm(const std::vector<std::string>& options =
+TVM_DLL Target llvm(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for CUDA */
-EXPORT Target cuda(const std::vector<std::string>& options =
+TVM_DLL Target cuda(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for ROCm */
-EXPORT Target rocm(const std::vector<std::string>& options =
+TVM_DLL Target rocm(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for OpenCL */
-EXPORT Target opencl(const std::vector<std::string>& options =
+TVM_DLL Target opencl(const std::vector<std::string>& options =
                      std::vector<std::string>());
 
 /*! \return A target for Metal */
-EXPORT Target metal(const std::vector<std::string>& options =
+TVM_DLL Target metal(const std::vector<std::string>& options =
                     std::vector<std::string>());
 
 /*! \return A target for rasp */
-EXPORT Target rasp(const std::vector<std::string>& options =
+TVM_DLL Target rasp(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for Mali */
-EXPORT Target mali(const std::vector<std::string>& options =
+TVM_DLL Target mali(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for Intel Graphics */
-EXPORT Target intel_graphics(const std::vector<std::string>& options =
+TVM_DLL Target intel_graphics(const std::vector<std::string>& options =
                              std::vector<std::string>());
 
 /*! \return A target for stackvm */
-EXPORT Target stackvm(const std::vector<std::string>& options =
+TVM_DLL Target stackvm(const std::vector<std::string>& options =
                       std::vector<std::string>());
 
 }  // namespace target
@@ -212,7 +215,7 @@ class BuildConfigNode : public Node {
   bool partition_const_loop = false;
 
   /*! \brief Whether to dump the IR of each pass (only when building from python) */
-  std::vector< std::pair<int, PackedFunc> > add_lower_pass;
+  std::vector< std::pair<int, runtime::PackedFunc> > add_lower_pass;
 
   /*! \brief Whether to dump the IR of each pass (only when building from python) */
   bool dump_pass_ir = false;
@@ -255,20 +258,20 @@ class BuildConfig : public ::tvm::NodeRef {
    * \brief Push a new BuildConfig context onto the thread local stack.
    * \param build_config The configuration to set as the current context.
    */
-  EXPORT static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
+  TVM_DLL static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
 
   /*!
    * \brief Pop a build config off the thread local context stack, restoring the previous
    * configuration as the current context.
    */
-  EXPORT static void ExitBuildConfigScope();
+  TVM_DLL static void ExitBuildConfigScope();
 
   /*!
    * \brief Get the current BuildConfig context from thread local storage, or a default
    * configuration if a BuildConfig scope has not been entered.
    * \return The configuration that is the current context.
    */
-  EXPORT static tvm::BuildConfig Current();
+  TVM_DLL static tvm::BuildConfig Current();
 
   using ContainerType = BuildConfigNode;
 };
@@ -297,7 +300,7 @@ struct BuildConfigContext {
 * \brief Construct a BuildConfig containing a new BuildConfigNode
 * \return The new BuildConfig
 */
-EXPORT BuildConfig build_config();
+TVM_DLL BuildConfig build_config();
 
 /*!
 * \brief Build a LoweredFunc given a schedule, args and binds
@@ -308,11 +311,11 @@ EXPORT BuildConfig build_config();
 * \param config The build configuration.
 * \return The lowered function.
 */
-EXPORT Array<LoweredFunc> lower(Schedule sch,
-                                const Array<Tensor>& args,
-                                const std::string& name,
-                                const std::unordered_map<Tensor, Buffer>& binds,
-                                const BuildConfig& config);
+TVM_DLL Array<LoweredFunc> lower(Schedule sch,
+                                 const Array<Tensor>& args,
+                                 const std::string& name,
+                                 const std::unordered_map<Tensor, Buffer>& binds,
+                                 const BuildConfig& config);
 
 /*!
 * \brief Build a device and host module for a specific target from an array of lowered functions.
@@ -322,10 +325,10 @@ EXPORT Array<LoweredFunc> lower(Schedule sch,
 * \param config The build configuration.
 * \return The built module.
 */
-EXPORT runtime::Module build(const Array<LoweredFunc>& funcs,
-                             const Target& target,
-                             const Target& target_host,
-                             const BuildConfig& config);
+TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
+                              const Target& target,
+                              const Target& target_host,
+                              const BuildConfig& config);
 
 class GenericFuncNode;
 
@@ -344,7 +347,7 @@ class GenericFunc : public NodeRef {
    * false, an error will be logged if the call would override a previously registered function.
    * \return reference to self.
    */
-  TVM_DLL GenericFunc& set_default(const PackedFunc value,
+  TVM_DLL GenericFunc& set_default(const runtime::PackedFunc value,
                                    bool allow_override = false);
   /*!
    * \brief Register a specialized function
@@ -355,7 +358,7 @@ class GenericFunc : public NodeRef {
    * \return reference to self.
    */
   TVM_DLL GenericFunc& register_func(const std::vector<std::string>& tags,
-                                     const PackedFunc value,
+                                     const runtime::PackedFunc value,
                                      bool allow_override = false);
   /*!
    * \brief Call generic function by directly passing in unpacked format.
@@ -372,14 +375,15 @@ class GenericFunc : public NodeRef {
    * \endcode
    */
   template<typename... Args>
-  inline TVMRetValue operator()(Args&& ...args) const;
+  inline runtime::TVMRetValue operator()(Args&& ...args) const;
   /*!
    * \brief Invoke the relevant function for the current target context, set by set_target_context.
    * Arguments are passed in packed format.
    * \param args The arguments to pass to the function.
    * \param ret The return value
    */
-  TVM_DLL void CallPacked(TVMArgs args, TVMRetValue* ret) const;
+  TVM_DLL void CallPacked(runtime::TVMArgs args,
+                          runtime::TVMRetValue* ret) const;
 
   /*!
    * \brief Find or register the GenericFunc instance corresponding to the give name
@@ -412,14 +416,14 @@ class GenericFunc : public NodeRef {
 };
 
 template<typename... Args>
-inline TVMRetValue GenericFunc::operator()(Args&& ...args) const {
+inline runtime::TVMRetValue GenericFunc::operator()(Args&& ...args) const {
   const int kNumArgs = sizeof...(Args);
   const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
   TVMValue values[kArraySize];
   int type_codes[kArraySize];
   runtime::detail::for_each(TVMArgsSetter(values, type_codes),
     std::forward<Args>(args)...);
-  TVMRetValue rv;
+  runtime::TVMRetValue rv;
   CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv);
   return rv;
 }
@@ -432,9 +436,9 @@ class GenericFuncNode : public Node {
   /*! \brief name of the function */
   std::string name_;
   /* \brief the generic builder */
-  PackedFunc generic_func_;
+  runtime::PackedFunc generic_func_;
   /* \brief map from keys to registered functions */
-  std::unordered_map<std::string, PackedFunc> dispatch_dict_;
+  std::unordered_map<std::string, runtime::PackedFunc> dispatch_dict_;
 
   static constexpr const char* _type_key = "GenericFunc";
   TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node);
diff --git a/include/tvm/relay/build_module.h b/include/tvm/relay/build_module.h
deleted file mode 100644
index 35402d655507..000000000000
--- a/include/tvm/relay/build_module.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file tvm/relay/build_module.h
- * \brief The passes and data structures needed to build a
- * tvm::Module from a Relay program.
- */
-#ifndef TVM_RELAY_BUILD_MODULE_H_
-#define TVM_RELAY_BUILD_MODULE_H_
-
-#include <tvm/lowered_func.h>
-#include <tvm/relay/module.h>
-#include <tvm/relay/expr.h>
-#include <string>
-
-namespace tvm {
-namespace relay {
-
-/*! \brief A lowered Relay operation.
- *
- * A lowered operation is a pair containing the "primitive" function used
- * to produce the lowered function as well as the lowered function itself.
- */
-class LoweredOp;
-/*! \brief Call container. */
-class LoweredOpNode : public Node {
- public:
-  /*!
-   * \brief The primitive function to be lowered.
-   *
-   * A primitive function consists only of calls to relay::Op which
-   * can be fused.
-   */
-  Function func;
-
-  /*!
-   * \brief The lowered function.
-   */
-  LoweredFunc lowered_func;
-
-  void VisitAttrs(tvm::AttrVisitor* v) final {
-    v->Visit("func", &func);
-    v->Visit("lowered_func", &lowered_func);
-  }
-
-  TVM_DLL static LoweredOp make(
-      Function func,
-      LoweredFunc lowered_func);
-
-  static constexpr const char* _type_key = "relay.LoweredOp";
-  TVM_DECLARE_NODE_TYPE_INFO(LoweredOpNode, Node);
-};
-
-RELAY_DEFINE_NODE_REF(LoweredOp, LoweredOpNode, NodeRef);
-
-/*!
- * \brief Lower the operations contained in a Relay expression.
- *
- * The lowering pass will only lower functions marked as primitive,
- * the FuseOps pass will provide this behavior, if run before LowerOps.
- *
- * \note This will do a reachability analysis and lower all definitions
- * reachable from the provided expression.
- *
- * \param mod  The module.
- * \param expr The expression with operations to be lowered.
- * \param target The target to lower the functions to.
- *
- * \return The set of lowered operations.
- */
-Array<LoweredOp> LowerOps(const Module& mod, const Expr& expr,
-                          const std::string& target = "llvm");
-
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RELAY_BUILD_MODULE_H_
diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h
index 403dd50ad778..1099ef0f3cfd 100644
--- a/include/tvm/relay/interpreter.h
+++ b/include/tvm/relay/interpreter.h
@@ -16,6 +16,7 @@
 #ifndef TVM_RELAY_INTERPRETER_H_
 #define TVM_RELAY_INTERPRETER_H_
 
+#include <tvm/build_module.h>
 #include <tvm/relay/module.h>
 #include <tvm/relay/expr.h>
 
@@ -27,7 +28,9 @@ namespace relay {
  */
 class Value;
 
-/*! \brief Evaluate an expression using the interpreter producing a value.
+/*!
+ *\brief Create a Interpreter function that can
+ *  evaluate an expression and produce a value.
  *
  * The resulting value can be passed to Python, making it easy to use
  * for testing and debugging.
@@ -38,8 +41,14 @@ class Value;
  *
  * Our intent is that this will never be the most efficient implementation of
  * Relay's semantics, but a readable and clear one.
+ *
+ * \param mod The function module.
+ * \param context The primary context that the interepreter runs on.
+ * \param target Compiler target flag to compile the functions on the context.
+ * \return A function that takes in an expression and returns a value.
  */
-Value Evaluate(Module mod, Expr e);
+runtime::TypedPackedFunc<Value(Expr)>
+CreateInterpreter(Module mod, DLContext context, Target target);
 
 /*! \brief The base container type of Relay values. */
 class ValueNode : public RelayNode {
@@ -125,9 +134,6 @@ struct TensorValueNode : ValueNode {
   /*! \brief Build a value from an NDArray. */
   TVM_DLL static TensorValue make(runtime::NDArray data);
 
-  /*! \brief Construct an empty tensor value from t. */
-  TVM_DLL static TensorValue FromType(const Type& t);
-
   static constexpr const char* _type_key = "relay.TensorValue";
   TVM_DECLARE_NODE_TYPE_INFO(TensorValueNode, ValueNode);
 };
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
new file mode 100644
index 000000000000..941b32e9d33a
--- /dev/null
+++ b/include/tvm/relay/op_attr_types.h
@@ -0,0 +1,79 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file nnvm/compiler/op_attr_types.h
+ * \brief The Expr and related elements in DataFlow construction.
+ */
+#ifndef TVM_RELAY_OP_ATTR_TYPES_H_
+#define TVM_RELAY_OP_ATTR_TYPES_H_
+
+#include <tvm/tensor.h>
+#include <tvm/schedule.h>
+#include <tvm/build_module.h>
+#include <tvm/relay/type.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief operator pattern used in graph fusion */
+enum OpPatternKind {
+  // Elementwise operation
+  kElemWise = 0,
+  // Broadcasting operator, can always map output axis to the input in order.
+  // for example :code:`out[i, ax1, j, ax2] = input[i, j]`.
+  // Note that the axis need to be in order so transpose is not a bcast operator.
+  kBroadcast = 1,
+  // Injective operator, can always injectively map output axis to a single input axis.
+  // All injective operator can still be safely fused to injective and reduction.
+  kInjective = 2,
+  // Communicative reduction operator.
+  kCommReduce = 3,
+  // Complex operation, can still fuse elemwise operations into its output.
+  // but cannot chain another complex op
+  kOutEWiseFusable = 4,
+  // Opaque operation, cannot fuse anything.
+  kOpaque = 8
+};
+
+/*! \brief the operator pattern */
+using TOpPattern = int;
+
+/*!
+ * \brief Computation description interface.
+ *
+ * \note This function have a special convention
+ *  for functions with tuple input/output.
+ *
+ *  So far we restrict tuple support to the following case:
+ *  - Function which takes a single tuple as input.
+ *  - Function which outputs a single tuple.
+ *
+ *  In both cases, the tuple is flattened as array.
+ *
+ * \param attrs The attribute of the primitive
+ * \param inputs The input tensors.
+ * \param out_type The output type information
+ &                 these are always placeholders.
+ * \return The output compute description of the operator.
+ */
+using FTVMCompute = runtime::TypedPackedFunc<
+  Array<Tensor>(const Attrs& attrs,
+                const Array<Tensor>& inputs,
+                const Type& out_type,
+                const Target& target)>;
+
+/*!
+ * \brief Build the computation schedule for
+ *  op whose root is at current op.
+ *
+ * \param attrs The attribute of the node.
+ * \param outs The output tensors.
+ * \param target The build target.
+ * \return schedule The computation schedule.
+ */
+using FTVMSchedule = runtime::TypedPackedFunc<
+  Schedule(const Array<Tensor>& outs,
+           const Target& target)>;
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_ATTR_TYPES_H_
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 0b91deafd9c0..2a5ea83a4d2d 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -178,6 +178,40 @@ class DeviceAPI {
 
 /*! \brief The device type bigger than this is RPC device */
 constexpr int kRPCSessMask = 128;
+
+/*!
+ * \brief The name of Device API factory.
+ * \param type The device type.
+ * \return the device name.
+ */
+inline const char* DeviceName(int type) {
+  switch (type) {
+    case kDLCPU: return "cpu";
+    case kDLGPU: return "gpu";
+    case kDLOpenCL: return "opencl";
+    case kDLSDAccel: return "sdaccel";
+    case kDLAOCL: return "aocl";
+    case kDLVulkan: return "vulkan";
+    case kDLMetal: return "metal";
+    case kDLVPI: return "vpi";
+    case kDLROCM: return "rocm";
+    case kOpenGL: return "opengl";
+    case kDLExtDev: return "ext_dev";
+    default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
+  }
+}
+
+#ifndef _LIBCPP_SGX_NO_IOSTREAMS
+inline std::ostream& operator<<(std::ostream& os, DLContext ctx) {  // NOLINT(*)
+  int device_type = static_cast<int>(ctx.device_type);
+  if (device_type > kRPCSessMask) {
+    os << "remote[" << (device_type / kRPCSessMask) << "]-";
+    device_type = device_type % kRPCSessMask;
+  }
+  os << runtime::DeviceName(device_type) << "(" << ctx.device_id << ")";
+  return os;
+}
+#endif
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_DEVICE_API_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index c2098636f687..59ad52ccf3fd 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -888,6 +888,7 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
   }
   return os;
 }
+
 #endif
 
 inline std::string TVMType2String(TVMType t) {
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 383711477bb7..1ba402e20e7e 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -132,7 +132,7 @@ def set_input(self, key=None, value=None, **params):
         params : dict of str to NDArray
            Additonal arguments
         """
-        if key:
+        if key is not None:
             self._get_input(key).copyfrom(value)
 
         if params:
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index f474eb449c0c..19f3a55d491a 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -7,8 +7,7 @@
 from . import expr
 from . import module
 from . import ir_pass
-from .build_module import build
-from .interpreter import create_executor
+from .build_module import build, create_executor
 
 # Root operators
 from .op import Op
@@ -18,7 +17,7 @@
 from . import nn
 from . import vision
 from . import image
-
+from . import backend
 
 from .scope_builder import ScopeBuilder
 
@@ -56,13 +55,6 @@
 var = expr.var
 const = expr.const
 
-@register_func("relay._tensor_value_repr")
-def _tensor_value_repr(tv):
-    return str(tv.data.asnumpy())
-
-@register_func("relay._constant_repr")
-def _tensor_constant_repr(tv):
-    return str(tv.data.asnumpy())
 
 # pylint: disable=unused-argument
 @register_func("relay.debug")
diff --git a/python/tvm/relay/_interpreter.py b/python/tvm/relay/_interpreter.py
deleted file mode 100644
index d04319c17a99..000000000000
--- a/python/tvm/relay/_interpreter.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""The interface to the Evaluator exposed from C++."""
-from tvm._ffi.function import _init_api
-
-_init_api("relay._interpreter", __name__)
diff --git a/python/tvm/relay/backend/__init__.py b/python/tvm/relay/backend/__init__.py
new file mode 100644
index 000000000000..158989e9bf2f
--- /dev/null
+++ b/python/tvm/relay/backend/__init__.py
@@ -0,0 +1,2 @@
+"""Backend codege modules for relay."""
+from . import compile_engine
diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py
new file mode 100644
index 000000000000..b5454031cb4a
--- /dev/null
+++ b/python/tvm/relay/backend/_backend.py
@@ -0,0 +1,84 @@
+"""The interface of expr function exposed from C++."""
+from __future__ import absolute_import
+
+import logging
+from ... import build_module as _build
+from ... import container as _container
+from ..._ffi.function import _init_api, register_func
+
+
+@register_func("relay.backend.lower")
+def lower(sch, inputs, func_name, source_func):
+    """Backend function for lowering.
+
+    Parameters
+    ----------
+    sch : tvm.Schedule
+        The schedule.
+
+    inputs : List[tvm.Tensor]
+        The inputs to the function.
+
+    func_name : str
+        The name of the function.
+
+    source-func : tvm.relay.Function
+        The source function to be lowered.
+
+    Returns
+    -------
+    lowered_funcs : List[tvm.LoweredFunc]
+        The result of lowering.
+    """
+    import traceback
+    # pylint: disable=broad-except
+    try:
+        f = _build.lower(sch, inputs, name=func_name)
+        logging.debug("lower function %s", func_name)
+        logging.debug("%s", _build.lower(sch, inputs, simple_mode=True))
+    except Exception:
+        msg = traceback.format_exc()
+        msg += "Error during compile function\n"
+        msg += "-----------------------------\n"
+        msg += source_func.astext()
+        raise RuntimeError(msg)
+    return f if isinstance(
+        f, (_container.Array, tuple, list)) else [f]
+
+
+@register_func("relay.backend.build")
+def build(funcs, target, target_host=None):
+    """Backend build function.
+
+    Parameters
+    ----------
+    funcs : List[tvm.LoweredFunc]
+         The list of lowered functions.
+
+    target : tvm.Target
+         The target to run the code on.
+
+    target_host : tvm.Target
+         The host target.
+
+    Returns
+    -------
+    module : tvm.Module
+         The runtime module.
+    """
+    if target_host == "":
+        target_host = None
+    return _build.build(funcs, target=target, target_host=target_host)
+
+
+@register_func("relay._tensor_value_repr")
+def _tensor_value_repr(tvalue):
+    return str(tvalue.data.asnumpy())
+
+
+@register_func("relay._constant_repr")
+def _tensor_constant_repr(tvalue):
+    return str(tvalue.data.asnumpy())
+
+
+_init_api("relay.backend", __name__)
diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
new file mode 100644
index 000000000000..a02579e2ac7a
--- /dev/null
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -0,0 +1,142 @@
+"""Backend code generation engine."""
+from __future__ import absolute_import
+
+from ..base import register_relay_node, NodeBase
+from ... import target as _target
+from .. import expr as _expr
+from . import _backend
+
+@register_relay_node
+class CachedFunc(NodeBase):
+    """Low-level tensor function to back a relay primitive function.
+    """
+    pass
+
+
+@register_relay_node
+class CCacheKey(NodeBase):
+    """Key in the CompileEngine.
+
+    Parameters
+    ----------
+    source_func : tvm.relay.Function
+        The source function.
+
+    target : tvm.Target
+        The target we want to run the function on.
+    """
+    def __init__(self, source_func, target):
+        self.__init_handle_by_constructor__(
+            _backend._make_CCacheKey, source_func, target)
+
+
+@register_relay_node
+class CCacheValue(NodeBase):
+    """Value in the CompileEngine, including usage statistics.
+    """
+    pass
+
+
+def _get_cache_key(source_func, target):
+    if isinstance(source_func, _expr.Function):
+        if isinstance(target, str):
+            target = _target.create(target)
+            if not target:
+                raise ValueError("Need target when source_func is a Function")
+        return CCacheKey(source_func, target)
+    if not isinstance(source_func, CCacheKey):
+        raise TypeError("Expect source_func to be CCacheKey")
+    return source_func
+
+
+@register_relay_node
+class CompileEngine(NodeBase):
+    """CompileEngine to get lowered code.
+    """
+    def __init__(self):
+        raise RuntimeError("Cannot construct a CompileEngine")
+
+    def lower(self, source_func, target=None):
+        """Lower a source_func to a CachedFunc.
+
+        Parameters
+        ----------
+        source_func : Union[tvm.relay.Function, CCacheKey]
+            The source relay function.
+
+        target : tvm.Target
+            The target platform.
+
+        Returns
+        -------
+        cached_func: CachedFunc
+            The result of lowering.
+        """
+        key = _get_cache_key(source_func, target)
+        return _backend._CompileEngineLower(self, key)
+
+    def jit(self, source_func, target=None):
+        """JIT a source_func to a tvm.Function.
+
+        Parameters
+        ----------
+        source_func : Union[tvm.relay.Function, CCacheKey]
+            The source relay function.
+
+        target : tvm.Target
+            The target platform.
+
+        Returns
+        -------
+        cached_func: CachedFunc
+            The result of lowering.
+        """
+        key = _get_cache_key(source_func, target)
+        return _backend._CompileEngineJIT(self, key)
+
+    def clear(self):
+        """clear the existing cached functions"""
+        _backend._CompileEngineClear(self)
+
+    def items(self):
+        """List items in the cache.
+
+        Returns
+        -------
+        item_list : List[Tuple[CCacheKey, CCacheValue]]
+            The list of items.
+        """
+        res = _backend._CompileEngineListItems(self)
+        assert len(res) % 2 == 0
+        return [(res[2*i], res[2*i+1]) for i in range(len(res) // 2)]
+
+    def dump(self):
+        """Return a string representation of engine dump.
+
+        Returns
+        -------
+        dump : str
+            The dumped string representation
+        """
+        items = self.items()
+        res = "====================================\n"
+        res += "CompilerEngine dump, %d items cached\n" % len(items)
+        for k, v in items:
+            res += "------------------------------------\n"
+            res += "target={}\n".format(k.target)
+            res += "use_count={}\n".format(v.use_count)
+            res += "func_name={}\n".format(v.cached_func.func_name)
+            res += k.source_func.astext() + "\n"
+        res += "===================================\n"
+        return res
+
+
+def get():
+    """Get the global compile engine.
+
+    Returns
+    -------
+    engine : tvm.relay.backend.CompileEngine
+        The compile engine.
+    """
+    return _backend._CompileEngineGlobal()
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
new file mode 100644
index 000000000000..9bd03945c847
--- /dev/null
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -0,0 +1,335 @@
+"""
+A compiler from a Relay expression to TVM's graph runtime.
+
+The compiler is built from a few pieces.
+
+First we define a compiler from a single Relay expression to the
+graph langauge. We require the expression to be a function.
+The function's parameters correpond to the placeholder/inputs
+and model parameters found in the computation graph representation.
+The body of the function represents the computation graph.
+
+The compiler's output is a program in the graph language, which is composed of
+graph langauge is composed of Node, NodeRef, InputNode, OpNode.
+This "little language" represents programs in TVM's graph format.
+
+To connect to the graph runtime, we use a printer that converts our graph format
+into TVM's JSON format. The resulting string can be loaded by
+contrib.graph_runtime or any other TVM runtime comptatible system.
+"""
+
+from __future__ import absolute_import
+import json
+import attr
+from . import compile_engine
+from ..op import Op
+from ..expr import Function, GlobalVar, ExprFunctor
+from ..ty import TupleType, TensorType
+
+
+@attr.s
+class NodeRef(object):
+    """A reference to a node, used for constructing the graph."""
+    ident = attr.ib()
+    index = attr.ib(default=0)
+    version = attr.ib(default=0)
+
+    def to_json(self):
+        return [self.ident, self.index, self.version]
+
+
+@attr.s
+class Node(object):
+    """The base class for nodes in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+
+    def to_json(self):
+        raise Exception("Abstract method, please implement me.")
+
+
+@attr.s
+class InputNode(Node):
+    """An input node in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+
+    def to_json(self):
+        return {
+            "op": "null",
+            "name": self.name,
+            "inputs": []
+        }
+
+
+@attr.s
+class OpNode(Node):
+    """An operator node in the TVM runtime system"s graph input."""
+    op_name = attr.ib()
+    inputs = attr.ib()
+    op_attrs = attr.ib()
+    num_outputs = attr.ib(default=1)
+
+    def to_json(self):
+        attrs = dict.copy(self.op_attrs)
+        # Extend ops with extra info.
+        attrs["func_name"] = self.op_name
+        attrs["flatten_data"] = "0"
+        attrs["num_inputs"] = str(len(self.inputs))
+        attrs["num_outputs"] = str(self.num_outputs)
+
+        return {
+            "op": "tvm_op",
+            "name": self.name,
+            "attrs": attrs,
+            "inputs": self.inputs
+        }
+
+
+def shape_to_json(shape):
+    """Convert symbolic shape to json compatible forma."""
+    return [sh.value for sh in shape]
+
+
+class GraphRuntimeCodegen(ExprFunctor):
+    """The compiler from Relay to the TVM runtime system."""
+    nodes = attr.ib()
+    var_map = attr.ib()
+
+    def __init__(self, mod, target):
+        ExprFunctor.__init__(self)
+        self.mod = mod
+        self.target = target
+        self.nodes = []
+        self.var_map = {}
+        self.compile_engine = compile_engine.get()
+        self.lowered_funcs = set()
+        self._name_map = {}
+
+    def add_node(self, node, checked_type):
+        """
+        Add a node to the graph.
+
+        Parameters
+        ----------
+        node: Node
+            The node to add to the graph.
+
+        checked_type: Type
+            The type of the node.
+
+        Returns
+        -------
+        node_ref: Union[NodeRef, List[NodeRef]]
+            A reference to the node.
+        """
+        node_id = len(self.nodes)
+        self.nodes.append(node)
+        # Tuple return value, flatten as tuple
+        if isinstance(checked_type, TupleType):
+            ret = []
+            shape = []
+            dtype = []
+            for i, typ in enumerate(checked_type.fields):
+                if not isinstance(typ, TensorType):
+                    raise RuntimeError("type %s not supported" % typ)
+                ret.append(NodeRef(node_id, i))
+                shape.append(shape_to_json(typ.shape))
+                dtype.append(typ.dtype)
+            node.attrs["shape"] = shape
+            node.attrs["dtype"] = dtype
+            assert isinstance(node, OpNode)
+            node.num_outputs = len(checked_type.fields)
+            return tuple(ret)
+        # Normal tensor return type
+        if not isinstance(checked_type, TensorType):
+            raise RuntimeError("type %s not supported" % checked_type)
+        node.attrs["shape"] = [shape_to_json(checked_type.shape)]
+        node.attrs["dtype"] = [checked_type.dtype]
+        node.num_outputs = 1
+        return NodeRef(node_id, 0)
+
+    def visit_tuple(self, vtuple):
+        fields = []
+        for field in vtuple.fields:
+            ref = self.visit(field)
+            assert isinstance(ref, NodeRef)
+            fields.append(ref)
+        return tuple(fields)
+
+    def visit_tuple_getitem(self, op):
+        vtuple = self.visit(op.tuple_value)
+        assert isinstance(vtuple, tuple)
+        return vtuple[op.index]
+
+    def visit_constant(self, _):
+        raise RuntimeError("constant not supported")
+
+    def visit_function(self, _):
+        raise RuntimeError("function not supported")
+
+    def visit_if(self, _):
+        raise RuntimeError("if not supported")
+
+    def visit_global_var(self, _):
+        raise RuntimeError()
+
+    def visit_let(self, let):
+        """
+        Visit the let binding, by first traversing its value,
+        then setting the metadata on the returned NodeRef.
+
+        Finally visit the body, and return the NodeRef corresponding
+        to it.
+
+        Parameters
+        ----------
+        let: tvm.relay.Expr
+            The let binding to transform.
+
+        Returns
+        -------
+        ref: NodeRef
+            The node reference to the body.
+        """
+        assert let.var not in self.var_map
+        self.var_map[let.var] = self.visit(let.value)
+        return self.visit(let.body)
+
+    def visit_var(self, rvar):
+        return self.var_map[rvar]
+
+    def visit_call(self, call):
+        """Transform a ::tvm.relay.Call into an operator in the TVM graph."""
+        if isinstance(call.op, Op):
+            raise Exception(
+                "Operators should be transformed away; try applying" +
+                "the fuse_ops transformation to the expression.")
+        elif isinstance(call.op, GlobalVar):
+            func = self.mod[call.op]
+        elif isinstance(call.op, Function):
+            func = call.op
+        else:
+            raise Exception(
+                "TVM runtime does not support calls to {0}".format(type(call.op)))
+        if int(func.attrs.Primitive) != 1:
+            raise Exception(
+                "TVM only support calls to primitive functions " +
+                "(i.e functions composed of fusable operator invocations)")
+
+        cached_func = self.compile_engine.lower(func, self.target)
+        for loweredf in cached_func.funcs:
+            self.lowered_funcs.add(loweredf)
+
+        inputs = []
+        tuple_arg_count = 0
+        for arg in call.args:
+            if isinstance(arg.checked_type, TupleType):
+                tuple_arg_count += 1
+            inputs.append(self.visit(arg))
+        # We need to specially handle tuple inputs and
+        # tuple output cases.
+        # Tuple input function(e.g. concat)
+        if tuple_arg_count:
+            assert len(call.args) == 1
+            assert isinstance(inputs[0], tuple)
+            inputs = list(inputs[0])
+
+        inputs = [x.to_json() for x in inputs]
+        op_name = cached_func.func_name
+        op_node = OpNode(self._get_unique_name(op_name), {},
+                         op_name, inputs, {})
+        return self.add_node(op_node, call.checked_type)
+
+    def _get_json(self):
+        """
+        Convert the sequence of nodes stored by the compiler into the
+        TVM graph runtime format.
+
+        Returns
+        -------
+        graph_json : str
+            The generated JSON as a string.
+        """
+        nodes = []
+        # First we compute "nodes" field.
+        for node in self.nodes:
+            nodes.append(node.to_json())
+
+        arg_nodes = []
+        # Compute "arg_nodes" and "heads" fields.
+        for i, node in enumerate(self.nodes):
+            if isinstance(node, InputNode):
+                arg_nodes.append(i)
+
+        heads = self.heads
+        heads = heads if isinstance(heads, tuple) else [heads]
+        heads = [x.to_json() for x in heads]
+
+        # Compute "node_row_ptr" and entry attributes.
+        num_entry = 0
+        shapes = []
+        storage_ids = []
+        dltypes = []
+        node_row_ptr = [0]
+        for node in self.nodes:
+            assert node.num_outputs == len(node.attrs["shape"])
+            shapes += node.attrs["shape"]
+            dltypes += node.attrs["dtype"]
+            for i in range(node.num_outputs):
+                storage_ids.append(i + num_entry)
+            num_entry += node.num_outputs
+            node_row_ptr.append(num_entry)
+
+        # Compute "attrs" field.
+        attrs = {}
+        attrs["shape"] = ["list_shape", shapes]
+        attrs["storage_id"] = ["list_int", storage_ids]
+        attrs["dltype"] = ["list_str", dltypes]
+
+        json_dict = {
+            "nodes": nodes,
+            "arg_nodes": arg_nodes,
+            "heads": heads,
+            "attrs": attrs,
+            "node_row_ptr":  node_row_ptr
+        }
+
+        return json.dumps(json_dict, indent=2)
+
+    def codegen(self, func):
+        """Compile a single function into a graph.
+
+        Parameters
+        ----------
+        func: tvm.relay.Expr
+            The function to compile.
+
+        Returns
+        -------
+        graph_json : str
+            The graph json that can be consumed by runtime.
+
+        lowered_funcs : List[tvm.LoweredFunc]
+            The lowered functions.
+        """
+        # First we convert all the parameters into input nodes.
+        for param in func.params:
+            node = InputNode(param.name_hint, {})
+            self.var_map[param] = self.add_node(
+                node, param.type_annotation)
+
+        # Then we compile the body into a graph which can depend
+        # on input variables.
+        self.heads = self.visit(func.body)
+        graph_json = self._get_json()
+        lowered_funcs = list(self.lowered_funcs)
+        return graph_json, lowered_funcs
+
+    def _get_unique_name(self, name):
+        if name not in self._name_map:
+            self._name_map[name] = 1
+            return name
+        index = self._name_map[name]
+        self._name_map[name] += 1
+        return self.get_unique_name(name + str(index))
diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py
new file mode 100644
index 000000000000..5c7401c8c146
--- /dev/null
+++ b/python/tvm/relay/backend/interpreter.py
@@ -0,0 +1,182 @@
+#pylint: disable=no-else-return
+"""An interface to the Realy interpreter."""
+from __future__ import absolute_import
+
+import numpy as np
+
+from . import _backend
+from .. import _make, ir_pass
+from ... import register_func, nd
+from ..base import NodeBase, register_relay_node
+from ..expr import Call, Constant, GlobalVar, Function, const
+from ..scope_builder import ScopeBuilder
+
+class Value(NodeBase):
+    """Base class of all values.
+    """
+    @staticmethod
+    @register_func("relay.from_scalar")
+    def from_scalar(value, dtype=None):
+        """Convert a Python scalar to a Relay scalar."""
+        return TensorValue(const(value, dtype).data)
+
+
+@register_relay_node
+class TupleValue(Value):
+    def __init__(self, *fields):
+        self.__init_handle_by_constructor__(
+            _make.TupleValue, fields)
+
+    def __getitem__(self, field_no):
+        return self.fields[field_no]
+
+
+@register_relay_node
+class Closure(Value):
+    pass
+
+
+@register_relay_node
+class TensorValue(Value):
+    """A Tensor value produced by the evaluator."""
+
+    def __init__(self, data):
+        """Allocate a new TensorValue and copy the data from `array` into
+           the new array.
+        """
+        if isinstance(data, np.ndarray):
+            data = nd.array(data)
+
+        self.__init_handle_by_constructor__(
+            _make.TensorValue, data)
+
+    def asnumpy(self):
+        """Convert a Relay TensorValue into a numpy.ndarray."""
+        return self.data.asnumpy()
+
+    def __eq__(self, other):
+        return self.data == other.data
+
+
+def _arg_to_ast(arg):
+    if isinstance(arg, TensorValue):
+        return Constant(arg.data.copyto(_nd.cpu(0)))
+    elif isinstance(arg, np.ndarray):
+        return Constant(nd.array(arg))
+    elif isinstance(arg, Constant):
+        return arg
+    else:
+        return const(arg)
+
+
+class Executor(object):
+    """An abstract interface for executing Relay programs."""
+    def _make_executor(self, _):
+        """
+        Construct a Python function that implements the evaluation
+        of expression.
+
+        Parameters
+        ----------
+        expr: relay.Expr
+            The Relay expression to execute.
+
+        Returns
+        -------
+        executor: function,
+            A Python function which implements the behavior of `expr`.
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, expr, binds=None):
+        """
+        Evaluate a Relay expression on the executor.
+
+        Parameters
+        ----------
+        expr: tvm.relay.Expr
+            The expression to evaluate.
+
+        binds: Map[tvm.relay.Var, tvm.relay.Expr]
+            Additional binding of free variable.
+
+        Returns
+        -------
+        val : Union[function, Value]
+            The evaluation result.
+        """
+        if binds:
+            scope_builder = ScopeBuilder()
+            for key, value in binds.items():
+                scope_builder.let(key, _arg_to_ast(value))
+            scope_builder.ret(expr)
+            expr = scope_builder.get()
+
+        if isinstance(expr, Function):
+            assert not ir_pass.free_vars(expr)
+
+        if isinstance(expr, (Function, GlobalVar)):
+            return self._make_executor(expr)
+
+        # normal expression evaluated by running a function.
+        func = Function([], expr)
+        return self._make_executor(func)()
+
+
+class Interpreter(Executor):
+    """
+    Simple interpreter interface.
+
+    Parameters
+    ----------
+    mod : tvm.relay.Module
+        The module to support the execution.
+
+    ctx : tvm.TVMContext
+        The runtime context to run the code on.
+
+    target : tvm.Target
+        The target option to build the function.
+    """
+    def __init__(self, mod, ctx, target):
+        self.mod = mod
+        self.ctx = ctx
+        self.target = target
+        self._intrp = _backend.CreateInterpreter(mod, ctx, target)
+
+    def optimize(self, expr):
+        """Optimize an expr.
+
+        Parameters
+        ----------
+        expr : Expr
+            The expression to be optimized.
+
+        Returns
+        -------
+        opt_expr : Expr
+            The optimized expression.
+        """
+        # TODO: We need to move this optimization code into the optimizer/pass manager
+        ck_expr = ir_pass.infer_type(expr, mod=self.mod)
+        fused_expr = ir_pass.fuse_ops(ck_expr)
+        ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod)
+        return ck_fused
+
+    def _make_executor(self, expr):
+        def _interp_wrapper(*args):
+            relay_args = []
+            for arg in args:
+                relay_args.append(_arg_to_ast(arg))
+
+            if isinstance(expr, GlobalVar):
+                func = self.mod[expr]
+                func = self.optimize(func)
+                self.mod._add(expr, func, True)
+                opt_expr = Call(expr, relay_args)
+                return self._intrp(opt_expr)
+            else:
+                call = Call(expr, relay_args)
+                opt_expr = self.optimize(call)
+                return self._intrp(opt_expr)
+        return _interp_wrapper
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index e71571e63a99..c48ec90e9e12 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -2,45 +2,257 @@
 Construct the necessary state for the TVM graph runtime
 from a Relay expression.
 """
-from ..build_module import build as tvm_build_module
-from . graph_runtime_codegen import GraphRuntimeCodegen
+from ..build_module import build as _tvm_build_module
+from .. import nd as _nd, target as _target, autotvm
+from ..contrib import graph_runtime as _graph_rt
 from . import ir_pass
-from .module import Module
+from .backend import interpreter as _interpreter
+from .backend import graph_runtime_codegen as _graph_gen
 
-def build(func, params=None, target=None, mod=None):
+# List of optimization pass and level when switch on
+OPT_PASS_LEVEL = {
+    "SimplifyInference": 0,
+    "OpFusion": 1,
+    "FoldScaleAxis": 3,
+}
+
+class BuildConfig(object):
+    """Configuration scope to set a build config option.
+
+    Parameters
+    ----------
+    kwargs
+        Keyword arguments of configurations to set.
+    """
+    current = None
+    defaults = {
+        "opt_level": 2,
+        "add_pass": None,
+    }
+    def __init__(self, **kwargs):
+        self._old_scope = None
+        for k, _ in kwargs.items():
+            if k not in BuildConfig.defaults:
+                raise ValueError(
+                    "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys()))
+        self._attr = kwargs
+
+    def __getattr__(self, name):
+        if name not in self._attr:
+            return BuildConfig.defaults[name]
+        return self._attr[name]
+
+    def __enter__(self):
+        # pylint: disable=protected-access
+        self._old_scope = BuildConfig.current
+        attr = BuildConfig.current._attr.copy()
+        attr.update(self._attr)
+        self._attr = attr
+        BuildConfig.current = self
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_scope
+        BuildConfig.current = self._old_scope
+
+    def pass_enabled(self, pass_name):
+        """Get whether pass is enabled.
+
+        Parameters
+        ----------
+        pass_name : str
+            The optimization pass name
+
+        Returns
+        -------
+        enabled : bool
+            Whether pass is enabled.
+        """
+        if self.add_pass and pass_name in self.add_pass:
+            return True
+        return self.opt_level >= OPT_PASS_LEVEL[pass_name]
+
+
+BuildConfig.current = BuildConfig()
+
+
+def build_config(**kwargs):
+    """Configure the build behavior by setting config variables.
+
+    Parameters
+    ----------
+    opt_level: int, default=2
+        Optimization level. See OPT_PASS_LEVEL for level of each pass.
+
+    add_pass: set of str
+        Optimization pass to be added regardless of optimization level.
+
+    Returns
+    -------
+    config: BuildConfig
+        The build configuration
+    """
+    return BuildConfig(**kwargs)
+
+
+def optimize(func):
+    """Perform target invariant optimizations.
+
+    Parameters
+    ----------
+    func : tvm.relay.Function
+        The input to optimization.
+
+    Returns
+    -------
+    opt_func : tvm.relay.Function
+        The optimized version of the function.
     """
-    Compile a single function to the components needed by the
-    TVM RTS.
+    cfg = BuildConfig.current
+
+    if cfg.pass_enabled("FoldScaleAxis"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.simplify_inference(func)
+
+    if cfg.pass_enabled("FoldScaleAxis"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.backward_fold_scale_axis(func)
+        func = ir_pass.infer_type(func)
+        func = ir_pass.forward_fold_scale_axis(func)
+    return func
+
+
+def build(func,
+          target=None,
+          target_host=None,
+          params=None):
+    """Build a function to run on TVM graph runtime.
 
     Parameters
     ----------
-    func: relay.Expr
+    func: relay.Function
         The function to build.
 
-    target: optional str
-        The target platform.
+    target : str or :any:`tvm.target.Target`, optional
+        The build target
+
+    target_host : str or :any:`tvm.target.Target` optional
+        Host compilation target, if target is device.
+        When TVM compiles device specific program such as CUDA,
+        we also need host(CPU) side code to interact with the driver
+        setup the dimensions and parameters correctly.
+        target_host is used to specify the host side codegen target.
+        By default, llvm is used if it is enabled,
+        otherwise a stackvm intepreter is used.
+
+    params : dict of str to NDArray
+        Input parameters to the graph that do not change
+        during inference time. Used for pre-compute
+        folding optimization.
 
     Returns
     -------
-    (graph_json, mod, params): tuple of (str, tvm.Module, dict)
-        The outputs of building a Relay function for the TVM runtime.
+    graph_json : str
+        The json string that can be accepted by graph runtime.
 
+    mod : tvm.Module
+        The module containing necessary libraries.
+
+    params : dict
+        The parameters of the final graph.
     """
+    target = target if target else _target.current_target()
     if target is None:
-        target = 'llvm'
-
-    if mod is None:
-        mod = Module({})
-
-    comp = GraphRuntimeCodegen(mod)
-    # NB(@jroesch) This creates lowered functions, and generates names for them
-    #
-    # We need these names to emit the correct graph as these are names of the
-    # functions contained in the module.
-    lowered_ops = ir_pass.lower_ops(mod, func)
-    mod = tvm_build_module([lf.lowered_func for lf in lowered_ops], target)
-
-    # Therefore the call to compile must come after.
-    comp.codegen(func)
-    graph_json = comp.to_json()
+        raise ValueError("Target is not set in env or passed as argument.")
+    target = _target.create(target)
+
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
+    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
+        tophub_context = autotvm.tophub.context(target)
+    else:
+        tophub_context = autotvm.util.EmptyContext()
+
+    with tophub_context:
+        func = optimize(func)
+        # Fuse ops before running code gen
+        func = ir_pass.infer_type(func)
+        func = ir_pass.fuse_ops(func)
+        # Graph code generation
+        func = ir_pass.infer_type(func)
+        graph_gen = _graph_gen.GraphRuntimeCodegen(mod=None, target=target)
+        graph_json, lowered_funcs = graph_gen.codegen(func)
+        mod = _tvm_build_module(lowered_funcs, target=target, target_host=target_host)
     return graph_json, mod, params
+
+
+class GraphExecutor(_interpreter.Executor):
+    """Wrapper around Executor interface.
+
+    This executor is used for debug and testing purpoes.
+
+    Parameters
+    ----------
+    mod : tvm.relay.Module
+        The module to support the execution.
+
+    ctx : tvm.TVMContext
+        The runtime context to run the code on.
+
+    target : tvm.Target
+        The target option to build the function.
+    """
+    def __init__(self, mod, ctx, target):
+        self.mod = mod
+        self.ctx = ctx
+        self.target = target
+
+    def _make_executor(self, func):
+        def _graph_wrapper(*args):
+            graph_json, mod, params = build(func, target=self.target)
+            assert params is None
+            gmodule = _graph_rt.create(graph_json, mod, self.ctx)
+            # Create map of inputs.
+            for i, arg in enumerate(args):
+                gmodule.set_input(i, arg)
+            # Run the module, and fetch the output.
+            gmodule.run()
+            return gmodule.get_output(0)
+
+        return _graph_wrapper
+
+
+
+def create_executor(kind="debug",
+                    mod=None,
+                    ctx=None,
+                    target="llvm"):
+    """Factory function to create an executor.
+
+    Parameters
+    ----------
+    kind : str
+        The type of executor
+
+    mod : relay.Mod
+        The mod
+
+    ctx : tvm.TVMContext
+        The context to execute the code.
+
+    target : tvm.Target
+        The corresponding context
+    """
+    if ctx is not None:
+        assert ctx.device_type == _nd.context(str(target), 0).device_type
+    else:
+        ctx = _nd.context(str(target), 0)
+
+    if isinstance(target, str):
+        target = _target.create(target)
+    if kind == "debug":
+        return _interpreter.Interpreter(mod, ctx, target)
+    elif kind == "graph":
+        return GraphExecutor(mod, ctx, target)
+    else:
+        raise RuntimeError("unknown mode {0}".format(mode))
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index d789f281d25a..43cff0bac57a 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -319,12 +319,11 @@ def __init__(self, tuple_value, index):
         self.__init_handle_by_constructor__(
             _make.TupleGetItem, tuple_value, index)
 
+
 class ExprFunctor(object):
     """
     An abstract visitor defined over Expr.
 
-    A Python version of the class defined in `expr_functor.h`.
-
     Defines the default dispatch over expressions, and
     implements memoization.
     """
@@ -352,6 +351,8 @@ def visit(self, expr):
             res = self.visit_if(expr)
         elif isinstance(expr, Tuple):
             res = self.visit_tuple(expr)
+        elif isinstance(expr, TupleGetItem):
+            res = self.visit_tuple_getitem(expr)
         elif isinstance(expr, Constant):
             res = self.visit_constant(expr)
         else:
@@ -361,31 +362,34 @@ def visit(self, expr):
         return res
 
     def visit_function(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_let(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_call(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_var(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_type(self, typ):
         return typ
 
     def visit_if(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_tuple(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
+
+    def visit_tuple_getitem(self, _):
+        raise NotImplementedError()
 
     def visit_constant(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
     def visit_global_var(self, _):
-        raise Exception("Abstract method please implement me.")
+        raise NotImplementedError()
 
 
 class ExprMutator(ExprFunctor):
@@ -395,7 +399,6 @@ class ExprMutator(ExprFunctor):
     The default behavior recursively traverses the AST
     and reconstructs the AST.
     """
-
     def visit_function(self, fn):
         new_body = self.visit(fn.body)
         return Function(
@@ -429,9 +432,19 @@ def visit_if(self, ite):
     def visit_tuple(self, tup):
         return Tuple([self.visit(field) for field in tup.fields])
 
+    def visit_tuple_getitem(self, op):
+        tuple_value = self.visit(op.tuple_value)
+        if not tuple_value.same_as(op.tuple_value):
+            return TupleGetItem(tuple_value, op.index)
+        return op
+
+    def visit_global_var(self, gvar):
+        return gvar
+
     def visit_constant(self, rconst):
         return rconst
 
+
 class TupleWrapper(object):
     """TupleWrapper.
 
diff --git a/python/tvm/relay/graph_runtime_codegen.py b/python/tvm/relay/graph_runtime_codegen.py
deleted file mode 100644
index 3fd408a58f0d..000000000000
--- a/python/tvm/relay/graph_runtime_codegen.py
+++ /dev/null
@@ -1,368 +0,0 @@
-"""
-A compiler from a Relay expression to TVM's graph runtime.
-
-The compiler is built from a few pieces.
-
-First we define a compiler from a single Relay expression to the
-graph langauge. We require the expression to be a function.
-The function's parameters correpond to the placeholder/inputs
-and model parameters found in the computation graph representation.
-The body of the function represents the computation graph.
-
-The compiler's output is a program in the graph language, which is composed of
-graph langauge is composed of Node, NodeRef, InputNode, OpNode.
-This "little language" represents programs in TVM's graph format.
-
-To connect to the graph runtime, we use a printer that converts our graph format
-into TVM's JSON format. The resulting string can be loaded by
-contrib.graph_runtime or any other TVM runtime comptatible system.
-
-We expose this functionality in compile_to_tvm.
-"""
-
-from __future__ import absolute_import
-import json
-import attr
-from . import ir_pass
-from .op import Op
-from .expr import Function, GlobalVar, ExprMutator
-
-
-@attr.s
-class NodeRef(object):
-    """A reference to a node, used for constructing the graph."""
-    ident = attr.ib()
-    index = attr.ib(default=0)
-    version = attr.ib(default=0)
-
-    def to_json(self):
-        return [self.ident, self.index, self.version]
-
-
-@attr.s
-class Node(object):
-    """The base class for nodes in the TVM runtime system graph input."""
-    name = attr.ib()
-    attrs = attr.ib()
-    is_output = attr.ib()
-
-    def to_json(self):
-        raise Exception("Abstract method, please implement me.")
-
-
-@attr.s
-class InputNode(Node):
-    """An input node in the TVM runtime system graph input."""
-    name = attr.ib()
-    attrs = attr.ib()
-    is_output = attr.ib(default=False)
-
-    def to_json(self):
-        return {
-            "op": "null",
-            "name": self.name,
-            "inputs": []
-        }
-
-
-@attr.s
-class OpNode(Node):
-    """An operator node in the TVM runtime system's graph input."""
-    op_name = attr.ib()
-    inputs = attr.ib()
-    op_attrs = attr.ib()
-    is_output = attr.ib(default=False)
-
-    def to_json(self):
-        attrs = dict.copy(self.op_attrs)
-        # Extend ops with extra info.
-        attrs['func_name'] = self.op_name
-        # When do we flatten?
-        attrs['flatten_data'] = "0"
-        # Fix me!
-        attrs['num_inputs'] = str(len(self.inputs))
-        attrs['num_outputs'] = "1"
-
-        return {
-            "op": "tvm_op",
-            "name": self.name,
-            "attrs": attrs,
-            "inputs": self.inputs
-        }
-
-
-def shape_to_json(shape):
-    return [sh.value for sh in shape]
-
-
-def from_tensor(typ):
-    return (typ.dtype, shape_to_json(typ.shape))
-
-
-class GraphRuntimeCodegen(ExprMutator):
-    """The compiler from Relay to the TVM runtime system."""
-    nodes = attr.ib()
-    id_map = attr.ib()
-
-    def __init__(self, env):
-        ExprMutator.__init__(self)
-        self.nodes = []
-        self.id_map = {}
-        self.env = env
-
-    def add_node(self, node):
-        """
-        Add a node to the graph.
-
-        Parameters
-        ----------
-        node: Node
-            The node to add to the graph.
-
-        Returns
-        -------
-        node_ref: NodeRef
-            A reference to the node.
-
-        """
-        self.nodes.append(node)
-        ident = len(self.nodes) - 1
-        return NodeRef(ident)
-
-    def add_binding(self, ident, ref):
-        """
-        Add a identifier to node mapping.
-
-        Parameters
-        ----------
-        ident: relay.Var
-            The variable to map
-
-        ref: NodeRef
-            The node the identifier points.
-        """
-        self.id_map[ident] = ref
-
-    def let_bind(self, ident, node):
-        """
-        Let bind node to ident.
-
-        Parameters
-        ----------
-        ident: relay.Var
-            The variable to map.
-
-        ref: NodeRef
-            The node the identifier points.
-
-        Returns
-        -------
-        ref: NodeRef
-            Return reference to the node.
-        """
-        ref = self.add_node(node)
-        self.add_binding(ident, ref)
-        return ref
-
-    def get_node(self, ref):
-        """
-        Lookup a node by a node reference.
-
-        Parameters
-        ----------
-        ref: NodeRef
-            The reference to lookup.
-
-        Returns
-        -------
-        node: Node
-            The node.
-        """
-        return self.nodes[ref.ident]
-
-    def lookup(self, ident):
-        """
-        Lookup a node by identifier.
-
-        Parameters
-        ----------
-        ident: relay.Var
-            The reference to lookup.
-
-        Returns
-        -------
-        node: Node
-            The node.
-        """
-        return self.id_map[ident]
-
-    def codegen(self, func):
-        """Compile a single function into a graph.
-
-        Parameters
-        ----------
-        func: tvm.relay.Expr
-            The function to compile.
-        """
-        # First we convert all the parameters into input nodes.
-        params = func.params
-
-        for param in params:
-            dtype, shape = from_tensor(param.type_annotation)
-            node = InputNode("{0}".format(param.name_hint), {
-                "shape": shape,
-                "dtype": dtype,
-            })
-            self.let_bind(param, node)
-
-        # Then we compile the body into a graph which can depend
-        # on input variables.
-        output_ref = self.visit(func.body)
-
-        # Finally we retreive return value of program, which will
-        # become our output node.
-        self.get_node(output_ref).is_output = True
-
-    def visit_let(self, let):
-        """
-        Visit the let binding, by first traversing its value,
-        then setting the metadata on the returned NodeRef.
-
-        Finally visit the body, and return the NodeRef corresponding
-        to it.
-
-        Parameters
-        ----------
-        let: tvm.relay.Expr
-            The let binding to transform.
-
-        Returns
-        -------
-        ref: NodeRef
-            The node reference to the body.
-        """
-        ident = let.var
-        val = let.value
-        body = let.body
-
-        val_ref = self.visit(val)
-        dtype, shape = from_tensor(val.checked_type())
-        val_node = self.get_node(val_ref)
-        val_node.attrs["dtype"] = dtype
-        val_node.attrs["shape"] = shape
-        self.add_binding(ident, val_ref)
-        return self.visit(body)
-
-    def visit_var(self, rvar):
-        return self.lookup(rvar)
-
-    def visit_call(self, call):
-        """Transform a ::tvm.relay.Call into an operator in the TVM graph."""
-        inputs = []
-        for arg in call.args:
-            inputs.append(self.visit(arg).to_json())
-
-        if isinstance(call.op, Op):
-            raise Exception(
-                "Operators should be transformed away; try applying" +
-                "the fuse_ops transformation to the expression.")
-        elif isinstance(call.op, GlobalVar):
-            func = self.env[call.op]
-        elif isinstance(call.op, Function):
-            func = call.op
-        else:
-            raise Exception(
-                "TVM runtime does not support calls to {0}".format(type(call.op)))
-
-        if int(func.attrs.Primitive) != 1:
-            raise Exception(
-                "TVM only support calls to primitive functions " +
-                "(i.e functions composed of fusable operator invocations)")
-
-        op_name = func.attrs.LoweredFunc.name
-
-        attrs = {'shape': shape_to_json(call.checked_type.shape),
-                 'dtype': call.checked_type.dtype}
-        call_hash = str(ir_pass.structural_hash(call))
-        op_node = OpNode("call_" + call_hash, attrs, op_name, inputs, {})
-        return self.add_node(op_node)
-
-    def to_json(self):
-        """
-        Convert the sequence of nodes stored by the compiler into the
-        TVM graph runtime format.
-
-        Returns
-        -------
-        graph_json : str
-            The generated JSON as a string.
-        """
-        nodes = []
-        # First we compute "nodes" field.
-        for node in self.nodes:
-            nodes.append(node.to_json())
-
-        arg_nodes = []
-        heads = []
-        # Compute "arg_nodes" and "heads" fields.
-        for i, node in enumerate(self.nodes):
-            if isinstance(node, InputNode):
-                arg_nodes.append(i)
-
-            if node.is_output:
-                # Need to fix this.
-                heads.append(NodeRef(i).to_json())
-
-        def compute_node_row_ptr(nodes):
-            """Calculate the node_row_ptr field by doing a DFS backwards
-               from the output and reversing the path.
-            """
-            row_ptr = [len(nodes)]
-            discovered = set()
-            stack = []
-            stack.append(len(nodes) - 1)
-            while stack:
-                i = stack.pop()
-                if i not in discovered:
-                    discovered.add(i)
-                    row_ptr.append(i)
-                    node = nodes[i]
-                    if isinstance(node, OpNode):
-                        for inp in node.inputs:
-                            stack.append(inp[0])
-            row_ptr.reverse()
-            return row_ptr
-
-        # Compute "node_row_ptr".
-        node_row_ptr = compute_node_row_ptr(self.nodes)
-
-        # Compute "attrs" field.
-        attrs = {}
-
-        # These fields are mandatory.
-        shapes = []
-        storage_ids = []
-        dtype = []
-        dltype = []
-
-        for i, node in enumerate(self.nodes):
-            storage_ids.append(i)
-            shapes.append(node.attrs['shape'])
-            if node.attrs['dtype'] == 'float32':
-                dtype.append(0)
-                dltype.append('float32')
-
-        attrs["shape"] = ["list_shape", shapes]
-        attrs["storage_id"] = ["list_int", storage_ids]
-        attrs["dtype"] = ["list_int", dtype]
-        attrs["dltype"] = ["list_str", dltype]
-
-        json_dict = {
-            "nodes": nodes,
-            "arg_nodes": arg_nodes,
-            "heads": heads,
-            "attrs": attrs,
-            "node_row_ptr":  node_row_ptr
-        }
-
-        return json.dumps(json_dict)
diff --git a/python/tvm/relay/interpreter.py b/python/tvm/relay/interpreter.py
deleted file mode 100644
index bd8ef0d14415..000000000000
--- a/python/tvm/relay/interpreter.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#pylint: disable=no-else-return
-"""An interface to the Realy interpreter."""
-from __future__ import absolute_import
-import numpy as np
-from .. import register_func, nd
-from .base import NodeBase, register_relay_node
-from . import build_module
-from . import _make
-from . import _interpreter
-from . import ir_pass
-from .module import Module
-from .expr import Call, Constant, GlobalVar, Function, const
-from .scope_builder import ScopeBuilder
-from .._ffi.base import integer_types
-from ..contrib import graph_runtime as tvm_runtime
-from .. import cpu
-
-class Value(NodeBase):
-    """Base class of all values.
-    """
-
-    @staticmethod
-    @register_func("relay.from_scalar")
-    def from_scalar(i, dtype=None):
-        """Convert a Python scalar to a Relay scalar."""
-        if dtype is None:
-            if isinstance(i, integer_types):
-                dtype = 'int32'
-            elif isinstance(i, float):
-                dtype = 'float32'
-            elif isinstance(i, bool):
-                dtype = 'uint8'
-            else:
-                raise Exception("unable to infer dtype {0}".format(type(i)))
-
-        return TensorValue(nd.array(np.array(i, dtype=dtype)))
-
-
-@register_relay_node
-class TupleValue(Value):
-    def __init__(self, *fields):
-        self.__init_handle_by_constructor__(
-            _make.TupleValue, fields)
-
-    def __getitem__(self, field_no):
-        return self.fields[field_no]
-
-
-@register_relay_node
-class Closure(Value):
-    pass
-
-
-@register_relay_node
-class TensorValue(Value):
-    """A Tensor value produced by the evaluator."""
-
-    def __init__(self, data):
-        """Allocate a new TensorValue and copy the data from `array` into
-           the new array.
-        """
-        if isinstance(data, np.ndarray):
-            data = nd.array(data)
-
-        self.__init_handle_by_constructor__(
-            _make.TensorValue, data)
-
-    def as_ndarray(self):
-        """Convert a Relay TensorValue into a tvm.ndarray."""
-        return self.data
-
-    def asnumpy(self):
-        """Convert a Relay TensorValue into a numpy.ndarray."""
-        return self.data.asnumpy()
-
-    def __eq__(self, other):
-        return self.data == other.data
-
-
-def _arg_to_ast(arg):
-    if isinstance(arg, TensorValue):
-        return Constant(arg.data)
-    elif isinstance(arg, np.ndarray):
-        return Constant(nd.array(arg))
-    elif isinstance(arg, Constant):
-        return arg
-    else:
-        return const(arg)
-
-class Executor(object):
-    """An abstract interface for executing Relay programs."""
-
-    def __init__(self, mod=None):
-        """
-        Parameters
-        ----------
-        mod: relay.Module
-            The module.
-        """
-        if mod is None:
-            self.mod = Module({})
-        else:
-            self.mod = mod
-
-
-    def optimize(self, expr):
-        # TODO: We need to move this optimization code into the optimizer/pass manager
-        ck_expr = ir_pass.infer_type(expr, mod=self.mod)
-        fused_expr = ir_pass.fuse_ops(self.mod, ck_expr)
-        ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod)
-        return ck_fused
-
-    def _make_executor(self, _):
-        """
-        Construct a Python function that implements the evaluation
-        of expression.
-
-        Parameters
-        ----------
-        expr: relay.Expr
-            The Relay expression to execute.
-
-        Returns
-        -------
-        executor: function
-            A Python function which implements the behavior of `expr`.
-        """
-        raise Exception("abstract method: please implement me.")
-
-    def evaluate(self, expr, params=None):
-        """
-        Evaluate a Relay expression on the interpreter.
-
-        Parameters
-        ----------
-        expr: tvm.relay.Expr
-            The expression to evaluate.
-        """
-        if params:
-            scope_builder = ScopeBuilder()
-            for key in params:
-                value = params[key]
-                scope_builder.let(key, value)
-            scope_builder.ret(expr)
-            expr = scope_builder.get()
-
-        if isinstance(expr, Function):
-            assert not ir_pass.free_vars(expr)
-
-        executor = self._make_executor(expr)
-
-        # If we are evaluating a function or top-level defintion
-        # the user must call the function themselves.
-        #
-        # If we are evaluating an open term with parameters we will
-        # just return them the result.
-        if isinstance(expr, (Function, GlobalVar)):
-            return executor
-        else:
-            return executor()
-
-
-class Interpreter(Executor):
-    """
-    A wrapper around the Relay interpreter, implements the excecutor interface.
-    """
-    def __init__(self, mod=None):
-        Executor.__init__(self, mod)
-
-    def _make_executor(self, expr):
-        def _interp_wrapper(*args):
-            relay_args = []
-            for arg in args:
-                relay_args.append(_arg_to_ast(arg))
-
-            if isinstance(expr, GlobalVar):
-                func = self.mod[expr]
-                func = self.optimize(func)
-                self.mod._add(expr, func, True)
-                opt_expr = Call(expr, relay_args)
-                return _interpreter.evaluate(self.mod, opt_expr)
-            elif isinstance(expr, Function):
-                call = Call(expr, relay_args)
-                opt_expr = self.optimize(call)
-                return _interpreter.evaluate(self.mod, opt_expr)
-            else:
-                assert not args
-                opt_expr = self.optimize(expr)
-                return _interpreter.evaluate(self.mod, opt_expr)
-
-        return _interp_wrapper
-
-
-class GraphRuntime(Executor):
-    """A wrapper around the TVM graph runtime, implements the Executor interface."""
-    def __init__(self, mod=None):
-        Executor.__init__(self, mod)
-
-    def _make_executor(self, expr):
-        def _graph_wrapper(*args):
-            func = self.optimize(expr)
-            graph_json, mod, params = build_module.build(func, mod=self.mod)
-            assert params is None
-            gmodule = tvm_runtime.create(graph_json, mod, cpu(0))
-            # Create map of inputs.
-            inputs = {}
-            for i, arg in enumerate(args):
-                inputs[func.params[i].name_hint] = arg
-            # Set the inputs here.
-            gmodule.set_input(**inputs)
-            # Run the module, and fetch the output.
-            gmodule.run()
-            return gmodule.get_output(0)
-
-        return _graph_wrapper
-
-def create_executor(mode='debug', mod=None):
-    if mode == 'debug':
-        return Interpreter(mod)
-    elif mode == 'graph':
-        return GraphRuntime(mod)
-    else:
-        raise Exception("unknown mode {0}".format(mode))
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 989e5ad7622f..274761f0a27b 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -160,6 +160,7 @@ def free_type_vars(expr):
     """
     return _ir_pass.free_type_vars(expr)
 
+
 def simplify_inference(expr):
     """ Simplify the data-flow graph for inference phase.
 
@@ -176,6 +177,7 @@ def simplify_inference(expr):
     """
     return _ir_pass.simplify_inference(expr)
 
+
 def dead_code_elimination(expr):
     """ Remove expressions which does not effect the program result (dead code).
 
@@ -256,8 +258,18 @@ def structural_hash(value):
                "relay.Expr or relay.Type").format(type(value))
         raise TypeError(msg)
 
-def fuse_ops(expr, mod):
-    return _ir_pass.FuseOps(mod, expr)
 
-def lower_ops(mod, expr, target='llvm'):
-    return _ir_pass.LowerOps(mod, expr, target)
+def fuse_ops(expr):
+    """Fuse operators in expr together.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression, containing fused result.
+    """
+    return _ir_pass.FuseOps(expr)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 5841d278378a..28d53ec8674a 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -1,272 +1,281 @@
 #pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
 from __future__ import absolute_import
-import tvm
 import topi
 import topi.cuda
-from . import register_schedule, register_compute
+from .op import register_compute, register_schedule, register_pattern, OpPattern
 
 def schedule_injective(outputs, target):
     """Generic schedule for binary broadcast."""
-    with tvm.target.create(target):
+    with target:
         return topi.generic.schedule_injective(outputs)
 
 schedule_broadcast = schedule_injective
 schedule_elemwise = schedule_injective
 
 # log
+@register_compute("log")
 def log_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.log(inputs[0])]
 
-register_compute("log", log_compute)
 register_schedule("log", schedule_broadcast)
 
 # exp
+@register_compute("exp")
 def exp_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.exp(inputs[0])]
 
-register_compute("exp", exp_compute)
 register_schedule("exp", schedule_broadcast)
 
 # sqrt
+@register_compute("sqrt")
 def sqrt_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.sqrt(inputs[0])]
 
-register_compute("sqrt", sqrt_compute)
 register_schedule("sqrt", schedule_broadcast)
 
 # sigmoid
+@register_compute("sigmoid")
 def sigmoid_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.sigmoid(inputs[0])]
 
-register_compute("sigmoid", sigmoid_compute)
 register_schedule("sigmoid", schedule_broadcast)
 
 # floor
+@register_compute("floor")
 def floor_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.floor(inputs[0])]
 
-register_compute("floor", floor_compute)
 register_schedule("floor", schedule_broadcast)
 
 # ceil
+@register_compute("ceil")
 def ceil_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.ceil(inputs[0])]
 
-register_compute("ceil", ceil_compute)
 register_schedule("ceil", schedule_broadcast)
 
 # trunc
+@register_compute("trunc")
 def trunc_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.trunc(inputs[0])]
 
-register_compute("trunc", trunc_compute)
 register_schedule("trunc", schedule_broadcast)
 
 # round
+@register_compute("round")
 def round_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.round(inputs[0])]
 
-register_compute("round", round_compute)
 register_schedule("round", schedule_broadcast)
 
 # abs
+@register_compute("abs")
 def abs_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.abs(inputs[0])]
 
-register_compute("abs", abs_compute)
 register_schedule("abs", schedule_broadcast)
 
 # tanh
+@register_compute("tanh")
 def tanh_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.tanh(inputs[0])]
 
-register_compute("tanh", tanh_compute)
 register_schedule("tanh", schedule_broadcast)
 
 # negative
+@register_compute("negative")
 def negative_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.negative(inputs[0])]
 
-register_compute("negative", negative_compute)
 register_schedule("negative", schedule_broadcast)
 
 # add
+@register_compute("add")
 def add_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.add(inputs[0], inputs[1])]
 
-register_compute("add", add_compute)
 register_schedule("add", schedule_injective)
 
 # subtract
+@register_compute("subtract")
 def subtract_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.subtract(inputs[0], inputs[1])]
 
-register_compute("subtract", subtract_compute)
 register_schedule("subtract", schedule_broadcast)
 
 # multiply
+@register_compute("multiply")
 def multiply_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.multiply(inputs[0], inputs[1])]
 
-register_compute("multiply", multiply_compute)
 register_schedule("multiply", schedule_broadcast)
 
 # divide
+@register_compute("divide")
 def divide_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.divide(inputs[0], inputs[1])]
 
-register_compute("divide", divide_compute)
 register_schedule("divide", schedule_broadcast)
 
-# pow
-def pow_compute(attrs, inputs, output_type, target):
+# power
+@register_compute("power")
+def power_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.power(inputs[0], inputs[1])]
 
-register_compute("pow", pow_compute)
-register_schedule("pow", schedule_injective)
+register_schedule("power", schedule_injective)
 
 # mod
+@register_compute("mod")
 def mod_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.mod(inputs[0], inputs[1])]
 
-register_compute("mod", mod_compute)
 register_schedule("mod", schedule_broadcast)
 
 # equal
+@register_compute("equal")
 def equal_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.equal(inputs[0], inputs[1])]
 
-register_compute("equal", equal_compute)
 register_schedule("equal", schedule_broadcast)
 
 # not_equal
+@register_compute("not_equal")
 def not_equal_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.not_equal(inputs[0], inputs[1])]
 
-register_compute("not_equal", not_equal_compute)
 register_schedule("not_equal", schedule_broadcast)
 
 # less
+@register_compute("less")
 def less_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.less(inputs[0], inputs[1])]
 
-register_compute("less", less_compute)
 register_schedule("less", schedule_broadcast)
 
 # less equal
+@register_compute("less_equal")
 def less_equal_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.less_equal(inputs[0], inputs[1])]
 
-register_compute("less_equal", less_equal_compute)
 register_schedule("less_equal", schedule_broadcast)
 
 # greater
+@register_compute("greater")
 def greater_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.greater(inputs[0], inputs[1])]
 
-register_compute("greater", greater_compute)
 register_schedule("greater", schedule_broadcast)
 
 # greater equal
+@register_compute("greater_equal")
 def greater_equal_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.greater_equal(inputs[0], inputs[1])]
 
-register_compute("greater_equal", greater_equal_compute)
 register_schedule("greater_equal", schedule_broadcast)
 
 # maximum
+@register_compute("maximum")
 def maximum_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.maximum(inputs[0], inputs[1])]
 
-register_compute("maximum_compute", maximum_compute)
 register_schedule("maximum_compute", schedule_injective)
 
 # minimum
+@register_compute("minimum")
 def minimum_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.minimum(inputs[0], inputs[1])]
 
-register_compute("minimum", minimum_compute)
 register_schedule("minimum", schedule_injective)
 
 # right shift
+@register_compute("right_shift")
 def right_shift_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.right_shift(inputs[0], inputs[1])]
 
-register_compute("right_shift", right_shift_compute)
 register_schedule("right_shift", schedule_injective)
 
-# lift shift
+# left shift
+@register_compute("left_shift")
 def left_shift_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 2
     return [topi.left_shift(inputs[0], inputs[1])]
 
-register_compute("left_shift", left_shift_compute)
 register_schedule("left_shift", schedule_injective)
 
 # zeros
+@register_compute("zeros")
 def zeros_compute(attrs, inputs, output_type, target):
     assert not inputs
     return [topi.full(output_type.shape, output_type.dtype, 0.0)]
 
-register_compute("zeros", zeros_compute)
-register_schedule("zeros", schedule_injective)
+register_schedule("zeros", schedule_broadcast)
+register_pattern("zeros", OpPattern.ELEMWISE)
 
 # zeros_like
+@register_compute("zeros_like")
 def zeros_like_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.full_like(inputs[0], 0.0)]
 
-register_compute("zeros_like", zeros_like_compute)
-register_schedule("zeros_like", schedule_injective)
+register_schedule("zeros_like", schedule_broadcast)
 
 # ones
+@register_compute("ones")
 def ones_compute(attrs, inputs, output_type, target):
     assert not inputs
     return [topi.full(output_type.shape, output_type.dtype, 1.0)]
 
-register_compute("ones", ones_compute)
-register_schedule("ones", schedule_injective)
+register_schedule("ones", schedule_broadcast)
+register_pattern("ones", OpPattern.ELEMWISE)
 
 # ones_like
+@register_compute("ones_like")
 def ones_like(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.full_like(inputs[0], 1.0)]
 
-register_compute("ones_like", ones_like)
-register_schedule("ones_like", schedule_injective)
+register_schedule("ones_like", schedule_broadcast)
 
 # clip
+@register_compute("clip")
 def clip_compute(attrs, inputs, output_type, target):
     assert len(inputs) == 1
     return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)]
 
+register_schedule("clip", schedule_elemwise)
+register_pattern("clip", OpPattern.ELEMWISE)
 
-register_compute("clip", clip_compute)
-register_schedule("clip", schedule_injective)
+# concatenate
+@register_compute("concatenate")
+def concatenate_compute(attrs, inputs, output_type, target):
+    return [topi.concatenate(inputs, axis=attrs.axis)]
+
+register_schedule("concatenate", schedule_injective)
+register_pattern("concatenate", OpPattern.INJECTIVE)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index 91523f65f6b7..3bdb5989c292 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -72,13 +72,80 @@ def _register(v):
         """internal register function"""
         _Register(op_name, attr_key, v, level)
         return v
-    return _register(value) if value else _register
+    return _register(value) if value is not None else _register
 
-def register_schedule(op_name, schedule):
-    register(op_name, "FTVMSchedule", schedule)
 
-def register_compute(op_name, compute):
-    register(op_name, "FTVMCompute", compute)
+class OpPattern(object):
+    """Operator generic patterns
+
+    See Also
+    --------
+    top.tag : Contains explanation of the tag type.
+    """
+    # Elementwise operator
+    ELEMWISE = 0
+    # Broadcast operator
+    BROADCAST = 1
+    # Injective mapping
+    INJECTIVE = 2
+    # Comunication
+    COMM_REDUCE = 3
+    # Complex op, can still fuse ewise into it
+    OUT_ELEMWISE_FUSABLE = 4
+    # Not fusable opaque op
+    OPAQUE = 8
+
+
+def register_schedule(op_name, schedule=None, level=10):
+    """Register schedule function for an op
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    schedule : function
+        The schedule function.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMSchedule", schedule, level)
+
+
+def register_compute(op_name, compute=None, level=10):
+    """Register compute function for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    compute : function
+        The compute function.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMCompute", compute, level)
+
+
+def register_pattern(op_name, pattern, level=10):
+    """Register operator pattern for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    pattern : int
+        The pattern being used.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "TOpPattern", pattern, level)
+
 
 _init_api("relay.op", __name__)
 
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 2505da8f1dfd..b7845cfaca57 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -266,7 +266,7 @@ def divide(lhs, rhs):
     return _make.divide(lhs, rhs)
 
 
-def pow(lhs, rhs):
+def power(lhs, rhs):
     """Power with numpy-style broadcasting.
 
     Parameters
@@ -281,7 +281,7 @@ def pow(lhs, rhs):
     result : relay.Expr
         The computed result.
     """
-    return _make.pow(lhs, rhs)
+    return _make.power(lhs, rhs)
 
 
 def mod(lhs, rhs):
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 776b61317da7..913f97ecd4a1 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -6,3 +6,4 @@
 from . import dqn
 from . import dcgan
 from . import mobilenet
+from .config import ctx_list
diff --git a/python/tvm/relay/testing/config.py b/python/tvm/relay/testing/config.py
new file mode 100644
index 000000000000..677b72d979a1
--- /dev/null
+++ b/python/tvm/relay/testing/config.py
@@ -0,0 +1,14 @@
+"""Configuration about tests"""
+from __future__ import absolute_import as _abs
+
+import os
+import tvm
+
+def ctx_list():
+    """Get context list for testcases"""
+    device_list = os.environ.get("RELAY_TEST_TARGETS", "")
+    device_list = (device_list.split(",") if device_list
+                   else ["llvm", "cuda"])
+    device_list = set(device_list)
+    res = [(device, tvm.context(device, 0)) for device in device_list]
+    return [x for x in res if x[1].exist]
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 5c0a5e07cd2a..c5c14d711df7 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -154,13 +154,15 @@ std::unordered_set<std::string> TargetNode::libs() const {
   return result;
 }
 
-std::string TargetNode::str() const {
+const std::string& TargetNode::str() const {
+  if (str_repr_.length() != 0) return str_repr_;
   std::ostringstream result;
   result << target_name;
   for (const auto &x : options()) {
     result << " " << x;
   }
-  return result.str();
+  str_repr_ = result.str();
+  return str_repr_;
 }
 
 
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
new file mode 100644
index 000000000000..d9385977dc39
--- /dev/null
+++ b/src/relay/backend/compile_engine.cc
@@ -0,0 +1,351 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/compile_engine.cc
+ * \brief Internal compialtion engine.
+ */
+#include <tvm/schedule.h>
+#include <tvm/packed_func_ext.h>
+#include <tvm/operation.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include <utility>
+#include <limits>
+#include <mutex>
+#include <functional>
+#include "compile_engine.h"
+
+namespace tvm {
+namespace relay {
+
+CCacheKey CCacheKeyNode::make(Function source_func, Target target) {
+  auto n = make_node<CCacheKeyNode>();
+  n->source_func = std::move(source_func);
+  n->target = std::move(target);
+  return CCacheKey(n);
+}
+
+// The getter to get schedule from compile engine.
+// Get schedule from functor.
+class ScheduleGetter :
+      public ExprFunctor<Array<Tensor>(const Expr&)> {
+ public:
+  explicit ScheduleGetter(Target target)
+      : target_(target) {}
+
+  Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
+    // for now, we always use int32 shape when possible
+    // even if the result of shape inference becomes int64.
+    Array<IndexExpr> res;
+    for (IndexExpr val : shape) {
+      const int64_t* pval = as_const_int(val);
+      if (pval != nullptr) {
+        CHECK_LE(pval[0], std::numeric_limits<int32_t>::max());
+        CHECK_GE(pval[0], std::numeric_limits<int32_t>::min());
+        res.push_back(ir::IntImm::make(Int(32), *pval));
+      } else {
+        res.push_back(val);
+      }
+    }
+    return res;
+  }
+
+  std::pair<Schedule, CachedFunc> Create(const Function& prim_func) {
+    static auto fschedule =
+        Op::GetAttr<FTVMSchedule>("FTVMSchedule");
+    auto cache_node = make_node<CachedFuncNode>();
+    cache_node->target = target_;
+
+    if (prim_func->params.size() == 1 &&
+        prim_func->params[0]->checked_type().as<TupleTypeNode>()) {
+      // Handle tuple input type by flattening them.
+      // This is the current calling convention of tuple input.
+      Array<tvm::Tensor> inputs;
+      for (Type field : prim_func->params[0]->type_as<TupleTypeNode>()->fields) {
+        const auto* ttype = field.as<TensorTypeNode>();
+        CHECK(ttype != nullptr);
+        tvm::Tensor tensor = tvm::placeholder(
+            GetShape(ttype->shape), ttype->dtype);
+        cache_node->inputs.push_back(tensor);
+        inputs.push_back(tensor);
+      }
+      memo_[prim_func->params[0]] = inputs;
+
+    } else {
+      for (Var param : prim_func->params) {
+        const auto* ttype = param->type_as<TensorTypeNode>();
+        tvm::Tensor tensor = tvm::placeholder(
+            GetShape(ttype->shape), ttype->dtype);
+        cache_node->inputs.push_back(tensor);
+        memo_[param] = Array<Tensor>({tensor});
+      }
+    }
+    readable_name_stream_ << "fused";
+    // enter the target context
+    TargetContext target_ctx(target_);
+    cache_node->outputs = this->VisitExpr(prim_func->body);
+    cache_node->func_name = readable_name_stream_.str();
+    CachedFunc cfunc(cache_node);
+    CHECK(master_op_.defined());
+    Schedule schedule = fschedule[master_op_](
+        cache_node->outputs, target_);
+    return std::make_pair(schedule, cfunc);
+  }
+
+  Array<Tensor> VisitExpr(const Expr& expr) {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) {
+      return it->second;
+    } else {
+      Array<Tensor> res = ExprFunctor::VisitExpr(expr);
+      memo_[expr] = res;
+      return res;
+    }
+  }
+
+  Array<Tensor> VisitExpr_(const VarNode* op) final {
+    LOG(FATAL) << "Free variable " << op->name_hint;
+    return {};
+  }
+
+  Array<Tensor> VisitExpr_(const CallNode* call_node) final {
+    static auto fcompute =
+        Op::GetAttr<FTVMCompute>("FTVMCompute");
+    static auto fpattern =
+        Op::GetAttr<TOpPattern>("TOpPattern");
+
+    Array<Tensor> inputs;
+    int count_tuple = 0;
+    for (Expr arg : call_node->args) {
+      if (arg->checked_type().as<TupleTypeNode>()) {
+        ++count_tuple;
+      }
+      for (Tensor tensor : VisitExpr(arg)) {
+        inputs.push_back(tensor);
+      }
+    }
+    if (count_tuple) {
+      CHECK_EQ(call_node->args.size(), 1U)
+          << "Only allow function with a single tuple input";
+    }
+    CHECK(call_node->op.as<OpNode>())
+        << "Primitive function only allows call into primitive ops";
+    Op op = Downcast<Op>(call_node->op);
+    Array<Tensor> outputs = fcompute[op](
+        call_node->attrs,
+        inputs,
+        call_node->checked_type(),
+        target_);
+
+    int op_pattern = fpattern[op];
+    if (op_pattern >= kCommReduce) {
+      CHECK(!master_op_.defined())
+          << "Two complicated op in a primitive function";
+    }
+    if (op_pattern >= master_op_patetrn_) {
+      master_op_ = op;
+      master_op_patetrn_ = op_pattern;
+    }
+    if (outputs.size() != 1) {
+      const auto* tuple_type =
+          call_node->checked_type().as<TupleTypeNode>();
+      CHECK(tuple_type) << "Expect output to be a tuple type";
+      CHECK_EQ(tuple_type->fields.size(), outputs.size());
+    }
+    readable_name_stream_ << '_' << op->name;
+    return outputs;
+  }
+
+  Array<Tensor> VisitExpr_(const FunctionNode* op) final {
+    LOG(FATAL) << "Do not support sub function";
+    return Array<Tensor>();
+  }
+
+  Array<Tensor> VisitExpr_(const LetNode* op) final {
+    Array<Tensor> val = VisitExpr(op->value);
+    CHECK(!memo_.count(op->var));
+    memo_[op->var] = val;
+    return VisitExpr(op->body);
+  }
+
+  Array<Tensor> VisitExpr_(const TupleNode* op) final {
+    Array<Tensor> fields;
+    for (Expr field : op->fields) {
+      CHECK(field->checked_type().as<TensorTypeNode>())
+          << "Only allow Tuple of Tensor";
+      Array<Tensor> res = VisitExpr(field);
+      CHECK_EQ(res.size(), 1);
+      fields.push_back(res[0]);
+    }
+    return fields;
+  }
+
+  Array<Tensor> VisitExpr_(const TupleGetItemNode* op) final {
+    const auto* tuple_type = op->tuple->type_as<TupleTypeNode>();
+    Array<Tensor> tuple = VisitExpr(op->tuple);
+    CHECK_EQ(tuple_type->fields.size(), tuple.size());
+    CHECK_GE(op->index, 0);
+    CHECK_LT(static_cast<size_t>(op->index), tuple.size());
+    return {tuple[op->index]};
+  }
+
+ private:
+  tvm::Target target_;
+  Op master_op_;
+  int master_op_patetrn_{0};
+  std::ostringstream readable_name_stream_;
+  std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
+};
+
+
+class CompileEngineImpl : public CompileEngineNode {
+ public:
+  // Lower the fucntion.
+  CachedFunc Lower(const CCacheKey& key)  {
+    return LowerInternal(key)->cached_func;
+  }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) return value->packed_func;
+    // build the function.
+    if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
+      tvm::runtime::Module m = (*f)(value->cached_func->funcs, key->target);
+      value->packed_func = m.GetFunction(value->cached_func->func_name);
+    } else {
+      LOG(FATAL) << "relay.backend.build is not registered";
+    }
+    return value->packed_func;
+  }
+  void Clear() final {
+    cache_.clear();
+  }
+  // List all items in the cache.
+  Array<NodeRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<NodeRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+  /*!
+   * \brief Create schedule for target.
+   * \param source_func The primitive function to be lowered.
+   * \param target The target we want to create schedule for.
+   * \return Pair of schedule and cache.
+   *  The funcs field in cache is not yet populated.
+   */
+  std::pair<Schedule, CachedFunc> CreateSchedule(
+      const Function& source_func, const Target& target) {
+    return ScheduleGetter(target).Create(source_func);
+  }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key)  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_node<CCacheValueNode>());
+      value->use_count = 0;
+      cache_[key] = value;
+    }
+    CHECK(!value->cached_func.defined());
+    auto spair = CreateSchedule(key->source_func, key->target);
+    auto cache_node = make_node<CachedFuncNode>(
+        *(spair.second.operator->()));
+    cache_node->func_name = GetUniqeName(cache_node->func_name);
+    // NOTE: array will copy on write.
+    Array<Tensor> all_args = cache_node->inputs;
+    for (Tensor arg : cache_node->outputs) {
+      all_args.push_back(arg);
+    }
+    // lower the function
+    if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {
+      cache_node->funcs = (*f)(
+          spair.first, all_args, cache_node->func_name, key->source_func);
+    } else {
+      LOG(FATAL) << "relay.backend._lower is not registred";
+    }
+    value->cached_func = CachedFunc(cache_node);
+    return value;
+  }
+  /*!
+   * \brief Get unique name from name.
+   * \param name The orginal name.
+   * \return Updated name which is unique.
+   */
+  std::string GetUniqeName(std::string name) {
+    while (true) {
+      auto it = name_map_.find(name);
+      if (it == name_map_.end()) {
+        name_map_[name] = 1;
+        return name;
+      } else {
+        std::ostringstream os;
+        os << name << "_" << it->second;
+        ++(it->second);
+        name = os.str();
+      }
+    }
+    return name;
+  }
+  /*! \brief compiler cache lock*/
+  std::mutex mutex_;
+  /*! \brief internal name map to get an unique name */
+  std::unordered_map<std::string, int> name_map_;
+  /*! \brief internal compiler cache */
+  std::unordered_map<CCacheKey, CCacheValue> cache_;
+};
+
+/*! \brief The global compile engine */
+const CompileEngine& CompileEngine::Global() {
+  // intentionally allocate raw pointer to avoid
+  // free during destructuion.
+  static CompileEngine* inst = new CompileEngine(
+      make_node<CompileEngineImpl>());
+  return *inst;
+}
+
+
+TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey")
+.set_body_typed<CCacheKey(Function, Target)>(CCacheKeyNode::make);
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGlobal")
+.set_body_typed<CompileEngine()>([]() {
+    return CompileEngine::Global();
+  });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineClear")
+.set_body_typed<void(const CompileEngine&)>([](CompileEngine self) {
+    self->Clear();
+  });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLower")
+.set_body_typed<CachedFunc(CompileEngine, CCacheKey)>(
+    [](CompileEngine self, CCacheKey key) {
+      return self->Lower(key);
+    });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT")
+.set_body_typed<PackedFunc(CompileEngine, CCacheKey)>(
+    [](CompileEngine self, CCacheKey key) {
+      return self->JIT(key);
+    });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems")
+.set_body_typed<Array<NodeRef>(CompileEngine)>(
+    [](CompileEngine self){
+      return static_cast<CompileEngineImpl*>(self.operator->())->ListItems();
+    });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h
new file mode 100644
index 000000000000..40b53ab31e5e
--- /dev/null
+++ b/src/relay/backend/compile_engine.h
@@ -0,0 +1,206 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/compile_engine.h
+ * \brief Internal compialtion engine handle function cache.
+ *  and interface to low level code generation.
+ */
+#ifndef TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
+#define TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
+
+#include <tvm/lowered_func.h>
+#include <tvm/relay/expr.h>
+#include <string>
+#include <functional>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Node container to represent a cached function. */
+struct CachedFuncNode : public Node {
+  /* \brief compiled target */
+  tvm::Target target;
+  /*! \brief Function name */
+  std::string func_name;
+  /* \brief The inputs to the function */
+  tvm::Array<Tensor> inputs;
+  /* \brief The outputs to the function */
+  tvm::Array<Tensor> outputs;
+  /*! \brief The lowered functions to support the function. */
+  tvm::Array<tvm::LoweredFunc> funcs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("target", &target);
+    v->Visit("func_name", &func_name);
+    v->Visit("inputs", &inputs);
+    v->Visit("outputs", &outputs);
+    v->Visit("funcs", &funcs);
+  }
+
+  static constexpr const char* _type_key = "relay.CachedFunc";
+  TVM_DECLARE_NODE_TYPE_INFO(CachedFuncNode, Node);
+};
+
+TVM_DEFINE_NODE_REF(CachedFunc, CachedFuncNode);
+
+
+class CCacheKey;
+/*! \brief Compile cache key */
+class CCacheKeyNode : public Node {
+ public:
+  /*! \brief The source function to be lowered. */
+  Function source_func;
+  /*! \brief The hardware target.*/
+  Target target;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("source_func", &source_func);
+    v->Visit("target", &target);
+  }
+  /*! \return The hash value of CCacheKey. */
+  inline size_t Hash() const;
+  /*!
+   * \brief check content equality
+   * \param other The other value.
+   * \return The result of equality check.
+   */
+  inline bool Equal(const CCacheKeyNode* other) const;
+  /*!
+   * \brief create a cache key.
+   * \param source_func The source function.
+   * \param target The target device.
+   * \return the created key.
+   */
+  TVM_DLL static CCacheKey make(Function source_func,
+                                Target target);
+
+  static constexpr const char* _type_key = "relay.CCacheKey";
+  TVM_DECLARE_NODE_TYPE_INFO(CCacheKeyNode, tvm::Node);
+
+ private:
+  /*!
+   * \brief internal cached hash value.
+   */
+  mutable size_t hash_{0};
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheKey : public NodeRef {
+ public:
+  CCacheKey() {}
+  explicit CCacheKey(NodePtr<Node> n) : NodeRef(n) {}
+  const CCacheKeyNode* operator->() const {
+    return static_cast<CCacheKeyNode*>(node_.get());
+  }
+  // comparator
+  inline bool operator==(const CCacheKey& other) const {
+    CHECK(defined() && other.defined());
+    return (*this)->Equal(other.operator->());
+  }
+  using ContainerType = CCacheKeyNode;
+};
+
+/*! \brief Node container for compile cache. */
+class CCacheValueNode : public Node {
+ public:
+  /*! \brief The corresponding function */
+  CachedFunc cached_func;
+  /*! \brief Result of Packed function generated by JIT */
+  PackedFunc packed_func;
+  /*! \brief usage statistics */
+  int use_count{0};
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("cached_func", &cached_func);
+    v->Visit("use_count", &use_count);
+  }
+  static constexpr const char* _type_key = "relay.CCacheValue";
+  TVM_DECLARE_NODE_TYPE_INFO(CCacheValueNode, tvm::Node);
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheValue : public NodeRef {
+ public:
+  CCacheValue() {}
+  explicit CCacheValue(NodePtr<Node> n) : NodeRef(n) {}
+  CCacheValueNode* operator->() {
+    return static_cast<CCacheValueNode*>(node_.get());
+  }
+  const CCacheValueNode* operator->() const {
+    return static_cast<const CCacheValueNode*>(node_.get());
+  }
+  using ContainerType = CCacheValueNode;
+};
+
+/*!
+ * \brief Backend compilation engine for
+ *        low level code generation.
+ */
+class CompileEngineNode : public Node {
+ public:
+  /*!
+   * \brief Get lowered result.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual CachedFunc Lower(const CCacheKey& key) = 0;
+  /*!
+   * \brief Just in time compile to get a PackedFunc.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual PackedFunc JIT(const CCacheKey& key) = 0;
+  /*! \brief clear the cache. */
+  virtual void Clear() = 0;
+
+  // VisitAttrs
+  void VisitAttrs(AttrVisitor*) final {}
+
+  static constexpr const char* _type_key = "relay.CompileEngine";
+  TVM_DECLARE_NODE_TYPE_INFO(CompileEngineNode, Node);
+};
+
+/*! \brier cache entry used in compile engine */
+class CompileEngine : public NodeRef {
+ public:
+  CompileEngine() {}
+  explicit CompileEngine(NodePtr<Node> n) : NodeRef(n) {}
+  CompileEngineNode* operator->() {
+    return static_cast<CompileEngineNode*>(node_.get());
+  }
+  using ContainerType = CompileEngineNode;
+  /*! \brief The global compile engine. */
+  TVM_DLL static const CompileEngine& Global();
+};
+
+// implementations
+inline size_t CCacheKeyNode::Hash() const {
+  if (hash_ != 0) return hash_;
+  // do structral hash, avoid 0.
+  hash_ = StructuralHash()(this->source_func);
+  hash_ = dmlc::HashCombine(
+      hash_, std::hash<std::string>()(target->str()));
+  if (hash_ == 0) hash_ = 1;
+  return hash_;
+}
+
+inline bool CCacheKeyNode::Equal(
+    const CCacheKeyNode* other) const {
+  if (Hash() != other->Hash()) return false;
+  return this->target->str() == other->target->str() &&
+      AlphaEqual(this->source_func, other->source_func);
+}
+
+}  // namespace relay
+}  // namespace tvm
+
+namespace std {
+// overload hash
+template<>
+struct hash<::tvm::relay::CCacheKey> {
+  size_t operator()(const ::tvm::relay::CCacheKey& key) const {
+    CHECK(key.defined());
+    return key->Hash();
+  }
+};
+}  // namespace std
+#endif  // TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
new file mode 100644
index 000000000000..db96a3ad4de1
--- /dev/null
+++ b/src/relay/backend/interpreter.cc
@@ -0,0 +1,426 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/interpreter.cc
+ * \brief An interpreter for the Relay IR.
+ */
+#include <tvm/packed_func_ext.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/interpreter.h>
+#include <tvm/relay/pass.h>
+#include "compile_engine.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+inline const PackedFunc& GetPackedFunc(const std::string& name) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get(name);
+  CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
+  return *pf;
+}
+
+/* Value Implementation */
+Closure ClosureNode::make(tvm::Map<Var, Value> env, Function func) {
+  NodePtr<ClosureNode> n = make_node<ClosureNode>();
+  n->env = std::move(env);
+  n->func = std::move(func);
+  return Closure(n);
+}
+
+TVM_REGISTER_API("relay._make.Closure")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = ClosureNode::make(args[0], args[1]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<ClosureNode>([](const ClosureNode* node, tvm::IRPrinter* p) {
+    p->stream << "ClosureNode(" << node->func << ")";
+  });
+
+TupleValue TupleValueNode::make(tvm::Array<Value> value) {
+  NodePtr<TupleValueNode> n = make_node<TupleValueNode>();
+  n->fields = value;
+  return TupleValue(n);
+}
+
+TVM_REGISTER_API("relay._make.TupleValue")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = TupleValueNode::make(args[0]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleValueNode>([](const TupleValueNode* node, tvm::IRPrinter* p) {
+    p->stream << "TupleValueNode(" << node->fields << ")";
+  });
+
+TensorValue TensorValueNode::make(runtime::NDArray data) {
+  NodePtr<TensorValueNode> n = make_node<TensorValueNode>();
+  n->data = std::move(data);
+  return TensorValue(n);
+}
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TensorValueNode>([](const TensorValueNode* node, tvm::IRPrinter* p) {
+    auto to_str = GetPackedFunc("relay._tensor_value_repr");
+    std::string data_str = to_str(GetRef<TensorValue>(node));
+    p->stream << "TensorValueNode(" << data_str << ")";
+  });
+
+TVM_REGISTER_API("relay._make.TensorValue")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    runtime::NDArray data = args[0];
+    *ret = TensorValueNode::make(data);
+  });
+
+/*!
+ * \brief A stack frame in the Relay interpreter.
+ *
+ * Contains a mapping from relay::Var to relay::Value.
+ */
+struct Frame {
+  /*! \brief The set of local variables and arguments for the frame. */
+  tvm::Map<Var, Value> locals;
+
+  explicit Frame(tvm::Map<Var, Value> locals) : locals(locals) {}
+};
+
+/*!
+ * \brief The call stack in the Relay interpreter.
+ *
+ * Contains a stack of frames; each corresponding to
+ * a function call.
+ */
+struct Stack {
+  /*! \brief The stack frames. */
+  std::vector<Frame> frames;
+  Stack() : frames() { frames.push_back(Frame({})); }
+
+  Frame& current_frame() { return frames.back(); }
+
+  Value Lookup(const Var& local) {
+    for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) {
+      auto elem = frame->locals.find(local);
+      if (elem != frame->locals.end()) {
+        return (*elem).second;
+      }
+    }
+
+    LOG(FATAL) << "could not find variable binding for " << local
+               << "address= " << local.operator->();
+    return Value();
+  }
+  /*!
+   * A wrapper around Frame to add RAII semantics to pushing and popping
+   * stack frames.
+   */
+  struct LocalFrame {
+    Stack& st;
+    explicit LocalFrame(Stack& st, const Frame& fr) : st(st) {
+      st.frames.push_back(fr);
+    }
+    ~LocalFrame() { st.frames.pop_back(); }
+  };
+};
+
+// NOTE: the current interpreter assumes A-normal form.
+// which is better for execution.
+//
+// It will run duplicated computations when taking program that
+// contains DAG in dataflow-form.
+// Conversion to ANF is recommended before running the interpretation.
+//
+class Interpreter :
+      public ExprFunctor<Value(const Expr& n)> {
+ public:
+  Interpreter(Module mod,
+              DLContext context,
+              Target target)
+      : mod_(mod), context_(context), target_(target) {
+    engine_ = CompileEngine::Global();
+  }
+
+  template <typename T>
+  T WithFrame(const Frame& fr, const std::function<T()>& f) {
+    Stack::LocalFrame lf(stack_, fr);
+    return f();
+  }
+
+  void extend(const Var& id, Value v) {
+    stack_.current_frame().locals.Set(id, v);
+  }
+
+  inline Value Lookup(const Var& local) {
+    return stack_.Lookup(local);
+  }
+
+  Value Eval(const Expr& expr) {
+    return (*this)(expr);
+  }
+
+  Value VisitExpr(const Expr& expr) final {
+    auto ret = ExprFunctor<Value(const Expr& n)>::VisitExpr(expr);
+    return ret;
+  }
+
+  Value VisitExpr_(const VarNode* var_node) final {
+    return Lookup(GetRef<Var>(var_node));
+  }
+
+  Value VisitExpr_(const GlobalVarNode* op) final {
+    return Eval(mod_->Lookup(GetRef<GlobalVar>(op)));
+  }
+
+  Value VisitExpr_(const OpNode* id) override {
+    // TODO(@jroesch): Eta-expand and return in this case.
+    LOG(FATAL) << "internal error, need to wrap intrinsic into call synthetic call node "
+               << "in "
+               << "this case, eta expand";
+    return Value();
+  }
+
+  Value VisitExpr_(const ConstantNode* op) final {
+    return TensorValueNode::make(op->data.CopyTo(context_));
+  }
+
+  Value VisitExpr_(const TupleNode* op) final {
+    std::vector<Value> values;
+
+    for (const auto& field : op->fields) {
+      Value field_value = Eval(field);
+      values.push_back(field_value);
+    }
+
+    return TupleValueNode::make(values);
+  }
+
+  Value VisitExpr_(const FunctionNode* func_node) final {
+    auto func = GetRef<Function>(func_node);
+    tvm::Map<Var, Value> captured_mod;
+    Array<Var> free_vars = FreeVars(func);
+
+    for (const auto& var : free_vars) {
+      captured_mod.Set(var, Eval(var));
+    }
+
+    return ClosureNode::make(captured_mod, func);
+  }
+
+  Value InvokePrimitiveOp(Function func,
+                          const Array<Value>& args) {
+    // Marshal the arguments.
+    // Handle tuple input/output by flattening them.
+    size_t arg_len = 0;
+    for (size_t i = 0; i < args.size(); i++) {
+      if (args[i].as<TensorValueNode>()) {
+        ++arg_len;
+      } else {
+        const auto* tvalue = args[i].as<TupleValueNode>();
+        arg_len += tvalue->fields.size();
+      }
+    }
+    size_t num_inputs = arg_len;
+    if (const auto* tuple_type = func->body->checked_type().as<TupleTypeNode>()) {
+      arg_len += tuple_type->fields.size();
+    } else {
+      CHECK(func->body->checked_type().as<TensorTypeNode>());
+      arg_len += 1;
+    }
+    std::vector<TVMValue> values(arg_len);
+    std::vector<int> codes(arg_len);
+    TVMArgsSetter setter(values.data(), codes.data());
+
+    auto fset_input = [&](size_t i, Value val) {
+      const TensorValueNode* tv = val.as<TensorValueNode>();
+      CHECK(tv != nullptr) << "expect Tensor argument";
+      setter(i, tv->data);
+      DLContext arg_ctx = tv->data->ctx;
+      CHECK(arg_ctx.device_type ==  context_.device_type &&
+            arg_ctx.device_id == context_.device_id)
+        << "Interpreter expect context to be "
+        << context_ << ", but get " << arg_ctx;
+    };
+
+    if (func->params.size() == 1 &&
+        func->params[0]->checked_type().as<TupleTypeNode>()) {
+      // handle tuple input.
+      const TupleValueNode* tuple = args[0].as<TupleValueNode>();
+      CHECK(tuple);
+      for (size_t i = 0; i < tuple->fields.size(); ++i) {
+        fset_input(i, tuple->fields[i]);
+      }
+    } else {
+      CHECK_EQ(num_inputs, args.size());
+      // Decide the target context.
+      // Primitive functions always sit in the same context.
+      for (size_t i = 0; i < args.size(); i++) {
+        fset_input(i, args[i]);
+      }
+    }
+    // TVM's calling convention is that the final argument is the output
+    // buffer. To preserve the illusion of being a functional language
+    // we need to allocate space for the output buffer based on the
+    // return type.
+    auto fset_output = [&](size_t i, Type val_type) {
+      const TensorTypeNode* rtype = val_type.as<TensorTypeNode>();
+      CHECK(rtype != nullptr);
+      // Allocate output tensor.
+      std::vector<int64_t> shape;
+      for (auto dim : rtype->shape) {
+        const auto* ivalue = as_const_int(dim);
+        CHECK(ivalue) << "expected concrete dimensions";
+        shape.push_back(ivalue[0]);
+      }
+      DLDataType dtype = Type2TVMType(rtype->dtype);
+      auto out_tensor = TensorValueNode::make(
+          NDArray::Empty(shape, dtype, context_));
+      setter(num_inputs + i, out_tensor->data);
+      return out_tensor;
+    };
+
+    PackedFunc packed_func = engine_->JIT(CCacheKeyNode::make(func, target_));
+    TVMRetValue rv;
+    if (const TupleTypeNode* rtype = func->body->checked_type().as<TupleTypeNode>()) {
+      Array<Value> fields;
+      for (size_t i = 0; i < rtype->fields.size(); ++i) {
+        fields.push_back(fset_output(i, rtype->fields[i]));
+      }
+      packed_func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &rv);
+      return TupleValueNode::make(fields);
+    } else {
+      Value out_tensor = fset_output(0, func->body->checked_type());
+      packed_func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &rv);
+      return out_tensor;
+    }
+  }
+
+  // Check if function is a primitive function.
+  bool IsPrimitive(const Function& func) const {
+    NodeRef res = FunctionGetAttr(func, "Primitive");
+    const ir::IntImm* pval = res.as<ir::IntImm>();
+    return pval && pval->value != 0;
+  }
+
+  // Invoke the closure
+  Value Invoke(const Closure& closure, const tvm::Array<Value>& args) {
+    // Get a reference to the function inside the closure.
+    if (IsPrimitive(closure->func)) {
+      return InvokePrimitiveOp(closure->func, args);
+    }
+    auto func = closure->func;
+    // Allocate a frame with the parameters and free variables.
+    tvm::Map<Var, Value> locals;
+
+    CHECK_EQ(func->params.size(), args.size());
+
+    for (size_t i = 0; i < func->params.size(); i++) {
+      CHECK_EQ(locals.count(func->params[i]), 0);
+      locals.Set(func->params[i], args[i]);
+    }
+
+    // Add the var to value mappings from the Closure's modironment.
+    for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
+      CHECK_EQ(locals.count((*it).first), 0);
+      locals.Set((*it).first, (*it).second);
+    }
+
+    return WithFrame<Value>(Frame(locals), [&]() { return Eval(func->body); });
+  }
+
+  Value VisitExpr_(const CallNode* call) final {
+    tvm::Array<Value> args;
+    for (auto arg : call->args) {
+      args.push_back(Eval(arg));
+    }
+    // We should not find operators after running fusion,
+    // and operator lowering.
+    //
+    // We have some functions cotaining chunks of operators
+    // which will be loaded into operator map.
+    if (auto op_node = call->op.as<OpNode>()) {
+      LOG(FATAL) << "found " << op_node->name
+                 << "; operators should be removed by future passes; try "
+                    "fusing and lowering";
+    }
+    // Now we just evaluate and expect to find a closure.
+    Value fn_val = Eval(call->op);
+    if (const ClosureNode* closure_node = fn_val.as<ClosureNode>()) {
+      auto closure = GetRef<Closure>(closure_node);
+      return this->Invoke(closure, args);
+    } else {
+      LOG(FATAL) << "internal error: type error, expected function value in the call "
+                 << "position";
+      return Value();
+    }
+  }
+
+  Value VisitExpr_(const LetNode* op) final {
+    auto value = Eval(op->value);
+    this->extend(op->var, value);
+    return Eval(op->body);
+  }
+
+  Value VisitExpr_(const TupleGetItemNode* op) final {
+    Value val = Eval(op->tuple);
+    auto product_node = val.as<TupleValueNode>();
+    CHECK(product_node)
+      << "interal error: when evaluating TupleGetItem expected a tuple value";
+    CHECK_LT(static_cast<size_t>(op->index), product_node->fields.size())
+        << "internal error: index out of bounds";
+    return product_node->fields[op->index];
+  }
+
+  Value VisitExpr_(const IfNode* op) final {
+    Value v = Eval(op->cond);
+    if (const TensorValueNode* bv = v.as<TensorValueNode>()) {
+      DLContext cpu_ctx;
+      cpu_ctx.device_type = kDLCPU;
+      cpu_ctx.device_id = 0;
+      NDArray cpu_array = bv->data.CopyTo(cpu_ctx);
+      CHECK_EQ(TVMType2Type(cpu_array->dtype), Bool());
+      // TODO(@jroesch, @MK): Refactor code into helper from DCE.
+      if (reinterpret_cast<uint8_t*>(cpu_array->data)[0]) {
+        return Eval(op->true_branch);
+      } else {
+        return Eval(op->false_branch);
+      }
+    } else {
+      LOG(FATAL) << "type error, type system should have caught this";
+      return Value();
+    }
+  }
+
+ private:
+  // module
+  Module mod_;
+  // For simplicity we only run the interpreter on a single context.
+  // Context to run the interpreter on.
+  DLContext context_;
+  // Target parameter being used by the interpreter.
+  Target target_;
+  // value stack.
+  Stack stack_;
+  // Backend compile engine.
+  CompileEngine engine_;
+};
+
+
+TypedPackedFunc<Value(Expr)>
+CreateInterpreter(
+    Module mod,
+    DLContext context,
+    Target target) {
+  auto intrp = std::make_shared<Interpreter>(mod, context, target);
+  auto packed = [intrp](Expr expr) {
+    return intrp->Eval(expr);
+  };
+  return TypedPackedFunc<Value(Expr)>(packed);
+}
+
+TVM_REGISTER_API("relay.backend.CreateInterpreter")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = CreateInterpreter(args[0], args[1], args[2]);
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/interpreter.cc b/src/relay/interpreter.cc
deleted file mode 100644
index 5db7b66ebe83..000000000000
--- a/src/relay/interpreter.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file src/tvm/relay/interpreter.cc
- * \brief An interpreter for the Relay IR.
- */
-
-#include <tvm/codegen.h>
-#include <tvm/packed_func_ext.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/interpreter.h>
-#include <tvm/relay/logging.h>
-#include <tvm/relay/pass.h>
-#include <tvm/relay/build_module.h>
-#include "./ir/type_functor.h"
-
-namespace tvm {
-namespace relay {
-
-using namespace runtime;
-
-inline const PackedFunc& GetPackedFunc(const std::string& name) {
-  const PackedFunc* pf = tvm::runtime::Registry::Get(name);
-  CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
-  return *pf;
-}
-
-/* Value Implementation */
-Closure ClosureNode::make(tvm::Map<Var, Value> env, Function func) {
-  NodePtr<ClosureNode> n = make_node<ClosureNode>();
-  n->env = std::move(env);
-  n->func = std::move(func);
-  return Closure(n);
-}
-
-TVM_REGISTER_API("relay._make.Closure")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      *ret = ClosureNode::make(args[0], args[1]);
-    });
-
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<ClosureNode>([](const ClosureNode* node, tvm::IRPrinter* p) {
-      p->stream << "ClosureNode(" << node->func << ")";
-    });
-
-TupleValue TupleValueNode::make(tvm::Array<Value> value) {
-  NodePtr<TupleValueNode> n = make_node<TupleValueNode>();
-  n->fields = value;
-  return TupleValue(n);
-}
-
-TVM_REGISTER_API("relay._make.TupleValue")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      *ret = TupleValueNode::make(args[0]);
-    });
-
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<TupleValueNode>([](const TupleValueNode* node,
-                                     tvm::IRPrinter* p) {
-      p->stream << "TupleValueNode(" << node->fields << ")";
-    });
-
-TensorValue TensorValueNode::make(runtime::NDArray data) {
-  NodePtr<TensorValueNode> n = make_node<TensorValueNode>();
-  n->data = std::move(data);
-  return TensorValue(n);
-}
-
-TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
-    .set_dispatch<TensorValueNode>([](const TensorValueNode* node,
-                                      tvm::IRPrinter* p) {
-      auto to_str = GetPackedFunc("relay._tensor_value_repr");
-      std::string data_str = to_str(GetRef<TensorValue>(node));
-      p->stream << "TensorValueNode(" << data_str << ")";
-    });
-
-TensorValue TensorValueNode::FromType(const Type& t) {
-  if (auto tt_node = t.as<TensorTypeNode>()) {
-    std::vector<int64_t> dims;
-
-    for (auto dim : tt_node->shape) {
-      auto int_node = dim.as<tvm::ir::IntImm>();
-      CHECK(int_node) << "expected concrete dimensions";
-      dims.push_back(int_node->value);
-    }
-
-    DLDataType dtype;
-    DLContext context;
-
-    switch (tt_node->dtype.code()) {
-      case halideir_type_int:
-        dtype.code = kDLInt;
-        break;
-      case halideir_type_uint:
-        dtype.code = kDLUInt;
-        break;
-      case halideir_type_float:
-        dtype.code = kDLFloat;
-        break;
-      default:
-        throw dmlc::Error("can not convert HalideIR type into DLTensor dtype");
-    }
-
-    dtype.bits = tt_node->dtype.bits();
-    dtype.lanes = tt_node->dtype.lanes();
-
-    // TODO(@jroesch): Is this the right place to place the tensor?
-    context.device_type = DLDeviceType::kDLCPU;
-    context.device_id = 0;
-    runtime::NDArray data = NDArray::Empty(dims, dtype, context);
-    return TensorValueNode::make(data);
-  } else {
-    LOG(FATAL) << "expected a tensor type";
-    return TensorValue();
-  }
-}
-
-TVM_REGISTER_API("relay._make.TensorValue")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      runtime::NDArray data = args[0];
-      *ret = TensorValueNode::make(data);
-    });
-
-/* Evaluator Implementation. */
-struct EvalError : dmlc::Error {
-  explicit EvalError(const std::string& msg) : Error(msg) {}
-};
-
-/*!
- * \brief A stack frame in the Relay interpreter.
- *
- * Contains a mapping from relay::Var to relay::Value.
- */
-struct Frame {
-  /*! \brief The set of local variables and arguments for the frame. */
-  tvm::Map<Var, Value> locals;
-
-  explicit Frame(tvm::Map<Var, Value> locals) : locals(locals) {}
-};
-
-/*!
- * \brief The call stack in the Relay interpreter.
- *
- * Contains a stack of frames; each corresponding to
- * a function call.
- */
-struct Stack {
-  /*! \brief The stack frames. */
-  std::vector<Frame> frames;
-  Stack() : frames() { frames.push_back(Frame({})); }
-
-  Frame& current_frame() { return frames.back(); }
-
-  Value Lookup(const Var& local) {
-    for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) {
-      auto elem = frame->locals.find(local);
-      if (elem != frame->locals.end()) {
-        return (*elem).second;
-      }
-    }
-
-    LOG(FATAL) << "could not find variable binding for " << local
-               << "address= " << local.operator->();
-    return Value();
-  }
-  /*!
-   * A wrapper around Frame to add RAII semantics to pushing and popping
-   * stack frames.
-   */
-  struct LocalFrame {
-    Stack& st;
-    explicit LocalFrame(Stack& st, const Frame& fr) : st(st) {
-      st.frames.push_back(fr);
-    }
-    ~LocalFrame() { st.frames.pop_back(); }
-  };
-};
-
-/*! \brief The equal comparator for expressions. */
-struct ExprEqual {
-  bool operator()(const Expr& a, const Expr& b) const {
-    return AlphaEqual(a, b);
-  }
-};
-
-struct Interpreter : ExprFunctor<Value(const Expr& n)> {
-  Module mod;
-  Stack stack;
-  using JitKey = Function;
-
-  using OpMap = std::unordered_map<JitKey, PackedFunc, StructuralHash, ExprEqual>;
-
-  OpMap operator_map_;
-
-  template <typename T>
-  T with_frame(const Frame& fr, const std::function<T()>& f) {
-    Stack::LocalFrame lf(stack, fr);
-    return f();
-  }
-
-  Interpreter(Module mod) : mod(mod), operator_map_() {}
-  Interpreter(Module mod, OpMap operator_map) : mod(mod), operator_map_(operator_map) {}
-
-  void extend(const Var& id, Value v) {
-    this->stack.current_frame().locals.Set(id, v);
-  }
-
-  inline Value Lookup(const Var& local) {
-    return this->stack.Lookup(local);
-  }
-
-  Value Eval(const Expr& expr) {
-    return (*this)(expr);
-  }
-
-  Value VisitExpr(const Expr& expr) override {
-    RELAY_LOG(INFO) << "VisitExpr: " << expr << std::endl;
-    auto ret = ExprFunctor<Value(const Expr& n)>::VisitExpr(expr);
-    return ret;
-  }
-
-  Value VisitExpr_(const VarNode* var_node) override {
-    return Lookup(GetRef<Var>(var_node));
-  }
-
-  Value VisitExpr_(const GlobalVarNode* op) override {
-    return Eval(this->mod->Lookup(GetRef<GlobalVar>(op)));
-  }
-
-  Value VisitExpr_(const OpNode* id) override {
-    // TODO(@jroesch): Eta-expand and return in this case.
-    throw EvalError(
-        "internal error, need to wrap intrinsic into call synthetic call node "
-        "in "
-        "this case, eta expand");
-  }
-
-  Value VisitExpr_(const ConstantNode* op) override {
-    return TensorValueNode::make(op->data);
-  }
-
-  Value VisitExpr_(const TupleNode* op) override {
-    std::vector<Value> values;
-
-    for (const auto& field : op->fields) {
-      Value field_value = Eval(field);
-      values.push_back(field_value);
-    }
-
-    return TupleValueNode::make(values);
-  }
-
-  Value VisitExpr_(const FunctionNode* func_node) override {
-    auto func = GetRef<Function>(func_node);
-    tvm::Map<Var, Value> captured_mod;
-    Array<Var> free_vars = FreeVars(func);
-
-    for (const auto& var : free_vars) {
-      captured_mod.Set(var, Eval(var));
-    }
-
-    return ClosureNode::make(captured_mod, func);
-  }
-
-  inline Value InvokeCompiledOp(PackedFunc func, const Array<Value>& args,
-                                Type ret_type) {
-    // Marshal the arguments.
-    auto arg_len = args.size() + 1;
-    std::vector<TVMValue> values(arg_len);
-    std::vector<int> codes(arg_len);
-    TVMArgsSetter setter(values.data(), codes.data());
-    TVMRetValue ret;
-
-    // We need real type information to properly allocate the structure.
-    for (size_t i = 0; i < args.size(); i++) {
-      if (const TensorValueNode* tv = args[i].as<TensorValueNode>()) {
-        setter(i, tv->data);
-      }
-    }
-
-    // TVM's calling convention is that the final argument is the output
-    // buffer. To preserve the illusion of being a functional language
-    // we need to allocate space for the output buffer based on the
-    // return type.
-    CHECK(ret_type.as<TensorTypeNode>());
-
-    auto out_tensor = TensorValueNode::FromType(ret_type);
-
-    setter(arg_len - 1, out_tensor->data);
-    func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &ret);
-    return out_tensor;
-  }
-
-  Value Invoke(const Closure& closure, const tvm::Array<Value>& args) {
-    // Get a reference to the function inside the closure.
-    auto func = closure->func;
-    auto compiled = operator_map_.find(func);
-    tvm::Array<Function> funcs;
-    for (auto op : operator_map_) {
-      funcs.push_back(op.first);
-    }
-
-    // This case we know we have precompiled the operator.
-    if (compiled != operator_map_.end()) {
-      auto func_ty = func->func_type_annotation();
-      return InvokeCompiledOp(compiled->second, args, func_ty->ret_type);
-    }
-
-    // Allocate a frame with the parameters and free variables.
-    tvm::Map<Var, Value> locals;
-
-    CHECK_EQ(func->params.size(), args.size());
-
-    for (size_t i = 0; i < func->params.size(); i++) {
-      CHECK_EQ(locals.count(func->params[i]), 0);
-      locals.Set(func->params[i], args[i]);
-    }
-
-    // Add the var to value mappings from the Closure's modironment.
-    for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
-      CHECK_EQ(locals.count((*it).first), 0);
-      locals.Set((*it).first, (*it).second);
-    }
-
-    return with_frame<Value>(Frame(locals), [&]() { return Eval(func->body); });
-  }
-
-  Value VisitExpr_(const CallNode* call) override {
-    tvm::Array<Value> args;
-    for (auto arg : call->args) {
-      args.push_back(Eval(arg));
-    }
-
-    // We should not find operators after running fusion,
-    // and operator lowering.
-    //
-    // We have some functions cotaining chunks of operators
-    // which will be loaded into operator map.
-    if (auto op_node = call->op.as<OpNode>()) {
-      LOG(FATAL) << "found " << op_node->name
-                 << "; operators should be removed by future passes; try "
-                    "fusing and lowering";
-    }
-
-    // Now we just evaluate and expect to find a closure.
-    Value fn_val = Eval(call->op);
-    if (const ClosureNode* closure_node = fn_val.as<ClosureNode>()) {
-      auto closure = GetRef<Closure>(closure_node);
-      return this->Invoke(closure, args);
-    } else {
-      throw EvalError(
-          "internal error: type error, expected function value in the call "
-          "position");
-    }
-  }
-
-  Value VisitExpr_(const LetNode* op) override {
-    auto value = Eval(op->value);
-    this->extend(op->var, value);
-    return Eval(op->body);
-  }
-
-  Value VisitExpr_(const TupleGetItemNode* op) override {
-    Value val = Eval(op->tuple);
-    auto product_node = val.as<TupleValueNode>();
-    CHECK(product_node)
-      << "interal error: when evaluating TupleGetItem expected a tuple value";
-    CHECK_LT(static_cast<size_t>(op->index), product_node->fields.size())
-      << "internal error: index out of bounds";
-    return product_node->fields[op->index];
-  }
-
-  Value VisitExpr_(const IfNode* op) override {
-    Value v = Eval(op->cond);
-    if (const TensorValueNode* bv = v.as<TensorValueNode>()) {
-      // TODO(@jroesch, @MK): Refactor code into helper from DCE.
-      if (reinterpret_cast<uint8_t*>(bv->data->data)[0]) {
-        return Eval(op->true_branch);
-      } else {
-        return Eval(op->false_branch);
-      }
-    } else {
-      throw EvalError("type error, type system should have caught this");
-    }
-  }
-};
-
-Interpreter::OpMap CompileOperators(const Module& mod, const Expr& e) {
-  Interpreter::OpMap op_map;
-  auto lowered_ops = LowerOps(mod, e);
-  RELAY_LOG(INFO) << "LoweredFuncs: " << lowered_ops << std::endl;
-  if (lowered_ops.size()) {
-    const PackedFunc* fbuild_ptr = Registry::Get("relay.op.compiler._build");
-    CHECK(fbuild_ptr) << "Could not find registered function: relay.op.compiler._build";
-    auto fbuild = *fbuild_ptr;
-
-    // Collect the set of lowered functions to build a module.
-    Array<LoweredFunc> lowered_funcs;
-    for (auto lop : lowered_ops) {
-      lowered_funcs.push_back(lop->lowered_func);
-    }
-
-    runtime::Module module = fbuild(lowered_funcs);
-
-    // Loop over the lowered operations to map them into the operator map.
-    for (auto lop : lowered_ops) {
-      Function func = lop->func;
-      LoweredFunc lf = lop->lowered_func;
-
-      RELAY_LOG(INFO) << "LoweredFunc: " << lf->name << std::endl;
-      auto op_impl = module.GetFunction(lf->name);
-      op_map.insert({func, op_impl});
-    }
-  }
-
-  return op_map;
-}
-
-Value Evaluate(Module mod, Expr e) {
-  auto op_map = CompileOperators(mod, e);
-  Interpreter interp(mod, op_map);
-  return interp.Eval(e);
-}
-
-TVM_REGISTER_API("relay._interpreter.evaluate")
-    .set_body([](TVMArgs args, TVMRetValue* ret) {
-      Module mod = args[0];
-      Expr expr = args[1];
-      *ret = Evaluate(mod, expr);
-    });
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 993892a94861..43fdc68a4efe 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -34,10 +34,12 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 
 TensorType ConstantNode::tensor_type() const {
   auto dtype = TVMType2Type(data->dtype);
-
   Array<tvm::Expr> shape;
   for (int i = 0; i < data->ndim; i++) {
-    shape.push_back(tvm::ir::IntImm::make(HalideIR::Int(64), data->shape[i]));
+    CHECK_LE(data->shape[i], std::numeric_limits<int32_t>::max());
+    CHECK_GE(data->shape[i], std::numeric_limits<int32_t>::min());
+    shape.push_back(
+        tvm::ir::IntImm::make(Int(32), data->shape[i]));
   }
 
   return TensorTypeNode::make(shape, dtype);
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
index 96e805b5af2f..25651286ed9e 100644
--- a/src/relay/ir/op.cc
+++ b/src/relay/ir/op.cc
@@ -67,13 +67,15 @@ const GenericOpMap& Op::GetGenericAttr(const std::string& key) {
   return *it->second.get();
 }
 
-void OpRegistry::UpdateAttr(const std::string& key, TVMRetValue value,
+void OpRegistry::UpdateAttr(const std::string& key,
+                            TVMRetValue value,
                             int plevel) {
   OpManager* mgr = OpManager::Global();
   std::lock_guard<std::mutex> lock(mgr->mutex);
   std::unique_ptr<GenericOpMap>& op_map = mgr->attr[key];
   if (op_map == nullptr) {
     op_map.reset(new GenericOpMap());
+    op_map->attr_name_ = key;
   }
   uint32_t index = op_->index_;
   if (op_map->data_.size() <= index) {
@@ -112,31 +114,31 @@ TVM_REGISTER_API("relay.op._OpGetAttr")
     });
 
 TVM_REGISTER_API("relay.op._Register")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      std::string op_name = args[0];
-      std::string attr_key = args[1];
-      runtime::TVMArgValue value = args[2];
-      int plevel = args[3];
-      auto& reg =
-          OpRegistry::Registry()->__REGISTER_OR_GET__(op_name).set_name();
-      // enable resgiteration and override of certain properties
-      if (attr_key == "num_inputs" && plevel > 128) {
-        reg.set_num_inputs(value);
-      } else if (attr_key == "attrs_type_key" && plevel > 128) {
-        reg.set_attrs_type_key(value);
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    std::string op_name = args[0];
+    std::string attr_key = args[1];
+    runtime::TVMArgValue value = args[2];
+    int plevel = args[3];
+    auto& reg =
+        OpRegistry::Registry()->__REGISTER_OR_GET__(op_name).set_name();
+    // enable resgiteration and override of certain properties
+    if (attr_key == "num_inputs" && plevel > 128) {
+      reg.set_num_inputs(value);
+    } else if (attr_key == "attrs_type_key" && plevel > 128) {
+      reg.set_attrs_type_key(value);
+    } else {
+      // normal attr table override.
+      if (args[2].type_code() == kFuncHandle) {
+        // do an eager copy of the PackedFunc
+        PackedFunc f = args[2];
+        // If we get a function from frontend, avoid deleting it.
+        OpManager::Global()->frontend_funcs.push_back(new PackedFunc(f));
+        reg.set_attr(attr_key, f, plevel);
       } else {
-        // normal attr table override.
-        if (args[2].type_code() == kFuncHandle) {
-          // do an eager copy of the PackedFunc
-          PackedFunc f = args[2];
-          // If we get a function from frontend, avoid deleting it.
-          OpManager::Global()->frontend_funcs.push_back(new PackedFunc(f));
-          reg.set_attr(attr_key, f, plevel);
-        } else {
-          reg.set_attr(attr_key, args[2], plevel);
-        }
+        reg.set_attr(attr_key, args[2], plevel);
       }
-    });
+    }
+  });
 
 NodePtr<Node> CreateOp(const std::string& name) {
   auto op = Op::Get(name);
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 04f51a14ac5f..f28db371706e 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -271,7 +271,7 @@ class TextPrinter :
   TextValue VisitExpr_(const FunctionNode* op) final {
     TextValue id = AllocTempVar();
     std::ostringstream os;
-    os << id << " = function";
+    os << id << " = fn";
     this->PrintFuncInternal(os.str(), GetRef<Function>(op));
     this->PrintEndInst("\n");
     return id;
@@ -516,11 +516,14 @@ class TextPrinter :
         stream_ << ",\n";
       }
     }
-    stream_ << ") ";
+    stream_ << ')';
     if (fn->ret_type.defined()) {
-      stream_ << " -> ";
+      stream_ << '\n';
+      this->PrintIndent(decl_indent);
+      stream_ << "-> ";
       this->PrintType(fn->ret_type, stream_);
     }
+    stream_ << ' ';
     this->PrintScope(fn->body);
   }
   /*!
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
index d07b7f02cd67..6f8dce3875ae 100644
--- a/src/relay/op/op_common.h
+++ b/src/relay/op/op_common.h
@@ -9,6 +9,7 @@
 
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
 #include <vector>
 
 namespace tvm {
@@ -44,7 +45,8 @@ std::vector<T> AsVector(const Array<T> &array) {
       });                                                 \
   RELAY_REGISTER_OP(OpName)                               \
     .set_num_inputs(1)                                    \
-    .add_argument("data", "Tensor", "The input tensor.")
+    .add_argument("data", "Tensor", "The input tensor.")  \
+    .set_attr<TOpPattern>("TOpPattern", kElemWise)
 
 /*! Quick helper macro
  * - Expose a positional make function to construct the node.
@@ -68,7 +70,8 @@ std::vector<T> AsVector(const Array<T> &array) {
     .set_num_inputs(2)                                            \
     .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
     .add_argument("rhs", "Tensor", "The right hand side tensor.") \
-    .add_type_rel("Broadcast", BroadcastRel)
+    .add_type_rel("Broadcast", BroadcastRel)                      \
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index fe614aa4ea1c..171824fcd3ae 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -46,7 +46,7 @@ RELAY_REGISTER_BINARY_OP("relay.op._make.", "multiply")
 .describe("Elementwise multiply with broadcasting")
 .set_support_level(1);
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "pow")
+RELAY_REGISTER_BINARY_OP("relay.op._make.", "power")
 .describe("Elementwise power with broadcasting")
 .set_support_level(4);
 
@@ -65,7 +65,8 @@ RELAY_REGISTER_BINARY_OP("relay.op._make.", "mod")
     .set_num_inputs(2)                                              \
     .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
     .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
-    .add_type_rel("BroadcastComp", BroadcastCompRel)
+    .add_type_rel("BroadcastComp", BroadcastCompRel)                \
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)
 
 RELAY_REGISTER_CMP_OP("equal")
 .describe("Elementwise equal compare with broadcasting")
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index f5538331a778..2bd16a4f840f 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -3,32 +3,32 @@
  *
  * \file src/tvm/relay/pass/fuse_ops.cc
  *
- * \brief Fuse Relay eligble sequences of Relay operators into a single one.
- *
+ * \brief This is a backend-aware optimization pass.
+ *   Fuse necessary ops into a single one.
  */
+#include <tvm/ir_operator.h>
 #include <tvm/relay/pass.h>
-#include <tvm/runtime/module.h>
-#include <tvm/lowered_func.h>
-#include <tvm/operation.h>
 #include <tvm/relay/expr_functor.h>
-#include <tvm/relay/logging.h>
-#include "../ir/type_functor.h"
 
 namespace tvm {
 namespace relay {
 
-using namespace runtime;
-
-struct AbstractFusableOps : ExprMutator {
-  Module mod;
-  Array<GlobalVar> fusable_funcs;
-  int counter = 0;
-  size_t expr_hash;
-
-  AbstractFusableOps(Module mod, size_t expr_hash) : mod(mod), expr_hash(expr_hash) {}
+// Simple fuser that only makes each operator function as primitive.
+class SimpleFuser : public ExprMutator {
+ public:
+  // Skip primitive function.
+  Expr VisitExpr_(const FunctionNode* fn_node) {
+    NodeRef res = FunctionGetAttr(GetRef<Function>(fn_node), "Primitive");
+    const ir::IntImm* pval = res.as<ir::IntImm>();
+    if (pval && pval->value != 0) {
+      return GetRef<Expr>(fn_node);
+    } else {
+      return ExprMutator::VisitExpr_(fn_node);
+    }
+  }
 
   Expr VisitExpr_(const CallNode* call) {
-    if (auto op_node = call->op.as<OpNode>()) {
+    if (call->op.as<OpNode>()) {
       // Placeholder fusion algorithm which abstracts
       // single definitions into functions only.
       Array<Var> params;
@@ -37,50 +37,37 @@ struct AbstractFusableOps : ExprMutator {
 
       int param_number = 0;
       for (auto arg : call->args) {
-        auto name = std::string("p") + std::to_string(param_number++);
+        std::ostringstream os;
+        os << "p" << param_number++;
         auto type = arg->checked_type();
-        auto var = VarNode::make(name, type);
+        auto var = VarNode::make(os.str(), type);
         params.push_back(var);
         inner_args.push_back(var);
-        args.push_back(VisitExpr(arg));
+        args.push_back(this->Mutate(arg));
       }
-
       auto body = CallNode::make(call->op, inner_args, call->attrs);
-      auto func = FunctionNode::make(params, body, call->checked_type(), {});
+      auto func = FunctionNode::make(
+          params, body, call->checked_type(), {});
       func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
-      std::string func_name = "fused_";
-      func_name += op_node->name;
-      func_name += "_";
-      func_name += std::to_string(counter++);
-      func_name += "_";
-      func_name += std::to_string(expr_hash);
-      auto gv = GlobalVarNode::make(func_name);
-      mod->Add(gv, func);
-      fusable_funcs.push_back(gv);
-      return CallNode::make(gv, args, Attrs());
+      return CallNode::make(func, args, Attrs());
     } else {
       return ExprMutator::VisitExpr_(call);
     }
   }
 };
 
-Expr FuseOps(const Module& mod, const Expr& e) {
+
+Expr FuseOps(const Expr& expr) {
   // First we convert all chains of fusable ops into
   // abstracted functions which we mark as primtive
   // then we convert these primtive functions into
   // new operators.
-  auto abstract = AbstractFusableOps(mod, StructuralHash()(e));
-  auto abstracted_e = abstract.VisitExpr(e);
-  RELAY_LOG(INFO) << "FuseOps: before=" << e
-                  << "Fuse: after=" << abstracted_e;
-  return abstracted_e;
+  return SimpleFuser().Mutate(expr);
 }
 
 TVM_REGISTER_API("relay._ir_pass.FuseOps")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = FuseOps(args[1], args[0]);
+    *ret = FuseOps(args[0]);
 });
-
-
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/lower_ops.cc b/src/relay/pass/lower_ops.cc
deleted file mode 100644
index 55102fe5cf67..000000000000
--- a/src/relay/pass/lower_ops.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/*!
- * Copyright (c) 2018 by Contributors
- *
- * \file src/tvm/relay/pass/lower_ops.cc
- *
- * \brief Lower a Relay program to set of TVM operators.
- *
- */
-#include <tvm/lowered_func.h>
-#include <tvm/operation.h>
-#include <tvm/build_module.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/logging.h>
-#include <tvm/relay/pass.h>
-#include <tvm/runtime/module.h>
-#include <tvm/relay/build_module.h>
-#include "../ir/type_functor.h"
-
-namespace tvm {
-namespace relay {
-
-using namespace runtime;
-
-LoweredOp LoweredOpNode::make(Function func, LoweredFunc lowered_func) {
-  auto node = make_node<LoweredOpNode>();
-  node->func = func;
-  node->lowered_func = lowered_func;
-  return LoweredOp(node);
-}
-
-struct AbstractLocalFunctions : ExprMutator {
-  Module mod;
-  size_t expr_hash;
-  int counter = 0;
-  std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
-  explicit AbstractLocalFunctions(Module mod)
-      : mod(mod), expr_hash(0), counter(0), visited_funcs() {}
-
-  Expr Abstract(const Expr& e) {
-    expr_hash = StructuralHash()(e);
-    return VisitExpr(e);
-  }
-
-  Expr VisitExpr_(const GlobalVarNode* gvar_node) final {
-    auto gvar = GetRef<GlobalVar>(gvar_node);
-    auto it = visited_funcs.find(gvar);
-    if (it == visited_funcs.end()) {
-      auto func = mod->Lookup(gvar);
-      visited_funcs.insert(gvar);
-      auto new_func = FunctionNode::make(
-        func->params,
-        VisitExpr(func->body),
-        func->ret_type,
-        func->type_params,
-        func->attrs);
-      mod->Update(gvar, new_func);
-    }
-    return gvar;
-  }
-
-  Expr VisitExpr_(const FunctionNode* func_node) final {
-    Function func = GetRef<Function>(func_node);
-    auto free_vars = FreeVars(func);
-    Array<Var> params;
-    for (auto free_var : free_vars) {
-      auto var = VarNode::make("free_var", free_var->checked_type());
-      params.push_back(var);
-    }
-    std::string abs_func = "abstracted_func_";
-    abs_func += std::to_string(counter++);
-    abs_func += std::to_string(expr_hash);
-    auto gv = GlobalVarNode::make(abs_func);
-    auto lifted_func = FunctionNode::make(params, func, Type(), {}, {});
-    mod->Add(gv, lifted_func);
-    Array<Expr> args;
-    for (auto free_var : free_vars) {
-      args.push_back(free_var);
-    }
-    return CallNode::make(gv, args, {});
-  }
-};
-
-struct LiveFunctions : ExprVisitor {
-  Module mod;
-  explicit LiveFunctions(Module mod) : mod(mod), global_funcs() {}
-
-  std::unordered_set<GlobalVar, NodeHash, NodeEqual> visited_funcs;
-  std::unordered_set<GlobalVar, NodeHash, NodeEqual> global_funcs;
-
-  void Live(const Expr& e) {
-    CHECK(!e.as<FunctionNode>())
-        << "functions should of been transformed away by previous pass";
-    VisitExpr(e);
-  }
-
-  void VisitExpr_(const FunctionNode* func_node) {
-    LOG(FATAL) << "functions should of been transformed away by previous pass";
-  }
-
-  void VisitExpr_(const GlobalVarNode* var_node) final {
-    GlobalVar var = GetRef<GlobalVar>(var_node);
-    auto it = visited_funcs.find(var);
-    if (it == visited_funcs.end()) {
-      auto func = mod->Lookup(var);
-      visited_funcs.insert(var);
-      // The last pass has trasnformed functions of the form:
-      //
-      // let x = fn (p_1, ..., p_n) { ... };
-      // ...
-      //
-      // into, a top-level declaration:
-      //
-      // def abs_f(fv_1, ..., fv_n) {
-      //    return (fn (p_1...,p_N) { ... };)
-      // }
-      //
-      // and:
-      //
-      // let x = abs_f(fv_1, ... fv_n);
-      //
-      // The only other case we can handle is
-      //
-      // fn foo(...) { body }
-      //
-      // We just search through the body in this case.
-      if (auto inner_func = func->body.as<FunctionNode>()) {
-        return VisitExpr(inner_func->body);
-      } else {
-        return VisitExpr(func->body);
-      }
-    }
-  }
-
-  void VisitExpr_(const CallNode* call) final {
-    RELAY_LOG(INFO) << "LiveOps: CallNode=" << GetRef<Call>(call);
-    if (auto gv_node = call->op.as<GlobalVarNode>()) {
-      GlobalVar gvar = GetRef<GlobalVar>(gv_node);
-      Function func = mod->Lookup(gvar);
-
-      auto attr = FunctionGetAttr(func, "Primitive");
-
-      if (attr.defined() && Downcast<Integer>(attr)->value == 1) {
-        global_funcs.insert(gvar);
-      } else {
-         VisitExpr(gvar);
-      }
-
-      // Finally we need to ensure to visit all the args no matter what.
-      for (auto arg : call->args) {
-        VisitExpr(arg);
-      }
-    } else {
-      return ExprVisitor::VisitExpr_(call);
-    }
-  }
-};
-
-using FCompute = TypedPackedFunc<Array<Tensor>(
-    const Attrs&, const Array<Tensor>&, Type, tvm::Target)>;
-using FSchedule = TypedPackedFunc<Schedule(const Array<Tensor>&, tvm::Target)>;
-
-/*! \brief Return the set of operators in their TVM format. */
-Array<LoweredOp> LowerOps(const Module& mod, const Expr& e,
-                          const std::string& target) {
-  RELAY_LOG(INFO) << "LowerOps: e=" << e;
-  auto flower_ptr = Registry::Get("relay.op.compiler._lower");
-  CHECK(flower_ptr);
-  PackedFunc flower = *flower_ptr;
-
-  auto abstracted_e = AbstractLocalFunctions(mod).Abstract(e);
-  auto live_funcs = LiveFunctions(mod);
-  live_funcs.VisitExpr(abstracted_e);
-
-  auto schedule_reg = Op::GetAttr<FSchedule>("FTVMSchedule");
-  auto compute_reg = Op::GetAttr<FCompute>("FTVMCompute");
-
-  Array<LoweredOp> lowered_funcs;
-
-  for (auto func_name : live_funcs.global_funcs) {
-    auto func = mod->Lookup(func_name);
-    auto call = Downcast<Call>(func->body);
-    auto op_node = call->op.as<OpNode>();
-    CHECK(op_node) << "violated invariant that primtive calls contain a single op call";
-    auto op = GetRef<Op>(op_node);
-    RELAY_LOG(INFO) << "LowerOps: Lowering " << op->name;
-
-    CHECK(IsPrimitiveOp(op)) << "failed to lower "
-      << op->name << "can only lower primitve operations";
-
-    Array<Tensor> inputs;
-    std::string input_name = "in";
-    int i = 0;
-    for (auto type_arg : call->type_args) {
-      auto tt = Downcast<TensorType>(type_arg);
-      inputs.push_back(PlaceholderOpNode::make(input_name + std::to_string(i),
-                                               tt->shape, tt->dtype)
-                           .output(0));
-      i++;
-    }
-
-    auto output_tt = call->checked_type();
-    auto target_node = Target::create(target);
-    Array<Tensor> outputs =
-        compute_reg[op](call->attrs, inputs, output_tt, target_node);
-    auto schedule = schedule_reg[op](outputs, target_node);
-    size_t hash = StructuralHash()(func);
-    LoweredFunc lf =
-        flower(op->name + std::to_string(hash), schedule, inputs, outputs);
-    func = FunctionSetAttr(func, "LoweredFunc", lf);
-    mod->Add(func_name, func, true);
-    lowered_funcs.push_back(LoweredOpNode::make(func, lf));
-  }
-
-  return lowered_funcs;
-}
-
-TVM_REGISTER_API("relay._ir_pass.LowerOps")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = LowerOps(args[0], args[1], args[2]);
-});
-
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index b566b9a3f608..d9435d33903d 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -22,27 +22,6 @@
 namespace tvm {
 namespace runtime {
 
-/*!
- * \brief The name of Device API factory.
- * \param type The device type.
- */
-inline std::string DeviceName(int type) {
-  switch (type) {
-    case kDLCPU: return "cpu";
-    case kDLGPU: return "gpu";
-    case kDLOpenCL: return "opencl";
-    case kDLSDAccel: return "sdaccel";
-    case kDLAOCL: return "aocl";
-    case kDLVulkan: return "vulkan";
-    case kDLMetal: return "metal";
-    case kDLVPI: return "vpi";
-    case kDLROCM: return "rocm";
-    case kOpenGL: return "opengl";
-    case kDLExtDev: return "ext_dev";
-    default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
-  }
-}
-
 class DeviceAPIManager {
  public:
   static const int kMaxDeviceAPI = 32;
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index c4562d1c50e2..52bd07b70f75 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -187,8 +187,8 @@ void GraphRuntime::SetupStorage() {
     CHECK_GE(storage_id, 0) << "Do not support runtime shape op";
     DLDataType t = vtype[i];
     size_t bits = t.bits * t.lanes;
-    CHECK_EQ(bits % 8U, 0U);
-    size_t bytes = (bits / 8U) * size;
+    CHECK(bits % 8U ==  0U || bits ==1U);
+    size_t bytes = ((bits + 7U) / 8U) * size;
 
     uint32_t sid = static_cast<uint32_t>(storage_id);
     if (sid >= pool_entry.size()) {
diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py
new file mode 100644
index 000000000000..568d7849e7ee
--- /dev/null
+++ b/tests/python/relay/test_backend_compile_engine.py
@@ -0,0 +1,38 @@
+import tvm
+import tvm.testing
+import numpy as np
+from tvm import relay
+
+
+def test_compile_engine():
+    engine = relay.backend.compile_engine.get()
+    def get_func(shape):
+        x = relay.var("x", shape=shape)
+        y = relay.add(x, x)
+        z = relay.add(y, x)
+        f = relay.ir_pass.infer_type(relay.Function([x], z))
+        return f
+    z1 = engine.lower(get_func((10,)), "llvm")
+    z2 = engine.lower(get_func((10,)), "llvm")
+    z3 = engine.lower(get_func(()), "llvm")
+    assert z1.same_as(z2)
+    assert not z3.same_as(z1)
+    if tvm.context("cuda").exist:
+        z4 = engine.lower(get_func(()), "cuda")
+        assert not z3.same_as(z4)
+
+    # Test JIT target
+    for target in ["llvm"]:
+        ctx = tvm.context(target)
+        if ctx.exist:
+            f = engine.jit(get_func((10,)), target)
+            x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx)
+            y = tvm.nd.empty((10,), ctx=ctx)
+            f(x, y)
+            tvm.testing.assert_allclose(
+                y.asnumpy(), x.asnumpy() * 3)
+    engine.dump()
+
+
+if __name__ == "__main__":
+    test_compile_engine()
diff --git a/tests/python/relay/test_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
similarity index 92%
rename from tests/python/relay/test_graph_runtime.py
rename to tests/python/relay/test_backend_graph_runtime.py
index 7b89831dbfce..7f857b72ad1c 100644
--- a/tests/python/relay/test_graph_runtime.py
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -1,9 +1,7 @@
 import numpy as np
 
 from tvm import relay
-from tvm.relay import create_executor
 from tvm.relay.ir_pass import infer_type
-from tvm.relay.interpreter import Interpreter
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay.op import add
 from tvm.relay.module import Module
@@ -25,8 +23,8 @@ def check_rts(expr, args, expected_result, mod=None):
     expected_result:
         The expected result of running the expression.
     """
-    intrp = create_executor('graph', mod=mod)
-    graph = create_executor('graph', mod=mod)
+    intrp = relay.create_executor('debug', mod=mod)
+    graph = relay.create_executor('graph', mod=mod)
     eval_result = intrp.evaluate(expr)(*args)
     rts_result = graph.evaluate(expr)(*args)
     np.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
diff --git a/tests/python/relay/test_interpreter.py b/tests/python/relay/test_backend_interpreter.py
similarity index 73%
rename from tests/python/relay/test_interpreter.py
rename to tests/python/relay/test_backend_interpreter.py
index b7214965db22..c9f689f7baee 100644
--- a/tests/python/relay/test_interpreter.py
+++ b/tests/python/relay/test_backend_interpreter.py
@@ -1,16 +1,23 @@
 import numpy as np
 import tvm
+import tvm.testing
 from tvm import relay
-from tvm.relay.interpreter import Value, TupleValue
-from tvm.relay import op
+from tvm.relay.backend.interpreter import Value, TupleValue
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay import testing, create_executor
 
 
 def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
-    intrp = create_executor(mod=mod)
-    result = intrp.evaluate(expr)(*args)
-    np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
+    # TODO(tqchen) add more types once the schedule register is fixed.
+    for target in ["llvm"]:
+        ctx = tvm.context(target, 0)
+        if not ctx.exist:
+            return
+        intrp = create_executor(mod=mod, ctx=ctx, target=target)
+        result = intrp.evaluate(expr)(*args)
+        # use tvm.testing which also set atol
+        tvm.testing.assert_allclose(
+            result.asnumpy(), expected_result, rtol=rtol)
 
 
 def test_from_scalar():
@@ -34,7 +41,7 @@ def test_id():
 
 
 def test_add_const():
-    two = op.add(relay.const(1), relay.const(1))
+    two = relay.add(relay.const(1), relay.const(1))
     func = relay.Function([], two)
     check_eval(func, [], 2)
 
@@ -42,7 +49,7 @@ def test_add_const():
 def test_mul_param():
     x = relay.var('x', shape=(10, 10))
     y = relay.var('y', shape=(1, 10))
-    func = relay.Function([x, y], op.multiply(x, y))
+    func = relay.Function([x, y], relay.multiply(x, y))
     x_data = np.random.rand(10, 10).astype('float32')
     y_data = np.random.rand(1, 10).astype('float32')
     check_eval(func, [x_data, y_data], x_data * y_data)
@@ -53,7 +60,7 @@ def test_mul_param():
 # def test_dense():
 #     x = relay.var('x', shape=(10, 10))
 #     w = relay.var('w', shape=(10, 10))
-#     y = op.nn.dense(x, w)
+#     y = relay.nn.dense(x, w)
 #     func = relay.Function([x, w], y)
 #     x_data = np.random.rand(10, 10).astype('float32')
 #     w_data = np.random.rand(10, 10).astype('float32')
@@ -63,7 +70,7 @@ def test_mul_param():
 #     x = relay.var('x', shape=(10, 10))
 #     w = relay.var('w', shape=(10, 10))
 #     b = relay.var('b', shape=(10,))
-#     y = op.add(op.nn.dense(x, w), b)
+#     y = relay.add(relay.nn.dense(x, w), b)
 #     func = relay.Function([x, w, b], y)
 #     x_data = np.random.rand(10, 10).astype('float32')
 #     w_data = np.random.rand(10, 10).astype('float32')
@@ -73,46 +80,49 @@ def test_mul_param():
 def test_equal():
     i = relay.var('i', shape=[], dtype='int32')
     j = relay.var('i', shape=[], dtype='int32')
-    z = op.equal(i, j)
+    z = relay.equal(i, j)
     func = relay.Function([i, j], z, ret_type=relay.TensorType([], 'bool'))
     i_data = relay.const(0)
     j_data = relay.const(0)
     check_eval(func, [i_data, j_data], True)
 
+
 def test_subtract():
     i = relay.var('i', shape=[], dtype='int32')
-    sub = op.subtract(i, relay.const(1, dtype='int32'))
+    sub = relay.subtract(i, relay.const(1, dtype='int32'))
     func = relay.Function([i], sub, ret_type=relay.TensorType([], 'int32'))
     i_data = np.array(1, dtype='int32')
     check_eval(func, [i_data], 0)
 
+
 def test_simple_loop():
     mod = relay.module.Module({})
     sum_up = relay.GlobalVar('sum_up')
     i = relay.var('i', shape=[], dtype='int32')
     sb = ScopeBuilder()
-    with sb.if_scope(op.equal(i, relay.const(0, dtype='int32'))):
+    with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))):
         sb.ret(i)
     with sb.else_scope():
-        one_less = op.subtract(i, relay.const(1, dtype='int32'))
+        one_less = relay.subtract(i, relay.const(1, dtype='int32'))
         rec_call = relay.Call(sum_up, [one_less])
-        sb.ret(op.add(rec_call, i))
+        sb.ret(relay.add(rec_call, i))
     func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
     mod[sum_up] = func
     i_data = np.array(10, dtype='int32')
     check_eval(sum_up, [i_data], sum(range(1, 11)), mod=mod)
 
+
 def test_loop():
     mod = relay.module.Module({})
     sum_up = relay.GlobalVar('sum_up')
     i = relay.var('i', shape=[], dtype='int32')
     accum = relay.var('accum', shape=[], dtype='int32')
     sb = ScopeBuilder()
-    with sb.if_scope(op.equal(i, relay.const(0))):
+    with sb.if_scope(relay.equal(i, relay.const(0))):
         sb.ret(accum)
     with sb.else_scope():
-        one_less = op.subtract(i, relay.const(1))
-        new_accum = op.add(accum, i)
+        one_less = relay.subtract(i, relay.const(1))
+        new_accum = relay.add(accum, i)
         sb.ret(relay.Call(sum_up, [one_less, new_accum]))
     func = relay.Function([i, accum], sb.get())
     mod[sum_up] = func
@@ -120,19 +130,21 @@ def test_loop():
     accum_data = np.array(0, dtype='int32')
     check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), mod=mod)
 
-def test_mlp():
-    pass
-    # net = testing.mlp.get_workload(1)
-    # import pdb; pdb.set_trace()
+
+def test_binds():
+    x = relay.var("x")
+    y = relay.add(x, x)
+    intrp = create_executor("debug")
+    xx = np.ones((10, 20))
+    res = intrp.evaluate(y, binds={x: xx}).asnumpy()
+    tvm.testing.assert_allclose(xx + xx, res)
+
 
 if __name__ == "__main__":
     test_id()
     test_add_const()
-    # test_dense()
-    # test_linear()
     test_equal()
     test_subtract()
     test_simple_loop()
     test_loop()
-    test_mlp()
-
+    test_binds()
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 7ab13409cc43..477207dcef5e 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -2,7 +2,7 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay.interpreter import create_executor
+from tvm.relay.testing import ctx_list
 
 def sigmoid(x):
     one = np.ones_like(x)
@@ -27,10 +27,15 @@ def check_single_op(opfunc, ref):
 
         if ref is not None:
             data = np.random.rand(*shape).astype(dtype)
-            intrp = create_executor()
-            op_res = intrp.evaluate(y, { x: relay.const(data) })
             ref_res = ref(data)
-            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+            func = relay.Function([x], y)
+            for target, ctx in ctx_list():
+                # use graph by execuor default for testing, as we need
+                # create function explicitly to avoid constant-folding.
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(data)
+                np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
 
     for opfunc, ref in [(tvm.relay.log, np.log),
                    (tvm.relay.exp, np.exp),
@@ -67,14 +72,17 @@ def check_binary_op(opfunc, ref):
             z = opfunc(x, y)
             x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
             y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
-            intrp = create_executor()
-            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
             ref_res = ref(x_data, y_data)
-            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+            func = relay.Function([x, y], z)
+            for target, ctx in ctx_list():
+                # use graph by execuor default for testing, as we need
+                # create function explicitly to avoid constant-folding.
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
     for opfunc, ref in [(relay.add, np.add),
                    (relay.subtract, np.subtract),
-                   (relay.mod, np.mod),
                    (relay.multiply, np.multiply),
                    (relay.divide, np.divide)]:
         check_binary_op(opfunc, ref)
@@ -116,7 +124,7 @@ def test_log_softmax():
     assert yy.checked_type == relay.TensorType((n, d))
 
 
-def test_concatenate_infer_type():
+def test_concatenate():
     n, t, d = tvm.var("n"), tvm.var("t"), 100
     x = relay.var("x", shape=(n, t, d))
     y = relay.var("y", shape=(n, t, d))
@@ -134,15 +142,23 @@ def test_concatenate_infer_type():
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.TensorType((n, t + t, 100))
 
-    # x = relay.var("x", shape=(10, 5))
-    # y = relay.var("y", shape=(10, 5))
-    # z = relay.concatenate((x, y), axis=1)
-    # intrp = create_executor()
-    # x_data = np.random.rand(10, 5).astype('float32')
-    # y_data = np.random.rand(10, 5).astype('float32')
-    # op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
-    # ref_res = np.concatenate(x_data, y_data, axis=1)
-    # np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+    x = relay.var("x", shape=(10, 5))
+    y = relay.var("y", shape=(10, 5))
+    z = relay.concatenate((x, y), axis=1)
+
+    # Check result.
+    func = relay.Function([x, y], z)
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(10, 5).astype('float32')
+    ref_res = np.concatenate((x_data, y_data), axis=1)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, y_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=0.01)
+        op_res2 = intrp2.evaluate(func)(x_data, y_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=0.01)
 
 def test_dropout():
     n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
@@ -206,7 +222,7 @@ def test_batch_norm():
     test_unary_op()
     test_binary_op()
     test_expand_dims_infer_type()
-    test_concatenate_infer_type()
+    test_concatenate()
     test_softmax()
     test_log_softmax()
     test_dropout()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index d20997010b4c..6fd70c386567 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -1,7 +1,7 @@
 import tvm
 import numpy as np
 from tvm import relay
-from tvm.relay import create_executor
+from tvm.relay.testing import ctx_list
 
 
 def test_binary_op():
@@ -24,12 +24,15 @@ def check_binary_op(opfunc, ref):
             z = opfunc(x, y)
             x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
             y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
-            intrp = create_executor()
-            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
             ref_res = ref(x_data, y_data)
-            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+            func = relay.Function([x, y], z)
 
-    for opfunc, ref in [(relay.pow, np.power)]:
+            for target, ctx in ctx_list():
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+    for opfunc, ref in [(relay.power, np.power)]:
         check_binary_op(opfunc, ref)
 
 
@@ -57,15 +60,19 @@ def test_cmp_type():
             z = op(x, y)
             x_data = np.random.rand(*x_shape).astype(t1.dtype)
             y_data = np.random.rand(*y_shape).astype(t2.dtype)
-            intrp = create_executor()
-            op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
             ref_res = ref(x_data, y_data)
-            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+            func = relay.Function([x, y], z)
+
+            for target, ctx in ctx_list():
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
 
 
 def test_binary_int_broadcast():
     for op, ref in [(relay.right_shift, np.right_shift),
                (relay.left_shift, np.left_shift),
+                (relay.mod, np.mod),
                (relay.maximum, np.maximum),
                (relay.minimum, np.minimum)]:
         x = relay.var("x", relay.TensorType((10, 4), "int32"))
@@ -81,10 +88,14 @@ def test_binary_int_broadcast():
         t2 = relay.TensorType(y_shape, 'int32')
         x_data = np.random.rand(*x_shape).astype(t1.dtype)
         y_data = np.random.rand(*y_shape).astype(t2.dtype)
-        intrp = create_executor()
-        op_res = intrp.evaluate(z, { x: relay.const(x_data), y: relay.const(y_data) })
+        func = relay.Function([x, y], z)
         ref_res = ref(x_data, y_data)
-        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+        for target, ctx in ctx_list():
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data, y_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
 
 def test_where():
     cond = relay.var("cond", relay.TensorType((3, 4), "float32"))
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
new file mode 100644
index 000000000000..2bbc1dce9693
--- /dev/null
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -0,0 +1,17 @@
+import tvm
+from tvm import relay
+
+def test_fuse_simple():
+    """Simple testcase."""
+    x = relay.var("x", shape=(10, 20))
+    y = relay.add(x, x)
+    z = relay.exp(y)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z)
+    zz = relay.ir_pass.fuse_ops(zz)
+    zz = relay.ir_pass.infer_type(zz)
+    zz.astext()
+
+
+if __name__ == "__main__":
+    test_fuse_simple()
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index 46929c6eb1f8..4cc6cb7e9f34 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -3,10 +3,9 @@
 from __future__ import absolute_import as _abs
 import tvm
 import topi
-from . import tag
 from . import cpp
 
-@tvm.tag_scope(tag=tag.BROADCAST)
+
 def expand_dims(a, axis, num_newaxis=1):
     """Expand the shape of an array.
 
@@ -25,7 +24,6 @@ def expand_dims(a, axis, num_newaxis=1):
     return cpp.expand_dims(a, axis, num_newaxis)
 
 
-@tvm.tag_scope(tag=tag.BROADCAST)
 def expand_like(a, shape_like, axis):
     """Expand an input array with the shape of second array.
     This operation can always be composed of unsqueezing and
@@ -79,7 +77,6 @@ def _compute(*idxs):
     return tvm.compute(shape_like.shape, _compute)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def transpose(a, axes=None):
     """Permute the dimensions of an array.
 
@@ -141,7 +138,6 @@ def strided_slice(a, begin, end, strides=None):
     return cpp.strided_slice(a, begin, end, strides)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def reshape(a, newshape):
     """Reshape the array
 
@@ -159,7 +155,6 @@ def reshape(a, newshape):
     return cpp.reshape(a, newshape)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def squeeze(a, axis=None):
     """Remove single-dimensional entries from the shape of an array.
 
@@ -178,7 +173,6 @@ def squeeze(a, axis=None):
     return cpp.squeeze(a, axis)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def concatenate(a_tuple, axis=0):
     """Join a sequence of arrays along an existing axis.
 
@@ -197,7 +191,6 @@ def concatenate(a_tuple, axis=0):
     return cpp.concatenate(a_tuple, axis)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def split(ary, indices_or_sections, axis=0):
     """Split an array into multiple sub-arrays.
 

From 365b52baca6d94e7723cf823b7e738f268ccd22a Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Mon, 5 Nov 2018 19:55:04 -0800
Subject: [PATCH 330/529] print import_llvm ir in tensorize tutorial (#2064)

---
 tutorials/language/tensorize.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
index 762068457e4b..4115de1b2eb4 100644
--- a/tutorials/language/tensorize.py
+++ b/tutorials/language/tensorize.py
@@ -154,6 +154,12 @@ def gemv_impl():
 # The importing needs to happen before the tensorized GEMV being executed.
 #
 s[C].pragma(x, "import_llvm", gemv_impl())
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Finally we compare the tensorize version with that :code:`numpy.dot` produces,
+# ensure our implementation is correct.
+#
 func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
 
 from topi.util import get_const_tuple
@@ -166,12 +172,11 @@ def gemv_impl():
 tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
 
 ######################################################################
-# We compare the tensorize version with that :code:`numpy.dot` produces,
-# ensure our implementation is correct.
-#
 # Reduce-update for Tensorize
-# ------------------------------------
-# Let's then move one step forward.
+# ---------------------------
+# So far you have learned the basic idea of tensorize,
+# now let's move one step forward to a more complicated case.
+#
 # Assume our accelerator could only multiply a vector by a square matrix,
 # in which the vector size needs to be no larger than 16.
 # Given such hardware constrain, now we need to split the reduce axis as following,

From 187ee427c5c2dfc54c89dee305e478250bc58287 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 6 Nov 2018 11:55:21 +0800
Subject: [PATCH 331/529] Add a testcase of dilated conv2d int8 (#2065)

---
 topi/tests/python/test_topi_conv2d_int8.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
index cbffda95d8d6..fd5e91eed72d 100644
--- a/topi/tests/python/test_topi_conv2d_int8.py
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -119,6 +119,9 @@ def test_conv2d_nchw():
         verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True)
         verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
 
+        # dilation = 2
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
         # batch size
         verify_conv2d_NCHWc_int8(4, 64, 56, 64, 3, 1, 1)
         verify_conv2d_NCHWc_int8(9, 64, 56, 64, 3, 1, 1)

From ba3aeb2f6667438ea281df63f4626e53b9e3d6ef Mon Sep 17 00:00:00 2001
From: "Bob.Liu" <bofangliu@tuputech.com>
Date: Tue, 6 Nov 2018 11:57:45 +0800
Subject: [PATCH 332/529]  [FRONTEND][ONNX] fixed operator converter for Split
 in onnx frontend (#2038)

---
 nnvm/python/nnvm/frontend/onnx.py             | 19 +++++++++-
 .../python/frontend/onnx/test_forward.py      | 36 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index f89a94d1b3c8..34ab5cab7c06 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -464,6 +464,23 @@ def _impl_v1(cls, inputs, attr, params):
             inputs[0] = _sym.expand_dims(inputs[0], axis=axes, num_newaxis=1)
         return inputs[0]
 
+
+class Split(OnnxOpConverter):
+    """ Operator converter for Split.
+    """
+
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        attr['indices_or_sections'] = []
+        index = 0
+        for i in attr['split'][:-1]:
+            index += i
+            attr['indices_or_sections'].append(index)
+        return AttrCvt(
+            op_name='split',
+            ignores=['split'])(inputs, attr, params)
+
+
 class Slice(OnnxOpConverter):
     """ Operator converter for Slice.
     """
@@ -754,7 +771,7 @@ def _get_convert_map(opset):
         'Cast': Cast.get_converter(opset),
         'Reshape': Reshape.get_converter(opset),
         'Concat': Renamer('concatenate'),
-        'Split': AttrCvt('split', {'split': 'indices_or_sections'}),
+        'Split': Split.get_converter(opset),
         'Slice': Slice.get_converter(opset),
         'Transpose': AttrCvt('transpose', {'perm': 'axes'}),
         'Gather': Gather.get_converter(opset),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index e0d77277f98b..41b1703db215 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -712,6 +712,41 @@ def test_constantfill():
     verify_constantfill(False, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
     verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5, 4, 5, 6), 10, 'float32', extra_shape=(4, 5, 6))
 
+def verify_split(indata, outdatas, split, axis=0):
+    indata = np.array(indata).astype(np.float32)
+    outdatas = [np.array(o).astype(np.float32) for o in outdatas]
+    node = helper.make_node(
+        'Split',
+        inputs=['input'],
+        outputs=['output_{}'.format(i) for i in range(len(split))],
+        axis=axis,
+        split=split
+    )
+    graph = helper.make_graph([node],
+                              'split_test',
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output_{}".format(i),
+                                            TensorProto.FLOAT, list(outdatas[i].shape))
+                                            for i in range(len(split))
+                                         ])
+    model = helper.make_model(graph, producer_name='split_test')
+
+    for target, ctx in ctx_list():
+        output_shape = [o.shape for o in outdatas]
+        output_type = ['float32', 'float32', 'float32']
+        tvm_out = get_tvm_output(model, indata, target, ctx, output_shape, output_type)
+    for o, t in zip(outdatas, tvm_out):
+        tvm.testing.assert_allclose(o, t)
+
+def test_split():
+    # 1D
+    verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3., 4.], [5., 6.]], [2, 2, 2], 0)
+    verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3.], [4., 5., 6.]], [2, 1, 3], 0)
+    # 2D
+    verify_split([[1., 2., 3., 4.], [7., 8., 9., 10.]],
+                 [[[1., 2.], [7., 8.]], [[3., 4.], [9., 10.]]], [2, 2], 1)
+
 if __name__ == '__main__':
     # verify_super_resolution_example()
     # verify_squeezenet1_1()
@@ -737,3 +772,4 @@ def test_constantfill():
     test_forward_arg_min_max()
     test_softmax()
     test_constantfill()
+    test_split()

From a0e8998d3c0344f300b467cf94ac25fadda60f3a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 6 Nov 2018 08:47:49 -0800
Subject: [PATCH 333/529] [CODEGEN][LLVM] Cache packed func ptr, lift alloca
 (#2070)

---
 src/codegen/llvm/codegen_amdgpu.cc |  6 +++--
 src/codegen/llvm/codegen_cpu.cc    | 36 ++++++++++++++++++------------
 src/codegen/llvm/codegen_llvm.cc   |  6 +++--
 src/codegen/llvm/codegen_llvm.h    | 20 +++++++++++++++++
 src/codegen/llvm/codegen_nvptx.cc  |  6 +++--
 5 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index 9cccdf4466fd..d1a0716bc1d9 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -47,8 +47,10 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       if (info.scope.rank == runtime::StorageRank::kLocal) {
         // const int local_address_space = 5;
         // TODO(tqchen): for higher version of LLVM, local address space can be set.
-        llvm::AllocaInst* alloca = builder_->CreateAlloca(
-            LLVMType(op->type), ConstInt32(constant_size));
+        llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+            return builder_->CreateAlloca(
+                LLVMType(op->type), ConstInt32(constant_size));
+          });
         if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
           alloca->setAlignment(info.alignment);
         }
diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc
index 436c727f86f0..4e005346624b 100644
--- a/src/codegen/llvm/codegen_cpu.cc
+++ b/src/codegen/llvm/codegen_cpu.cc
@@ -503,7 +503,9 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
       handle_not_null, end_block, init_block, md_very_likely_branch_);
   // Initialize the handle if needed.
   builder_->SetInsertPoint(init_block);
-  llvm::Value* out = builder_->CreateAlloca(t_tvm_func_handle_);
+  llvm::Value* out = WithFunctionEntry([&]() {
+      return builder_->CreateAlloca(t_tvm_func_handle_);
+    });
   llvm::LoadInst* ctx = builder_->CreateAlignedLoad(
       gv_mod_ctx_, gv_mod_ctx_->getAlignment());
   ctx->setMetadata(
@@ -513,6 +515,8 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
       RuntimeTVMGetFuncFromEnv(), {ctx, GetConstString(fname), out});
   init_block = CheckCallSuccess(retcode);
   llvm::Value* loaded_handle = builder_->CreateAlignedLoad(out, align);
+  // Store the handle
+  builder_->CreateStore(loaded_handle, hptr);
   builder_->CreateBr(end_block);
   // end block
   builder_->SetInsertPoint(end_block);
@@ -637,19 +641,23 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) {
   } else if (op->is_intrinsic(intrinsic::tvm_stack_alloca)) {
     CHECK_EQ(op->args.size(), 2U);
     const std::string& type = op->args[0].as<StringImm>()->value;
-    llvm::Value* num = MakeValue(op->args[1]);
-    if (type == "shape") {
-      return builder_->CreateAlloca(t_tvm_shape_index_, num);
-    } else if (type == "arg_value") {
-      return builder_->CreateAlloca(t_tvm_value_, num);
-    } else if (type == "arg_tcode") {
-      return builder_->CreateAlloca(t_int_, num);
-    } else if (type == "array") {
-      return builder_->CreateAlloca(t_tvm_array_, num);
-    } else {
-      LOG(FATAL) << "Unknown stack alloca type " << type;
-      return nullptr;
-    }
+    return WithFunctionEntry([&]() -> llvm::AllocaInst* {
+        const int64_t* pval = as_const_int(op->args[1]);
+        CHECK(pval) << "require stack alloca to contain constant value";
+        llvm::Value* num = ConstInt32(pval[0]);
+        if (type == "shape") {
+          return builder_->CreateAlloca(t_tvm_shape_index_, num);
+        } else if (type == "arg_value") {
+          return builder_->CreateAlloca(t_tvm_value_, num);
+        } else if (type == "arg_tcode") {
+          return builder_->CreateAlloca(t_int_, num);
+        } else if (type == "array") {
+          return builder_->CreateAlloca(t_tvm_array_, num);
+        } else {
+          LOG(FATAL) << "Unknown stack alloca type " << type;
+          return nullptr;
+        }
+      });
   } else {
     return CodeGenLLVM::CreateIntrinsic(op);
   }
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index c1b1fe24f0a8..22319aa926fb 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -1049,8 +1049,10 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) {
     if (info.alignment > 16) {
       info.alignment = 16;
     }
-    llvm::AllocaInst* alloca = builder_->CreateAlloca(
-        LLVMType(op->type), ConstInt32(constant_size));
+    llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+        return builder_->CreateAlloca(
+            LLVMType(op->type), ConstInt32(constant_size));
+      });
     if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
       alloca->setAlignment(info.alignment);
     }
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index d0cee581a0b6..080306310370 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -132,6 +132,26 @@ class CodeGenLLVM :
     /*! \brief The alignment of allocation */
     int alignment{0};
   };
+  /*!
+   * \brief Execute falloca at the beginning of the
+   *  currrent function and obtain its return value.
+   *
+   *  This is a helper function to make sure that
+   *  alloca always happen in the beginning of the function.
+   *
+   * \param falloca The allocation function to be executed.
+   * \tparam F The function to be executed.
+   * \return The result.
+   */
+  template<typename F>
+  inline llvm::AllocaInst* WithFunctionEntry(F falloca) {
+    llvm::BasicBlock* current = builder_->GetInsertBlock();
+    llvm::BasicBlock* entry = &(function_->getEntryBlock());
+    builder_->SetInsertPoint(entry, entry->begin());
+    llvm::AllocaInst* res = falloca();
+    builder_->SetInsertPoint(current);
+    return res;
+  }
   // create intrinstic given call
   virtual llvm::Value* CreateIntrinsic(const Call* op);
   // create extern function call
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index 6bc6ccaff582..2d416d34ea0c 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -49,8 +49,10 @@ class CodeGenNVPTX : public CodeGenLLVM {
       if (info.scope.rank == runtime::StorageRank::kLocal) {
         // const int local_address_space = 5;
         // TODO(tqchen): for higher version of LLVM, local address space can be set.
-        llvm::AllocaInst* alloca = builder_->CreateAlloca(
-            LLVMType(op->type), ConstInt32(constant_size));
+        llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+            return builder_->CreateAlloca(
+                LLVMType(op->type), ConstInt32(constant_size));
+          });
         if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
           alloca->setAlignment(info.alignment);
         }

From b9bfa76814f3e58dc0fde0844d2b2d773c52ad04 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 7 Nov 2018 02:42:21 +0800
Subject: [PATCH 334/529] Allow to use negative index of array in python
 (#2069)

* Allow to use negative index of array in python

* Support negative index in array slice

* Print index and array size in IndexError

* Fix style
---
 python/tvm/container.py                      | 11 +++++++++--
 tests/python/unittest/test_lang_container.py |  3 +++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/python/tvm/container.py b/python/tvm/container.py
index eb1f17b0fc9d..ba30255f650a 100644
--- a/python/tvm/container.py
+++ b/python/tvm/container.py
@@ -17,10 +17,17 @@ def __getitem__(self, i):
             start = i.start if i.start is not None else 0
             stop = i.stop if i.stop is not None else len(self)
             step = i.step if i.step is not None else 1
+            if start < 0:
+                start += len(self)
+            if stop < 0:
+                stop += len(self)
             return [self[idx] for idx in range(start, stop, step)]
 
-        if i >= len(self):
-            raise IndexError("array index out of range")
+        if i < -len(self) or i >= len(self):
+            raise IndexError("Array index out of range. Array size: {}, got index {}"
+                             .format(len(self), i))
+        if i < 0:
+            i += len(self)
         return _api_internal._ArrayGetItem(self, i)
 
     def __len__(self):
diff --git a/tests/python/unittest/test_lang_container.py b/tests/python/unittest/test_lang_container.py
index 615c5ac0a8d5..8683e56088a0 100644
--- a/tests/python/unittest/test_lang_container.py
+++ b/tests/python/unittest/test_lang_container.py
@@ -3,6 +3,9 @@
 def test_array():
     a = tvm.convert([1,2,3])
     assert len(a) == 3
+    assert a[-1].value == 3
+    a_slice = a[-3:-1]
+    assert (a_slice[0].value, a_slice[1].value) == (1, 2)
 
 def test_array_save_load_json():
     a = tvm.convert([1,2,3])

From 7499b461cb522f1e17f50cfc5bd44a7baae80067 Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Wed, 7 Nov 2018 04:59:54 +0800
Subject: [PATCH 335/529] fix asan check heap-use-after-free (#2071)

---
 topi/include/topi/reduction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index d68b9b390419..777c103ec950 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -262,7 +262,7 @@ using FIdentity = std::function<Array<Expr>(std::vector<Type> types)>;
 inline FCommReduce MakeCommReducer(FCombine fcombine,
                                    FIdentity fidentity,
                                    std::string name = "reduce") {
-  return [fcombine, fidentity, &name]
+  return [fcombine, fidentity, name]
   (Array<Expr> exprs, const Array<IterVar>& axis, Expr* condition) {
     Array<Var> lhs, rhs;
     std::vector<Type> dtypes;

From f77cf823cbc855880120dff88d702ea41f866aa6 Mon Sep 17 00:00:00 2001
From: Yitao <coloka.yi@gmail.com>
Date: Thu, 8 Nov 2018 01:06:05 +0800
Subject: [PATCH 336/529] Fix a crash in android_deploy demo. (#2073)

---
 .../src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
index f3cdefe1c2ff..7d391856f599 100644
--- a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
+++ b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
@@ -298,7 +298,7 @@ protected Integer doInBackground(Bitmap... bitmaps) {
 
                     // get the function from the module(get output data)
                     Log.i(TAG, "get output data");
-                    NDArray outputNdArray = NDArray.empty(new long[]{1000}, new TVMType("float32"));
+                    NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32"));
                     Function getOutputFunc = graphRuntimeModule.getFunction("get_output");
                     getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke();
                     float[] output = outputNdArray.asFloatArray();
@@ -630,4 +630,4 @@ public static Matrix getTransformationMatrix(
 
         return matrix;
     }
-}
\ No newline at end of file
+}

From f62740ec81af9916b0213080cc663294c3b239b0 Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Thu, 8 Nov 2018 02:08:13 +0900
Subject: [PATCH 337/529] [Frontend][MXNet] argmax, argmin ops support (#2048)

---
 nnvm/include/nnvm/top/tensor.h                |  3 ++
 nnvm/python/nnvm/frontend/mxnet.py            | 28 +++++++++++++++----
 nnvm/src/top/tensor/reduce.cc                 | 20 ++++++-------
 .../python/frontend/mxnet/test_forward.py     | 13 +++++++++
 topi/python/topi/cuda/reduction.py            |  5 +++-
 5 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h
index 18b937dbb7b0..bed1b05984da 100644
--- a/nnvm/include/nnvm/top/tensor.h
+++ b/nnvm/include/nnvm/top/tensor.h
@@ -205,6 +205,7 @@ struct ReduceParam : public dmlc::Parameter<ReduceParam> {
   TShape axis;
   bool keepdims;
   bool exclude;
+  int dtype;
 
   DMLC_DECLARE_PARAMETER(ReduceParam) {
     DMLC_DECLARE_FIELD(axis).set_default(TShape())
@@ -226,6 +227,8 @@ struct ReduceParam : public dmlc::Parameter<ReduceParam> {
                 "in the result as dimension with size one.");
     DMLC_DECLARE_FIELD(exclude).set_default(false)
       .describe("Whether to perform reduction on axis that are NOT in axis instead.");
+    DMLC_DECLARE_DTYPE_FIELD(dtype).set_default(kInt32)
+      .describe("Target data type.");
   }
 };
 
diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index 1be76c46fe82..03ba879aa5cf 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -259,12 +259,12 @@ def _crop_like(inputs, attrs):
 
 
 def _expand_dims(inputs, attrs):
-    op_name, new_attrs = "expand_dims", {}
+    op_name, new_attrs = 'expand_dims', {}
     new_attrs['axis'] = _required_attr(attrs, 'axis')
     return _get_nnvm_op(op_name)(*inputs, **new_attrs)
 
 def _lrn(inputs, attrs):
-    op_name, new_attrs = "lrn", {}
+    op_name, new_attrs = 'lrn', {}
     new_attrs['alpha'] = attrs.get('alpha', 0.0001)
     new_attrs['beta'] = attrs.get('beta', 0.75)
     new_attrs['bias'] = attrs.get('knorm', 2)
@@ -274,13 +274,27 @@ def _lrn(inputs, attrs):
     return _get_nnvm_op(op_name)(*inputs, **new_attrs)
 
 def _ones(_, attrs):
-    op_name = "ones"
+    op_name = 'ones'
     return _get_nnvm_op(op_name)(**attrs)
 
 def _zeros(_, attrs):
-    op_name = "zeros"
+    op_name = 'zeros'
     return _get_nnvm_op(op_name)(**attrs)
 
+def _argmax(inputs, attrs):
+    op_name, new_attrs = 'argmax', {}
+    new_attrs['dtype'] = 'float32'
+    new_attrs['axis'] = attrs.get('axis', 0)
+    new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False")
+    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
+
+def _argmin(inputs, attrs):
+    op_name, new_attrs = 'argmin', {}
+    new_attrs['dtype'] = 'float32'
+    new_attrs['axis'] = attrs.get('axis', 0)
+    new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False")
+    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
+
 _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__',
                   '__div_symbol__', '__mul_scalar__', '__mul_symbol__',
                   '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__',
@@ -303,8 +317,10 @@ def _zeros(_, attrs):
     '_rminus_scalar': _rename('__rsub_scalar__'),
     '_contrib_MultiBoxPrior' : _rename('multibox_prior'),
     '_contrib_MultiBoxDetection' : _contrib_multibox_detection,
-    '_ones' : _ones,
-    '_zeros' : _zeros,
+    '_ones'         : _ones,
+    '_zeros'        : _zeros,
+    'argmax'        : _argmax,
+    'argmin'        : _argmin,
     'Activation'    : _activations,
     'BatchNorm'     : _batch_norm,
     'BatchNorm_v1'  : _batch_norm,
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 7241c4b4b85a..7b768ac64304 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -272,15 +272,13 @@ NNVM_REGISTER_BASE_REDUCE_OP(collapse_sum)
     return Array<Tensor>{ topi::collapse_sum(inputs[0], inputs[1]->shape) };
 });
 
-template<int Type>
 inline bool InferFixedType(const NodeAttrs& attrs,
                           std::vector<int>* in_attrs,
                           std::vector<int>* out_attrs) {
-  // Static type inference for argmax operation. Argmax return indices which
-  // should have Int32 type as shapes do.
   CHECK_EQ(in_attrs->size(), 1U);
   CHECK_EQ(out_attrs->size(), 1U);
-  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, static_cast<int>(Type));
+  const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, param.dtype);
   return true;
 }
 
@@ -291,7 +289,7 @@ values over a given axis.
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "The input")
 .set_attr<FInferShape>("FInferShape", ReduceShape)
-.set_attr<FInferType>("FInferType", InferFixedType<kInt32>)
+.set_attr<FInferType>("FInferType", InferFixedType)
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_attr<FTVMCompute>(
@@ -302,8 +300,9 @@ values over a given axis.
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     auto axis = ShapeToArray(r_axes);
-    return Array<Tensor>{
-      topi::argmax(inputs[0], axis, param.keepdims) };
+    Tensor out = topi::argmax(inputs[0], axis, param.keepdims);
+    if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
+    return Array<Tensor>{out};
 });
 
 NNVM_REGISTER_BASE_REDUCE_OP(argmin)
@@ -313,7 +312,7 @@ values over a given axis.
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "The input")
 .set_attr<FInferShape>("FInferShape", ReduceShape)
-.set_attr<FInferType>("FInferType", InferFixedType<kInt32>)
+.set_attr<FInferType>("FInferType", InferFixedType)
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_attr<FTVMCompute>(
@@ -324,8 +323,9 @@ values over a given axis.
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     auto axis = ShapeToArray(r_axes);
-    return Array<Tensor>{
-      topi::argmin(inputs[0], axis, param.keepdims) };
+    Tensor out = topi::argmin(inputs[0], axis, param.keepdims);
+    if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
+    return Array<Tensor>{out};
 });
 
 NNVM_REGISTER_REDUCE_OP(mean)
diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py
index dbd93e710491..66ae9d6e9de4 100644
--- a/nnvm/tests/python/frontend/mxnet/test_forward.py
+++ b/nnvm/tests/python/frontend/mxnet/test_forward.py
@@ -174,6 +174,16 @@ def test_forward_zeros_like():
     data = mx.sym.var('data')
     mx_sym = mx.sym.zeros_like(data, dtype='float32')
     verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
     
 if __name__ == '__main__':
     test_forward_mlp()
@@ -194,3 +204,6 @@ def test_forward_zeros_like():
     test_forward_zeros()
     test_forward_ones_like()
     test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
+    
diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py
index 52bacd3d1ae3..79fa02156b19 100644
--- a/topi/python/topi/cuda/reduction.py
+++ b/topi/python/topi/cuda/reduction.py
@@ -107,7 +107,10 @@ def traverse_before_reduce(operator):
     def traverse_after_reduce(operator):
         """Internal travserse function"""
         if tag.is_broadcast(operator.tag):
-            raise RuntimeError("Not yet support ewise after reduce")
+            if operator not in scheduled_ops:
+                _schedule_injective(operator, sch)
+            for tensor in operator.input_tensors:
+                traverse_after_reduce(tensor.op)
         elif operator.tag == 'comm_reduce':
             _schedule_reduce(operator, sch, is_idx_reduce=False)
             for tensor in operator.input_tensors:

From cf64a12c15c43ee1950fa80dcae3a3f9a25a16fd Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Sat, 10 Nov 2018 03:43:47 +0800
Subject: [PATCH 338/529] Fix conv2d int8 schedule on CUDA (#2074)

---
 topi/python/topi/cuda/conv2d_int8.py | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index 200ed1a3887a..ef2cb3706bf2 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -138,10 +138,6 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_
 
 def schedule_conv2d_NCHWc_int8(cfg, s, output):
     """Schedule conv2d int8 NCHWc template"""
-    workload = output.op.attrs["workload"]
-
-    stride = workload[3]
-
     conv = output.op.input_tensors[0]
     packed_data, packed_kernel = conv.op.input_tensors
 
@@ -166,11 +162,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output):
     if pad_data != packed_data:
         s[pad_data].compute_inline()
 
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
     # create cache stage
     AA = s.cache_read(pad_data, 'shared', [conv])
     WW = s.cache_read(packed_kernel, 'shared', [conv])
@@ -250,18 +241,11 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output):
 
     # cooperative fetching
     for load in [AA, WW]:
-        if load == AA:
-            n, f, y, x, c = s[load].op.axis
-            if pad_data == packed_data and stride_h == 1 and stride_w == 1:
-                s[load].vectorize(c)
-                fused = s[load].fuse(n, f, y, x)
-            else:
-                c, _ = s[load].split(c, factor=4)
-                fused = s[load].fuse(n, f, y, x, c)
-        else:
-            n, f, y, x, oc_chunk, c = s[load].op.axis
-            fused = s[load].fuse(n, f, y, x, oc_chunk)
-            s[load].vectorize(c)
+        c = s[load].op.axis[-1]
+        c_outer, c = s[load].split(c, factor=4)
+        s[load].vectorize(c)
+        fused = s[load].op.axis[:-1] + [c_outer]
+        fused = s[load].fuse(*fused)
 
         fused, tx = s[load].split(fused, factor=n_tx)
         fused, ty = s[load].split(fused, factor=n_ty)

From 733dac4f94701352acc067441c590678372440fd Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Sat, 10 Nov 2018 03:46:52 +0800
Subject: [PATCH 339/529] [OPENCL] Make use of cpu device when gpu device
 doesn't exist. (#2076)

---
 src/runtime/opencl/opencl_device_api.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index d5177fd9525a..6bb0948bca91 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -245,6 +245,10 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
       continue;
     }
     std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
+    if ((devices_matched.size() == 0) && (device_type == "gpu")) {
+      LOG(WARNING) << "Using CPU OpenCL device";
+      devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
+    }
     if (devices_matched.size() > 0) {
       this->type_key = type_key;
       this->platform_id = platform_id;

From aa96b877a825bd7efba7037030e5c0d21d5ee968 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 9 Nov 2018 14:12:20 -0800
Subject: [PATCH 340/529] [TEAM] vinx13 -> Reviewer (#2083)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f1f37a0f3c39..91ecb2851985 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -24,6 +24,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Zhi Chen](https://github.com/zhiics)
 - [Xiaoqiang Dan](https://github.com/xqdan)
 - [Liangfu Chen](https://github.com/liangfu)
+- [Wuwei Lin](https://github.com/vinx13)
 - [Masahiro Masuda](https://github.com/masahi)
 - [Kazutaka Morita](https://github.com/kazum)
 - [Tatsuya Nishiyama](https://github.com/nishi-t)

From c1af6fc03cc51b84e8deefdb6dc29b1a7bb3c41c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 9 Nov 2018 14:25:53 -0800
Subject: [PATCH 341/529] [RELAY] CompileEngine update, nn conv2d, fix dense,
 pool. (#2082)

---
 include/tvm/relay/op_attr_types.h             |   3 +-
 python/tvm/relay/op/_tensor.py                |   9 +-
 python/tvm/relay/op/nn/__init__.py            |   1 +
 python/tvm/relay/op/nn/_nn.py                 | 176 +++++++++++++++-
 python/tvm/relay/op/op.py                     |   9 +
 src/relay/backend/compile_engine.cc           |   7 +-
 src/relay/op/nn/nn.cc                         |   5 +-
 src/relay/op/nn/pooling.cc                    |  81 +++++++-
 .../python/relay/test_backend_interpreter.py  |  22 --
 tests/python/relay/test_op_level1.py          |  62 +++++-
 tests/python/relay/test_op_level2.py          | 194 +++++++++++++++---
 topi/python/topi/util.py                      |   9 +-
 12 files changed, 489 insertions(+), 89 deletions(-)

diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index 941b32e9d33a..2c9fa2808f85 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -72,7 +72,8 @@ using FTVMCompute = runtime::TypedPackedFunc<
  * \return schedule The computation schedule.
  */
 using FTVMSchedule = runtime::TypedPackedFunc<
-  Schedule(const Array<Tensor>& outs,
+  Schedule(const Attrs& attrs,
+           const Array<Tensor>& outs,
            const Target& target)>;
 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 28d53ec8674a..7aef4d4377af 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -2,13 +2,8 @@
 """Backend compiler related feature registration"""
 from __future__ import absolute_import
 import topi
-import topi.cuda
-from .op import register_compute, register_schedule, register_pattern, OpPattern
-
-def schedule_injective(outputs, target):
-    """Generic schedule for binary broadcast."""
-    with target:
-        return topi.generic.schedule_injective(outputs)
+from .op import register_compute, register_schedule, register_pattern
+from .op import schedule_injective, OpPattern
 
 schedule_broadcast = schedule_injective
 schedule_elemwise = schedule_injective
diff --git a/python/tvm/relay/op/nn/__init__.py b/python/tvm/relay/op/nn/__init__.py
index d1818e71882c..0c2a0a4358c9 100644
--- a/python/tvm/relay/op/nn/__init__.py
+++ b/python/tvm/relay/op/nn/__init__.py
@@ -2,3 +2,4 @@
 """Neural network related operators."""
 from __future__ import absolute_import as _abs
 from .nn import *
+from . import _nn
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 4f5dcd4dd08b..7bc26cdec9f9 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -1,16 +1,174 @@
 #pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
-import tvm
 import topi
-from .. import register
+from topi.util import get_const_int, get_const_tuple
+from .. import op as reg
+from ..op import OpPattern, schedule_injective
 
-def dense_compiler(attrs, inputs, output_type):
-    assert len(inputs) == 2
+# dense
+@reg.register_compute("nn.dense")
+def compute_dense(attrs, inputs, out_type, target):
+    """Compute definition of dense"""
     return [topi.nn.dense(inputs[0], inputs[1])]
 
-def dense_schedule(outputs, target):
-    assert len(outputs) == 1
-    return tvm.create_schedule(outputs[0].op)
+@reg.register_schedule("nn.dense")
+def schedule_dense(attrs, outputs, target):
+    """Schedule definition of dense"""
+    with target:
+        return topi.generic.schedule_dense(outputs)
 
-register("nn.dense", "FTVMCompute", dense_compiler)
-register("nn.dense", "FTVMSchedule", dense_schedule)
+reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# conv2d
+@reg.register_compute("nn.conv2d")
+def compute_conv2d(attrs, inputs, out_type, target):
+    """Compute definition of conv2d"""
+    padding = get_const_tuple(attrs.padding)
+    strides = get_const_tuple(attrs.strides)
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    layout = attrs.data_layout
+    weight_layout = attrs.weight_layout
+    out_dtype = attrs.out_dtype
+    out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "")
+                 else out_dtype)
+
+    assert layout in ["NCHW", "NHWC", "NCHW4c"]
+    (dilation_h, dilation_w) = dilation
+    if dilation_h < 1 or dilation_w < 1:
+        raise ValueError("dilation should be positive value")
+
+    if groups == 1:
+        out = topi.nn.conv2d(
+            inputs[0], inputs[1], strides, padding,
+            dilation, layout, out_dtype=out_dtype)
+    elif layout == "NCHW" and \
+         weight_layout == "OIHW" and \
+         get_const_int(inputs[1].shape[0]) == groups and \
+         get_const_int(inputs[1].shape[1]) == 1:
+        out = topi.nn.depthwise_conv2d_nchw(
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    elif layout == "NHWC" and \
+         kernel_layout == "HWOI" and\
+         get_const_int(inputs[1].shape[2]) == groups and \
+         get_const_int(inputs[1].shape[3]) == 1:
+        out = topi.nn.depthwise_conv2d_nhwc(
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    else:
+        raise ValueError("not support arbitrary group number for now")
+    return [out]
+
+
+@reg.register_schedule("nn.conv2d")
+def schedule_conv2d(attrs, outs, target):
+    """Schedule definition of conv2d"""
+    groups = attrs.groups
+    layout = attrs.data_layout
+    kernel_layout = attrs.weight_layout
+    with target:
+        if groups == 1 and layout == "NCHW":
+            return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NCHW4c":
+            return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NHWC":
+            return topi.generic.schedule_conv2d_nhwc(outs)
+        elif groups != 1:
+            if layout == "NCHW":
+                # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d.
+                return topi.generic.schedule_depthwise_conv2d_nchw(outs)
+            elif layout == "NHWC" and kernel_layout == "HWOI":
+                return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
+    raise ValueError("No compatible schedule")
+
+reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# conv2d_transpose
+@reg.register_compute("nn.conv2d_transpose")
+def compute_conv2d_transpose(attrs, inputs, out_dtype, target):
+    """Compute definition of conv2d_transpose"""
+    padding = get_const_tuple(attrs.padding)
+    strides = get_const_tuple(attrs.strides)
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    layout = attrs.data_layout
+    out_dtype = attrs.out_dtype
+    out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "")
+                 else out_dtype)
+    assert layout == "NCHW", "only support nchw for now"
+    assert dilation == (1, 1), "not support dilate now"
+    assert groups == 1, "only support groups == 1 for now"
+    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
+    output_padding = get_const_tuple(attrs.output_padding)
+    out = topi.nn.pad(out,
+                      [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]])
+    return [out]
+
+@reg.register_schedule("nn.conv2d_transpose")
+def schedule_conv2d_transpose(attrs, outs, target):
+    """Schedule definition of conv2d_transpose"""
+    with target:
+        return topi.generic.schedule_conv2d_transpose_nchw(outs)
+
+reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# bias_add
+@reg.register_compute("nn.bias_add")
+def compute_bias_add(attrs, inputs, out_dtype, target):
+    """Compute definition of conv2d_transpose"""
+    axis = attrs.axis
+    bias = inputs[1]
+    data_ndim = len(inputs[0].shape)
+    if axis < 0:
+        axis = axis + data_ndim
+    num_newaxis = data_ndim - axis - 1
+
+    if num_newaxis:
+        bias = topi.expand_dims(bias, axis=1, num_newaxis=num_newaxis)
+    return [topi.add(inputs[0], bias)]
+
+reg.register_schedule("nn.bias_add", schedule_injective)
+reg.register_pattern("nn.bias_add", OpPattern.BROADCAST)
+
+
+# max_pool2d
+@reg.register_schedule("nn.max_pool2d")
+def schedule_max_pool2d(attrs, outs, target):
+    """Schedule definition of max_pool2d"""
+    layout = attrs.layout
+    with target:
+        return topi.generic.schedule_pool(outs, layout)
+
+reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# avg_pool2d
+@reg.register_schedule("nn.avg_pool2d")
+def schedule_avg_pool2d(attrs, outs, target):
+    """Schedule definition of avg_pool2d"""
+    layout = attrs.layout
+    with target:
+        return topi.generic.schedule_pool(outs, layout)
+
+reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# global_max_pool2d
+@reg.register_schedule("nn.global_max_pool2d")
+def schedule_global_max_pool2d(_, outs, target):
+    """Schedule definition of global_max_pool2d"""
+    with target:
+        return topi.generic.schedule_global_pool(outs)
+
+reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# global_avg_pool2d
+@reg.register_schedule("nn.global_avg_pool2d")
+def schedule_global_avg_pool2d(_, outs, target):
+    """Schedule definition of global_avg_pool2d"""
+    with target:
+        return topi.generic.schedule_global_pool(outs)
+
+reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index 3bdb5989c292..c777a82462c8 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -1,4 +1,7 @@
+#pylint: disable=unused-argument
 """The base node types for the Relay language."""
+import topi
+
 from ..._ffi.function import _init_api
 
 from ..base import register_relay_node
@@ -156,3 +159,9 @@ def _lower(name, schedule, inputs, outputs):
 @register_func("relay.op.compiler._build")
 def _build(lowered_funcs):
     return build(lowered_funcs, target="llvm")
+
+
+def schedule_injective(attrs, outputs, target):
+    """Generic schedule for binary broadcast."""
+    with target:
+        return topi.generic.schedule_injective(outputs)
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index d9385977dc39..38e3f6c2a7b8 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -89,7 +89,7 @@ class ScheduleGetter :
     CachedFunc cfunc(cache_node);
     CHECK(master_op_.defined());
     Schedule schedule = fschedule[master_op_](
-        cache_node->outputs, target_);
+        master_attrs_, cache_node->outputs, target_);
     return std::make_pair(schedule, cfunc);
   }
 
@@ -145,6 +145,7 @@ class ScheduleGetter :
     }
     if (op_pattern >= master_op_patetrn_) {
       master_op_ = op;
+      master_attrs_ = call_node->attrs;
       master_op_patetrn_ = op_pattern;
     }
     if (outputs.size() != 1) {
@@ -193,6 +194,7 @@ class ScheduleGetter :
  private:
   tvm::Target target_;
   Op master_op_;
+  Attrs master_attrs_;
   int master_op_patetrn_{0};
   std::ostringstream readable_name_stream_;
   std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
@@ -285,6 +287,9 @@ class CompileEngineImpl : public CompileEngineNode {
    * \return Updated name which is unique.
    */
   std::string GetUniqeName(std::string name) {
+    for (size_t i = 0; i < name.length(); ++i) {
+      if (name[i] == '.') name[i] = '_';
+    }
     while (true) {
       auto it = name_map_.find(name);
       if (it == name_map_.end()) {
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index d141eec3bdd2..fb4c7304a5eb 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -91,16 +91,15 @@ bool DenseRel(const Array<Type>& types,
   Array<tvm::Expr> oshape = data->shape;
   if (param->units.defined()) {
     Array<tvm::Expr> dshape = data->shape;
-
     // validate the weight shape is proper if defined
     // Assign weight type
-    Array<IndexExpr> wshape({dshape[dshape.size() - 1], param->units});
+    Array<IndexExpr> wshape({param->units, dshape[dshape.size() - 1]});
     reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
     oshape.Set((oshape.size() - 1), param->units);
   } else {
     if (weight == nullptr) return false;
     Array<tvm::Expr> wshape = weight->shape;
-    oshape.Set((oshape.size() - 1), wshape[wshape.size() - 1]);
+    oshape.Set((oshape.size() - 1), wshape[0]);
   }
 
   // assign output type
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 8c989ac91237..0e54564e0032 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -4,7 +4,9 @@
  * \brief Pooling operators
  */
 #include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
 #include <tvm/relay/attrs/nn.h>
+#include <topi/nn/pooling.h>
 #include <vector>
 #include "layout.h"
 
@@ -14,7 +16,7 @@ namespace relay {
 TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs);
 TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs);
 
-template <typename AttrTtype>
+template <typename AttrType>
 bool Pool2DRel(const Array<Type>& types,
                int num_inputs,
                const Attrs& attrs,
@@ -27,7 +29,7 @@ bool Pool2DRel(const Array<Type>& types,
   CHECK_NE(dshape.size(), 0);
   CHECK_GE(dshape.size(), 2U)
       << "Pool2D only support input >= 2-D: input must have height and width";
-  const auto param = attrs.as<AttrTtype>();
+  const auto param = attrs.as<AttrType>();
   CHECK(param != nullptr);
 
   Layout layout(param->layout);
@@ -88,6 +90,46 @@ Expr MakeMaxPool2D(Expr data,
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }
 
+template<typename AttrType, topi::nn::PoolType mode>
+Array<Tensor> Pool2DCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  const auto* param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+  auto pool_size = param->pool_size;
+  auto strides = param->strides;
+  auto padding = param->padding;
+  auto ceil_mode = param->ceil_mode;
+  Layout layout(param->layout);
+  CHECK(layout.convertible(Layout("NCHW")))
+      << "max_pool2d currently only supports layouts that are convertible from NCHW";
+  CHECK_EQ(layout.indexof('h'), -1) << "max_pool2d does not support input split on height";
+  CHECK_EQ(layout.indexof('w'), -1) << "max_pool2d does not support input split on width";
+
+  CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+      << "Pool2D only support 4-D input (e.g., NCHW)"
+      << " or 5-D input (last dimension is a split of channel)";
+
+  if (param->padding.size() == 1) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+  } else if (param->padding.size() == 2) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[1]);
+  }
+  if (mode == topi::nn::kAvgPool) {
+    bool count_include_pad = reinterpret_cast<const AvgPool2DAttrs*>(param)->count_include_pad;
+    return Array<Tensor>{
+      topi::nn::pool(inputs[0], pool_size, strides, padding,
+                     mode, ceil_mode, layout.name(), count_include_pad)};
+  } else {
+    return Array<Tensor>{
+      topi::nn::pool(inputs[0], pool_size, strides, padding,
+                     mode, ceil_mode, layout.name())};
+  }
+}
 
 TVM_REGISTER_API("relay.op.nn._make.max_pool2d")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
@@ -120,7 +162,8 @@ RELAY_REGISTER_OP("nn.max_pool2d")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("MaxPool2D", Pool2DRel<MaxPool2DAttrs>);
+.add_type_rel("MaxPool2D", Pool2DRel<MaxPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<MaxPool2DAttrs, topi::nn::kMaxPool>);
 
 
 // AvgPool2D
@@ -175,7 +218,8 @@ Average pooling operation for one dimensional data.
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("AvgPool2D", Pool2DRel<AvgPool2DAttrs>);
+.add_type_rel("AvgPool2D", Pool2DRel<AvgPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<AvgPool2DAttrs, topi::nn::kAvgPool>);
 
 // Global Pool
 TVM_REGISTER_NODE_TYPE(GlobalPool2DAttrs);
@@ -211,6 +255,29 @@ bool GlobalPool2DRel(const Array<Type>& types,
   return true;
 }
 
+
+template<topi::nn::PoolType mode>
+Array<Tensor> GlobalPool2DCompute(const Attrs& attrs,
+                                  const Array<Tensor>& inputs,
+                                  const Type& out_type,
+                                  const Target& target) {
+  const auto* param = attrs.as<GlobalPool2DAttrs>();
+  CHECK(param != nullptr);
+  Layout layout(param->layout);
+  CHECK(layout.convertible(Layout("NCHW")))
+    << "global_avg_pool2d currently only supports layouts that are convertible from NCHW";
+  CHECK_EQ(layout.indexof('h'), -1)
+    << "global_avg_pool2d does not support input split on height";
+  CHECK_EQ(layout.indexof('w'), -1)
+    << "global_avg_pool2d does not support input split on width";
+
+  CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+    << "Pool2D only support 4-D input (e.g., NCHW)"
+    << " or 5-D input (last dimension is a split of channel)";
+  return Array<Tensor>{
+    topi::nn::global_pool(inputs[0], mode, layout.name()) };
+}
+
 Expr MakeGlobalAvgPool2D(Expr data,
                          std::string layout) {
   auto attrs = make_node<GlobalPool2DAttrs>();
@@ -239,7 +306,8 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("GlobalAvgPool2D", GlobalPool2DRel);
+.add_type_rel("GlobalAvgPool2D", GlobalPool2DRel)
+.set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kAvgPool>);
 
 // GlobalMaxPool
 Expr MakeGlobalMaxPool2D(Expr data,
@@ -269,7 +337,8 @@ RELAY_REGISTER_OP("nn.global_max_pool2d")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("GlobalMaxPool2D", GlobalPool2DRel);
+.add_type_rel("GlobalMaxPool2D", GlobalPool2DRel)
+.set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kMaxPool>);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py
index c9f689f7baee..f53f27192b9e 100644
--- a/tests/python/relay/test_backend_interpreter.py
+++ b/tests/python/relay/test_backend_interpreter.py
@@ -55,28 +55,6 @@ def test_mul_param():
     check_eval(func, [x_data, y_data], x_data * y_data)
 
 
-# failing due to numeric issues
-
-# def test_dense():
-#     x = relay.var('x', shape=(10, 10))
-#     w = relay.var('w', shape=(10, 10))
-#     y = relay.nn.dense(x, w)
-#     func = relay.Function([x, w], y)
-#     x_data = np.random.rand(10, 10).astype('float32')
-#     w_data = np.random.rand(10, 10).astype('float32')
-#     check_eval(func, [x_data, w_data], x_data @ w_data, rtol=0.1)
-
-# def test_linear():
-#     x = relay.var('x', shape=(10, 10))
-#     w = relay.var('w', shape=(10, 10))
-#     b = relay.var('b', shape=(10,))
-#     y = relay.add(relay.nn.dense(x, w), b)
-#     func = relay.Function([x, w, b], y)
-#     x_data = np.random.rand(10, 10).astype('float32')
-#     w_data = np.random.rand(10, 10).astype('float32')
-#     b_data = np.random.rand(10).astype('float32')
-#     check_eval(func, [x_data, w_data, b_data], x_data @ w_data + b_data)
-
 def test_equal():
     i = relay.var('i', shape=[], dtype='int32')
     j = relay.var('i', shape=[], dtype='int32')
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 477207dcef5e..88a7aba59389 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -74,6 +74,7 @@ def check_binary_op(opfunc, ref):
             y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
             ref_res = ref(x_data, y_data)
             func = relay.Function([x, y], z)
+
             for target, ctx in ctx_list():
                 # use graph by execuor default for testing, as we need
                 # create function explicitly to avoid constant-folding.
@@ -89,12 +90,24 @@ def check_binary_op(opfunc, ref):
 
 
 def test_bias_add():
-    x = relay.var("x", shape=(10, 2, 3, 4))
+    xshape=(10, 2, 3, 4)
+    bshape=(2,)
+    dtype="float32"
+    x = relay.var("x", shape=xshape)
     bias = relay.var("bias")
     z = relay.nn.bias_add(x, bias)
     zz = relay.ir_pass.infer_type(z)
     assert "axis=" not in zz.astext()
-    assert zz.args[1].checked_type == relay.TensorType((2,))
+    assert zz.args[1].checked_type == relay.TensorType(bshape)
+
+    func = relay.Function([x, bias], z)
+    x_data = np.random.uniform(size=xshape).astype(dtype)
+    y_data = np.random.uniform(size=bshape).astype(dtype)
+    ref_res = x_data + y_data.reshape((2, 1, 1))
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data, y_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def test_expand_dims_infer_type():
@@ -217,6 +230,50 @@ def test_batch_norm():
     ]))
 
 
+def test_dense():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.TensorType((2, w), "float32"))
+    y = relay.nn.dense(x, w, units=2)
+    "units=2" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
+
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    wh, ww = tvm.var("wh"), tvm.var("ww")
+    w = relay.var("w", relay.TensorType((ww, wh), "float32"))
+    y = relay.nn.dense(x, w)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32")
+
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.IncompleteType())
+    y = relay.nn.dense(x, w, units=2)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
+
+    x = relay.var("x", shape=(10, 5))
+    w = relay.var("w", shape=(2, 5))
+    z = relay.nn.dense(x, w)
+
+    # Check result.
+    func = relay.Function([x, w], z)
+    x_data = np.random.rand(10, 5).astype('float32')
+    w_data = np.random.rand(2, 5).astype('float32')
+    ref_res = np.dot(x_data, w_data.T)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, w_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data, w_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+
+
 if __name__ == "__main__":
     test_bias_add()
     test_unary_op()
@@ -227,3 +284,4 @@ def test_batch_norm():
     test_log_softmax()
     test_dropout()
     test_batch_norm()
+    test_dense()
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 9dd2491289f2..7b3a6d3fe15e 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -2,7 +2,9 @@
 """
 import tvm
 from tvm import relay
-
+from tvm.relay.testing import ctx_list
+import numpy as np
+import topi.testing
 
 def test_conv2d_infer_type():
     # symbolic in batch dimension
@@ -62,6 +64,62 @@ def test_conv2d_infer_type():
         (n, h, w, 16), "int32")
 
 
+def test_conv2d_run():
+    def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape,
+                        padding=(1, 1),
+                        fref=None,
+                        groups=1,
+                        dilation=(1, 1),
+                        **attrs):
+        x = relay.var("x", shape=dshape)
+        w = relay.var("w")
+        y = relay.nn.conv2d(x, w,
+                            padding=padding,
+                            dilation=dilation,
+                            groups=groups,
+                            **attrs)
+        func = relay.Function([x, w], y)
+        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
+        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
+        dkernel = topi.testing.dilate_python(kernel, (1, 1) + dilation)
+        if fref is None:
+            ref_res = topi.testing.conv2d_nchw_python(
+                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding)
+        else:
+            ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
+
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data, kernel)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+    # depthwise conv2d
+    dshape = (1, 32, 18, 18)
+    kshape = (32, 1, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=32, groups=32, kernel_size=(3 ,3),
+                    fref=lambda x, w: topi.testing.depthwise_conv2d_python_nchw(
+                        x, w, (1, 1), "SAME"))
+
+    # normal conv2d
+    dshape = (1, 3, 224, 224)
+    kshape = (10, 3, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3))
+    # mixed precision
+    run_test_conv2d("int8", "int32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3))
+    kshape = (10, 3, 1, 3)
+    # mixed precision.
+    run_test_conv2d("int8", "int32", 1, dshape, kshape,
+                    padding=(0, 1), channels=10, kernel_size=(1 ,3))
+    # dilated conv2d
+    dshape = (1, 3, 18, 18)
+    kshape = (10, 3, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3), dilation=(3, 3))
+
+
 def test_conv2d_transpose_infer_type():
     # symbolic in batch dimension
     n, c, h, w = tvm.var("n"), 10, 10, 12
@@ -90,6 +148,33 @@ def test_conv2d_transpose_infer_type():
     assert yy.checked_type == relay.TensorType(
         (n, 15, 15, 11), "float32")
 
+
+def test_conv2d_transpose_run():
+    dshape = (1, 3, 18, 18)
+    kshape = (3, 10, 3, 3)
+    oshape = (1, 10, 37, 37)
+    x = relay.var("x", shape=dshape)
+    w = relay.var("w")
+    y = relay.nn.conv2d_transpose(x, w,
+                                  channels=10, kernel_size=(3,3), strides=(2,2),
+                                  padding=(1,1), output_padding=(2, 2))
+    func = relay.Function([x, w], y)
+    dtype = "float32"
+    data = np.random.uniform(size=dshape).astype(dtype)
+    kernel = np.random.uniform(size=kshape).astype(dtype)
+    c_np = topi.testing.conv2d_transpose_nchw_python(
+        data, kernel, 2, 1)
+    d_np = np.zeros(shape=oshape)
+    d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np
+    ref_res = d_np
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data, kernel)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+
 def test_upsampling_infer_type():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
@@ -103,15 +188,29 @@ def test_upsampling_infer_type():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32")
 
-def _test_pool2d_infer_type(opfunc):
+
+def _test_pool2d(opfunc, reffunc):
     n, c, h, w = tvm.var("n"), 10, 224, 224
     x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
     y = opfunc(x, pool_size=(1, 1))
     assert "pool_size=" in y.astext()
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, 10, 224, 224), "float32")
+    # test execution
+    dtype = "float32"
+    dshape = (1, 3, 28, 28)
+    x = relay.var("x", shape=dshape)
+    y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    ref_res = reffunc(data.reshape(1,3,14,2,14,2), axis=(3,5))
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
 
-def _test_global_pool2d_infer_type(opfunc):
+
+def _test_global_pool2d(opfunc, reffunc):
     n, c, h, w = tvm.var("n"), tvm.var("c"), 224, 224
     x = relay.var("x", relay.TensorType((n, h, w, c), "float32"))
     y = opfunc(x, layout="NHWC")
@@ -123,12 +222,61 @@ def _test_global_pool2d_infer_type(opfunc):
     y = opfunc(x)
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c, 1, 1), "float32")
+    # test execution
+    dtype = "float32"
+    dshape = (1, 1024, 7, 7)
+    x = relay.var("x", shape=dshape)
+    y = opfunc(x)
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    ref_res = reffunc(data, axis=(2,3), keepdims=True)
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+def test_pool2d():
+    _test_pool2d(relay.nn.max_pool2d, np.max)
+    _test_pool2d(relay.nn.avg_pool2d, np.mean)
+    _test_global_pool2d(relay.nn.global_max_pool2d, np.max)
+    _test_global_pool2d(relay.nn.global_avg_pool2d, np.mean)
+
+
+def test_avg_pool2d_no_count_pad():
+    kh, kw = (4, 4)
+    sh, sw = (2, 2)
+    ph, pw = (2, 2)
+    n = 1
+    (ic, ih, iw) = (3, 28, 28)
+    (oc, oh, ow) = (3, 15, 15)
+    dshape = (n, ic, ih, iw)
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.avg_pool2d(x,
+                            pool_size=(kh, kw),
+                            strides=(sw, sw),
+                            padding=(ph, pw),
+                            count_include_pad=False)
+    func = relay.Function([x], y)
+    dtype = "float32"
+    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
+    pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype)
+    no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw)))
+    pad_np[np.ix_(*no_zero)] = a_np
+    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)
+    for i in range(oh):
+        for j in range(ow):
+            pad_count = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] > 0, axis=(2,3))
+            b_np[:,:,i,j] = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw],
+                                   axis=(2,3)) / np.maximum(pad_count, 1)
+    ref_res = np.maximum(b_np, 0.0)
+    data = a_np
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
 
-def test_pool2d_infer_type():
-    _test_pool2d_infer_type(relay.nn.max_pool2d)
-    _test_pool2d_infer_type(relay.nn.avg_pool2d)
-    _test_global_pool2d_infer_type(relay.nn.global_avg_pool2d)
-    _test_global_pool2d_infer_type(relay.nn.global_avg_pool2d)
 
 def test_flatten_infer_type():
     d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
@@ -163,30 +311,6 @@ def test_pad_infer_type():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
 
-def test_dense_infer_type():
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    w = relay.var("w", relay.TensorType((w, 2), "float32"))
-    y = relay.nn.dense(x, w, units=2)
-    "units=2" in y.astext()
-    yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
-
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    wh, ww = tvm.var("wh"), tvm.var("ww")
-    w = relay.var("w", relay.TensorType((wh, ww), "float32"))
-    y = relay.nn.dense(x, w)
-    yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32")
-
-    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    w = relay.var("w", relay.IncompleteType())
-    y = relay.nn.dense(x, w, units=2)
-    yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
-
 
 def test_lrn():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
@@ -206,12 +330,14 @@ def test_l2_normalize():
 
 
 if __name__ == "__main__":
+    test_pool2d()
+    test_avg_pool2d_no_count_pad()
     test_lrn()
     test_l2_normalize()
     test_conv2d_infer_type()
-    test_pool2d_infer_type()
     test_upsampling_infer_type()
     test_flatten_infer_type()
     test_pad_infer_type()
     test_conv2d_transpose_infer_type()
-    test_dense_infer_type()
+    test_conv2d_transpose_run()
+    test_conv2d_run()
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index 71e123e83475..de9ff90ae26b 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -1,8 +1,9 @@
 # pylint: disable=invalid-name
 """Common topi utilities"""
 from __future__ import absolute_import as _abs
-import tvm
+from numbers import Integral
 
+import tvm
 from . import tag
 
 def traverse_inline(s, final_op, callback):
@@ -68,13 +69,13 @@ def get_const_int(expr):
     out_value : int
         The output.
     """
-    if isinstance(expr, int):
+    if isinstance(expr, Integral):
         return expr
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         expr = tvm.ir_pass.Simplify(expr)
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         raise ValueError("Expect value to be constant int")
-    return expr.value
+    return int(expr.value)
 
 
 def equal_const_int(expr, value):
@@ -90,7 +91,7 @@ def equal_const_int(expr, value):
     equal : bool
         Whether they equals.
     """
-    if isinstance(expr, int):
+    if isinstance(expr, Integral):
         return expr == value
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         expr = tvm.ir_pass.Simplify(expr)

From 43ee7398d2981ddb7b3192aca1965a96b0bbbf17 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Fri, 9 Nov 2018 18:00:26 -0800
Subject: [PATCH 342/529] [TVM] [NNPACK] Modernize and improve NNPACK bindings
 (#2084)

---
 cmake/modules/contrib/NNPack.cmake  |   4 +
 python/tvm/contrib/nnpack.py        | 120 ++++++++++++-
 src/contrib/nnpack/convolution.cc   | 253 ++++++++++++++++++++++------
 src/contrib/nnpack/nnpack_utils.cc  |  20 ++-
 src/contrib/nnpack/nnpack_utils.h   |   2 +-
 tests/lint/pylintrc                 |   4 +-
 tests/python/contrib/test_nnpack.py |  93 ++++++++--
 7 files changed, 416 insertions(+), 80 deletions(-)

diff --git a/cmake/modules/contrib/NNPack.cmake b/cmake/modules/contrib/NNPack.cmake
index 82de88a21e63..4bf844d0c468 100644
--- a/cmake/modules/contrib/NNPack.cmake
+++ b/cmake/modules/contrib/NNPack.cmake
@@ -9,6 +9,10 @@ if(USE_NNPACK)
 	include_directories(${PTHREAD_POOL_PATH}/include)
     find_library(NNPACK_CONTRIB_LIB nnpack ${NNPACK_PATH}/lib)
   find_library(NNPACK_PTHREAD_CONTRIB_LIB pthreadpool ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CPUINFO_CONTRIB_LIB cpuinfo ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CLOG_CONTRIB_LIB clog ${NNPACK_PATH}/lib)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CONTRIB_LIB})
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_PTHREAD_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CPUINFO_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CLOG_CONTRIB_LIB})
 endif(USE_NNPACK)
diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py
index d6587df26229..36f8a76a87db 100644
--- a/python/tvm/contrib/nnpack.py
+++ b/python/tvm/contrib/nnpack.py
@@ -63,14 +63,32 @@ def fully_connected_output(lhs, rhs, nthreads=1):
             "tvm.contrib.nnpack.fully_connected_output",
             ins[0], ins[1], outs[0], nthreads), name="C")
 
-def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
-    """Create an extern op to do inference convolution of 3D tensor data and
+
+class ConvolutionAlgorithm:
+    AUTO = 0
+    FFT_8x8 = 1
+    FFT_16x16 = 2
+    WT_8x8 = 3
+    IMPLICIT_GEMM = 4
+    DIRECT = 5
+    WT_8x8_FP16 = 6
+
+
+class ConvolutionTransformStrategy:
+    COMPUTE = 1
+    PRECOMPUTE = 2
+
+
+def convolution_inference(
+        data, kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
     4D tensor kernel and 1D tensor bias with nnpack.
 
     Parameters
     ----------
     data : Tensor
-        data 3D tensor input[input_channels][input_height][input_width] of
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
         FP32 elements.
     kernel : Tensor
         kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
@@ -88,23 +106,108 @@ def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
     Returns
     -------
     output : Tensor
-        output 3D tensor output[output_channels][output_height][output_width]
+        output 4D tensor output[batch][output_channels][output_height][output_width]
         of FP32 elements.
     """
 
     assert isinstance(padding, list) and len(padding) == 4
     assert isinstance(stride, list) and len(stride) == 2
-    _, input_height, input_width = data.shape
+    batch, _, input_height, input_width = data.shape
     output_channels, _, kernel_height, kernel_width = kernel.shape
     output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
     output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
 
     return _api.extern(
-        (output_channels, output_height, output_width), [data, kernel, bias],
+        (batch, output_channels, output_height, output_width),
+        [data, kernel, bias] if bias is not None else [data, kernel],
         lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.convolution_inference", ins[0], ins[1], ins[2],
+            "tvm.contrib.nnpack.convolution_inference",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
             outs[0], padding[0], padding[1], padding[2], padding[3],
-            stride[0], stride[1], nthreads), name="C")
+            stride[0], stride[1], nthreads, algorithm), name="C")
+
+def convolution_inference_without_weight_transform(
+        data, transformed_kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
+    4D pre-transformed tensor kernel and 1D tensor bias with nnpack.
+
+    Parameters
+    ----------
+    data : Tensor
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
+        FP32 elements.
+    transformed_kernel : Tensor
+        transformed_kernel 4D tensor kernel[output_channels][input_channels][tile]
+        [tile] of FP32 elements.
+    bias : Tensor
+        bias 1D array bias[output_channels][input_channels][kernel_height]
+        [kernel_width] of FP32 elements.
+    padding : list
+        padding A 4-dim list of [pad_top, pad_bottom, pad_left, pad_right],
+        which indicates the padding around the feature map.
+    stride : list
+        stride A 2-dim list of [stride_height, stride_width], which indicates
+        the stride.
+
+    Returns
+    -------
+    output : Tensor
+        output 4D tensor output[batch][output_channels][output_height][output_width]
+        of FP32 elements.
+    """
+
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8,
+                         ConvolutionAlgorithm.WT_8x8_FP16)
+    assert isinstance(padding, list) and len(padding) == 4
+    assert isinstance(stride, list) and len(stride) == 2
+    batch, _, input_height, input_width = data.shape
+    output_channels, _, _, _ = transformed_kernel.shape
+    kernel_height, kernel_width = (3, 3)
+    output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
+    output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
+
+    return _api.extern(
+        (batch, output_channels, output_height, output_width),
+        [data, transformed_kernel, bias] if bias is not None else [data, transformed_kernel],
+        lambda ins, outs: _intrin.call_packed(
+            "tvm.contrib.nnpack.convolution_inference_without_weight_transform",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
+            outs[0], padding[0], padding[1], padding[2], padding[3],
+            stride[0], stride[1], nthreads, algorithm), name="C")
+
+def convolution_inference_weight_transform(
+        kernel, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 3D tensor data and
+    4D tensor kernel and 1D tensor bias with nnpack.
+
+    Parameters
+    ----------
+    kernel : Tensor
+        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
+        [kernel_width] of FP32 elements.
+
+    Returns
+    -------
+    output : Tensor
+        output 4D tensor output[output_channels][input_channels][tile][tile]
+        of FP32 elements.
+    """
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16)
+    output_channels, input_channels, _, _ = kernel.shape
+
+    transform_tile_size = 8
+    return _api.extern(
+        (output_channels, input_channels, transform_tile_size, transform_tile_size),
+        [kernel],
+        lambda ins, outs: _intrin.call_packed(
+            "tvm.contrib.nnpack.convolution_inference_weight_transform",
+            ins[0], outs[0], nthreads, algorithm), name="transform_kernel")
 
 def convolution_output(data, kernel, bias, padding, nthreads=1):
     """Create an extern op to compute convolution of 4D tensor data and
@@ -144,4 +247,5 @@ def convolution_output(data, kernel, bias, padding, nthreads=1):
             "tvm.contrib.nnpack.convolution_output", ins[0], ins[1], ins[2],
             outs[0], padding[0], padding[1], padding[2], padding[3], nthreads), name="C")
 
+
 _init_api("tvm.contrib.nnpack")
diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc
index f658a1fe96d4..8bcdd64281cc 100644
--- a/src/contrib/nnpack/convolution.cc
+++ b/src/contrib/nnpack/convolution.cc
@@ -13,62 +13,208 @@ namespace contrib {
 using namespace runtime;
 
 TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* input  = args[0];
-    DLTensor* kernel = args[1];
-    DLTensor* bias   = args[2];
-    DLTensor* output = args[3];
-    uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
-    nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
-    uint64_t stride_width = args[8], stride_height = args[9];
-    nnp_size stride_size{stride_width, stride_height};
-    NNPackConfig(args[10]);
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *input = args[0];
+      DLTensor *kernel = args[1];
+      DLTensor *bias = nullptr;
+      if (args[2].type_code() == kArrayHandle) {
+        bias = args[2];
+      }
+      DLTensor *output = args[3];
+      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
+               pad_left = args[7];
+      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
+      uint64_t stride_width = args[8], stride_height = args[9];
+      nnp_size stride_size{stride_width, stride_height};
+      NNPackConfig(args[10]);
 
-    CHECK_EQ(input->ndim, 3);
-    CHECK_EQ(kernel->ndim, 4);
-    CHECK_EQ(bias->ndim, 1);
-    CHECK_EQ(output->ndim, 3);
-
-    CHECK_EQ(input->shape[0], kernel->shape[1]);
-    size_t input_channels = input->shape[0];
-    CHECK_EQ(output->shape[0], kernel->shape[0]);
-    CHECK_EQ(output->shape[0], bias->shape[0]);
-    size_t output_channels = output->shape[0];
-    nnp_size input_size{static_cast<size_t>(input->shape[1]),
-                        static_cast<size_t>(input->shape[2])};
-    nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
-                         static_cast<size_t>(kernel->shape[3])};
+      uint64_t algo_ = args[11];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(input->ndim, 4);
+      CHECK_EQ(kernel->ndim, 4);
+      if (bias) {
+        CHECK_EQ(bias->ndim, 1);
+      }
+      CHECK_EQ(output->ndim, 4);
+      CHECK_EQ(input->shape[1], kernel->shape[1]);
+      CHECK_EQ(input->shape[0], output->shape[0]);
+      size_t input_channels = input->shape[1];
+      CHECK_EQ(output->shape[1], kernel->shape[0]);
+      if (bias) {
+        CHECK_EQ(output->shape[1], bias->shape[0]);
+      }
+      size_t output_channels = output->shape[1];
+      nnp_size input_size{static_cast<size_t>(input->shape[2]),
+                          static_cast<size_t>(input->shape[3])};
+      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
+                           static_cast<size_t>(kernel->shape[3])};
+      CHECK(input->strides == nullptr);
+      CHECK(kernel->strides == nullptr);
+      if (bias) {
+        CHECK(bias->strides == nullptr);
+      }
 
-    CHECK(input->strides == nullptr);
-    CHECK(kernel->strides == nullptr);
-    CHECK(bias->strides == nullptr);
+      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
+      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
+      if (bias) {
+        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
+      }
+      CHECK(TypeMatch(output->dtype, kDLFloat, 32));
 
-    CHECK(TypeMatch(input->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(output->dtype, kDLFloat, 32));
+      // Allocate a zero-bias if we don't pass one in.
+      std::unique_ptr<std::vector<float>> zero_bias;
+      if (!bias) {
+        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
+      }
 
-    nnp_convolution_inference(nnp_convolution_algorithm_auto,
-                              nnp_convolution_transform_strategy_block_based,
-                              input_channels,
-                              output_channels,
-                              input_size,
-                              input_padding,
-                              kernel_size,
-                              stride_size,
-                              static_cast<float*>(input->data),
-                              static_cast<float*>(kernel->data),
-                              static_cast<float*>(bias->data),
-                              static_cast<float*>(output->data),
-                              NULL,
-                              NULL,
-                              nnp_activation_identity,
-                              NULL,
-                              entry->threadpool,
-                              NULL);
-  });
+      for (auto n = 0; n < input->shape[0]; ++n) {
+        nnp_status status = nnp_convolution_inference(
+            algo, nnp_convolution_transform_strategy_compute, input_channels,
+            output_channels, input_size, input_padding, kernel_size,
+            stride_size,
+            static_cast<float *>(input->data) + n * input->shape[1] *
+                                                   input->shape[2] *
+                                                   input->shape[3],
+            static_cast<float *>(kernel->data),
+            bias ? static_cast<float *>(bias->data) : zero_bias->data(),
+            static_cast<float *>(output->data) + n * output->shape[1] *
+                                                    output->shape[2] *
+                                                    output->shape[3],
+            NULL, NULL, nnp_activation_identity, NULL, entry->threadpool, NULL);
+
+        CHECK_EQ(status, nnp_status_success);
+      }
+    });
+
+TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_transform")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *input = args[0];
+      DLTensor *transformed_kernel = args[1];
+      DLTensor *bias = nullptr;
+      if (args[2].type_code() == kArrayHandle) {
+        bias = args[2];
+      }
+      DLTensor *output = args[3];
+      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
+               pad_left = args[7];
+      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
+      uint64_t stride_width = args[8], stride_height = args[9];
+      nnp_size stride_size{stride_width, stride_height};
+      NNPackConfig(args[10]);
+
+      uint64_t algo_ = args[11];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(input->ndim, 4);
+      if (bias) {
+        CHECK_EQ(bias->ndim, 1);
+      }
+      CHECK_EQ(output->ndim, 4);
+      CHECK_EQ(input->shape[0], output->shape[0]);
+      size_t input_channels = input->shape[1];
+      if (bias) {
+        CHECK_EQ(output->shape[1], bias->shape[0]);
+      }
+      size_t output_channels = output->shape[1];
+      nnp_size input_size{static_cast<size_t>(input->shape[2]),
+                          static_cast<size_t>(input->shape[3])};
+      nnp_size kernel_size{3, 3};
+      CHECK(input->strides == nullptr);
+      CHECK(transformed_kernel->strides == nullptr);
+      if (bias) {
+        CHECK(bias->strides == nullptr);
+      }
+
+      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
+      CHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32));
+      if (bias) {
+        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
+      }
+      CHECK(TypeMatch(output->dtype, kDLFloat, 32));
+
+      // Allocate a zero-bias if we don't pass one in.
+      std::unique_ptr<std::vector<float>> zero_bias;
+      if (!bias) {
+        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
+      }
+
+      for (auto n = 0; n < input->shape[0]; ++n) {
+      nnp_status status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
+          input_size, input_padding, kernel_size, stride_size,
+          static_cast<float *>(input->data) + n * input->shape[1] *
+                               input->shape[2] *
+                               input->shape[3],
+          static_cast<float *>(transformed_kernel->data),
+          bias ? static_cast<float *>(bias->data) : zero_bias->data(),
+          static_cast<float *>(output->data) + n * output->shape[1] *
+                               output->shape[2] *
+                               output->shape[3],
+          NULL, NULL,
+          nnp_activation_identity, NULL, entry->threadpool, NULL);
+      CHECK_EQ(status, nnp_status_success);
+      }
+    });
+
+TVM_REGISTER_GLOBAL(
+    "tvm.contrib.nnpack.convolution_inference_weight_transform")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *kernel = args[0];
+      DLTensor *transformed_kernel = args[1];
+      // Dummy sizes
+      nnp_padding input_padding{1, 1, 1, 1};
+      nnp_size stride_size{1, 1};
+
+      nnp_size input_size{100, 100};
+
+      NNPackConfig(args[2]);
+
+      uint64_t algo_ = args[3];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(kernel->ndim, 4);
+      size_t input_channels = kernel->shape[1];
+      size_t output_channels = kernel->shape[0];
+      CHECK_EQ(kernel->shape[2], 3);
+      CHECK_EQ(kernel->shape[3], 3);
+      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
+                           static_cast<size_t>(kernel->shape[3])};
+      CHECK(kernel->strides == nullptr);
+      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
+
+      size_t transformed_kernel_size = 0;
+      nnp_status status;
+      status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_precompute, input_channels,
+          output_channels, input_size, input_padding, kernel_size, stride_size,
+          nullptr, nullptr, nullptr, nullptr, nullptr, &transformed_kernel_size,
+          nnp_activation_identity, nullptr, entry->threadpool, nullptr);
+      CHECK_EQ(status, nnp_status_success);
+
+      CHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel));
+
+      status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_precompute, input_channels,
+          output_channels, input_size, input_padding, kernel_size, stride_size,
+          nullptr, static_cast<float *>(kernel->data), nullptr, nullptr,
+          static_cast<float *>(transformed_kernel->data),
+          &transformed_kernel_size, nnp_activation_identity, nullptr,
+          entry->threadpool, nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    });
 
 
 TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output")
@@ -109,7 +255,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output")
     CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
     CHECK(TypeMatch(output->dtype, kDLFloat, 32));
 
-    nnp_convolution_output(nnp_convolution_algorithm_auto,
+    nnp_status status = nnp_convolution_output(nnp_convolution_algorithm_auto,
                            batch_size,
                            input_channels,
                            output_channels,
@@ -126,6 +272,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output")
                            NULL,
                            entry->threadpool,
                            NULL);
+    CHECK_EQ(status, nnp_status_success);
   });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc
index 3220d7af339f..d8ef1d0b8327 100644
--- a/src/contrib/nnpack/nnpack_utils.cc
+++ b/src/contrib/nnpack/nnpack_utils.cc
@@ -10,20 +10,30 @@ using namespace runtime;
 
 typedef dmlc::ThreadLocalStore<NNPackThreadLocalEntry> NNPackThreadLocalStore;
 
+
 NNPackThreadLocalEntry* NNPackThreadLocalEntry::ThreadLocal() {
   return NNPackThreadLocalStore::Get();
 }
 
 bool NNPackConfig(uint64_t nthreads) {
   NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-  if (entry->threadpool != NULL &&
-      pthreadpool_get_threads_count(entry->threadpool) != nthreads) {
+  if (entry->threadpool && pthreadpool_get_threads_count(entry->threadpool) == nthreads) {
+    CHECK_NE(nthreads, 1);
+    return true;
+  }
+  if (entry->threadpool) {
     pthreadpool_destroy(entry->threadpool);
-    entry->threadpool = NULL;
+    entry->threadpool = nullptr;
   }
-  if (entry->threadpool == NULL) {
-    entry->threadpool = pthreadpool_create(nthreads);
+
+  if (nthreads == 1) {
+    // a null threadpool means the function is invoked on the calling thread,
+    // which is the desired logic for nthreads == 1
+    CHECK(!entry->threadpool);
+    return true;
   }
+
+  entry->threadpool = pthreadpool_create(nthreads);
   return true;
 }
 
diff --git a/src/contrib/nnpack/nnpack_utils.h b/src/contrib/nnpack/nnpack_utils.h
index fe7420786bde..1d44adff16ef 100644
--- a/src/contrib/nnpack/nnpack_utils.h
+++ b/src/contrib/nnpack/nnpack_utils.h
@@ -15,7 +15,7 @@ namespace contrib {
 using namespace runtime;
 
 struct NNPackThreadLocalEntry {
-  pthreadpool_t threadpool{NULL};
+  pthreadpool_t threadpool{nullptr};
   static NNPackThreadLocalEntry* ThreadLocal();
 };
 
diff --git a/tests/lint/pylintrc b/tests/lint/pylintrc
index f5c4452cfa16..18f526702ad8 100644
--- a/tests/lint/pylintrc
+++ b/tests/lint/pylintrc
@@ -290,10 +290,10 @@ variable-rgx=[a-z_][a-z0-9_]{2,30}$
 variable-name-hint=[a-z_][a-z0-9_]{2,30}$
 
 # Regular expression matching correct function names
-function-rgx=[a-z_][a-z0-9_]{2,30}$
+function-rgx=[a-z_][a-z0-9_]{2,48}$
 
 # Naming hint for function names
-function-name-hint=[a-z_][a-z0-9_]{2,30}$
+function-name-hint=[a-z_][a-z0-9_]{2,48}$
 
 # Regular expression matching correct class names
 class-rgx=[A-Z_][a-zA-Z0-9]+$
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index a6c6b8158ff3..0b275fb812bf 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -100,7 +100,7 @@ def np_conv(na, nw, padding, stride=1):
     return nb
 
 def test_convolution_inference():
-    BATCH = 32
+    BATCH = 8
     IH = 48
     IW = 48
     IC = 16
@@ -111,19 +111,17 @@ def test_convolution_inference():
 
     OH = (IH + 2*PAD - K) + 1
     OW = (IW + 2*PAD - K) + 1
-    dshape = (IC, IH, IW)
+    dshape = (BATCH, IC, IH, IW)
     kshape = (OC, IC, K, K)
     bshape = (OC, )
-    oshape = (OC, OH, OW)
+    oshape = (BATCH, OC, OH, OW)
 
     data = tvm.placeholder(dshape, name='data')
     kernel = tvm.placeholder(kshape, name='kernel')
     bias = tvm.placeholder(bshape, name='bias')
-    output = nnpack.convolution_inference(data, kernel, bias,
-        [PAD, PAD, PAD, PAD], [STRIDE, STRIDE])
-    s = tvm.create_schedule(output.op)
-
-    def verify(target="llvm"):
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
@@ -131,6 +129,12 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
+        output = nnpack.convolution_inference(
+            data, kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+        s = tvm.create_schedule(output.op)
+
         f = tvm.build(s, [data, kernel, bias, output], target)
 
         na = np.random.uniform(size=dshape).astype(data.dtype)
@@ -141,10 +145,77 @@ def verify(target="llvm"):
         tc = tvm.nd.array(nc, ctx)
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
-        nd = np_conv(np.reshape(na, (1, IC, IH, IW)), nb, PAD, STRIDE)
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
         tvm.testing.assert_allclose(
-            td.asnumpy(), nd.reshape(IC, IH, IW), rtol=1e-5)
-    verify()
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+    for algorithm in [
+            nnpack.ConvolutionAlgorithm.AUTO,
+            nnpack.ConvolutionAlgorithm.FFT_8x8,
+            nnpack.ConvolutionAlgorithm.FFT_16x16,
+            nnpack.ConvolutionAlgorithm.WT_8x8,
+            nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
+            nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
+    ]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
+
+
+def test_convolution_inference_without_weight_transform():
+    BATCH = 6
+    IH = 48
+    IW = 48
+    IC = 16
+    OC = 16
+    K = 3
+    PAD = 1
+    STRIDE = 1
+
+    OH = (IH + 2*PAD - K) + 1
+    OW = (IW + 2*PAD - K) + 1
+    dshape = (BATCH, IC, IH, IW)
+    kshape = (OC, IC, K, K)
+    bshape = (OC, )
+    oshape = (BATCH, OC, OH, OW)
+
+    data = tvm.placeholder(dshape, name='data')
+    kernel = tvm.placeholder(kshape, name='kernel')
+    bias = tvm.placeholder(bshape, name='bias')
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
+        if not tvm.module.enabled(target):
+            print("skip because %s is not enabled..." % target)
+            return
+        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
+            print("skip because extern function is not available")
+            return
+
+        ctx = tvm.cpu(0)
+        transformed_kernel = nnpack.convolution_inference_weight_transform(
+            kernel, algorithm=algorithm)
+        output = nnpack.convolution_inference_without_weight_transform(
+            data, transformed_kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+
+        s = tvm.create_schedule(output.op)
+
+        f = tvm.build(s, [data, kernel, bias, output], target)
+
+        na = np.random.uniform(size=dshape).astype(data.dtype)
+        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
+        nc = np.random.uniform(size=bshape).astype(bias.dtype) if with_bias else np.zeros(bshape, dtype=bias.dtype)
+        ta = tvm.nd.array(na, ctx)
+        tb = tvm.nd.array(nb, ctx)
+        tc = tvm.nd.array(nc, ctx)
+        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
+        f(ta, tb, tc, td)
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
+        tvm.testing.assert_allclose(
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+    for algorithm in [nnpack.ConvolutionAlgorithm.WT_8x8]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
 
 def test_convolution_output():
     BATCH = 32

From f4a24b20d63aabaa9d6f46742994d7249bd4fed3 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Fri, 9 Nov 2018 21:08:04 -0800
Subject: [PATCH 343/529] Add NNPACK to CI (#2085)

---
 docker/Dockerfile.ci_cpu                |  4 ++++
 docker/install/ubuntu_install_nnpack.sh | 13 +++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 docker/install/ubuntu_install_nnpack.sh

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index b2bebea0b892..e6e2dd7a37b0 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -35,4 +35,8 @@ RUN bash /install/ubuntu_install_redis.sh
 COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
 RUN bash /install/ubuntu_install_golang.sh
 
+# NNPACK deps
+COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh
+RUN bash /install/ubuntu_install_nnpack.sh
+
 ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh
new file mode 100644
index 000000000000..83225d4aa820
--- /dev/null
+++ b/docker/install/ubuntu_install_nnpack.sh
@@ -0,0 +1,13 @@
+apt-get update && apt-get install -y --no-install-recommends --force-yes git cmake
+
+
+git clone https://github.com/Maratyszcza/NNPACK NNPACK
+cd NNPACK
+# TODO: specific tag?
+git checkout 1e005b0c2
+cd -
+
+mkdir -p NNPACK/build
+cd NNPACK/build
+cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && make -j4 && make install
+cd -

From 073d43d030f49ae5e51c2adcd85f866b49c608e9 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Sun, 11 Nov 2018 00:52:22 +0800
Subject: [PATCH 344/529] [TOPI][CUDA] int8 group conv2d  (#2075)

---
 nnvm/python/nnvm/top/nn.py                    |   5 +
 python/tvm/autotvm/task/nnvm_integration.py   |  14 +-
 topi/python/topi/cuda/__init__.py             |   3 +-
 topi/python/topi/cuda/group_conv2d_nchw.py    | 308 ++++++++++++++++++
 topi/python/topi/generic/nn.py                |  19 ++
 topi/python/topi/nn/conv2d.py                 |  77 +++++
 .../python/topi/testing/conv2d_nchw_python.py |  37 ++-
 topi/tests/python/common.py                   |  15 +
 topi/tests/python/test_topi_conv2d_int8.py    |  13 +-
 topi/tests/python/test_topi_group_conv2d.py   | 215 ++++++++++++
 10 files changed, 690 insertions(+), 16 deletions(-)
 create mode 100644 topi/python/topi/cuda/group_conv2d_nchw.py
 create mode 100644 topi/tests/python/test_topi_group_conv2d.py

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 03ffb46a5c5c..34dd2303f1d7 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -108,6 +108,9 @@ def compute_conv2d(attrs, inputs, _):
          groups == channels:
         out = topi.nn.depthwise_conv2d_nchw(
             inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    elif layout == "NCHW":
+        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
+                                        out_dtype=out_dtype)
     elif layout == "NHWC" and \
          kernel_layout == "HWOI" and \
          groups == get_const_int(inputs[0].shape[3]) and \
@@ -143,6 +146,8 @@ def schedule_conv2d(attrs, outs, target):
             return topi.generic.schedule_depthwise_conv2d_nchw(outs)
         elif groups == channels and layout == "NHWC" and kernel_layout == "HWOI":
             return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
+        elif layout == "NCHW":
+            return topi.generic.schedule_group_conv2d_nchw(outs)
         else:
             raise ValueError("No compatible schedule")
 
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 80b62229a34e..6a07194a594d 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -58,7 +58,8 @@ def __init__(self):
         # NOTE: To add more symbols, you only need to change the following lists
         # nnvm symbol -> topi compute
         self.symbol2topi = {
-            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
+            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                              topi.nn.group_conv2d_nchw],
             nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
             nnvm.sym.dense: [topi.nn.dense],
         }
@@ -67,6 +68,7 @@ def __init__(self):
         self.topi_to_task = {
             topi.nn.conv2d: "topi_nn_conv2d",
             topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
+            topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw",
             topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
             topi.nn.dense: "topi_nn_dense",
         }
@@ -76,6 +78,7 @@ def __init__(self):
                              topi.generic.schedule_conv2d_nhwc],
             topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
                                             topi.generic.schedule_depthwise_conv2d_nhwc],
+            topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw],
             topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
             topi.nn.dense: [topi.generic.schedule_dense],
         }
@@ -143,6 +146,15 @@ def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
             s = topi.generic.schedule_depthwise_conv2d_nchw([C])
             return s, [A, W, C]
 
+        @register("topi_nn_group_conv2d_nchw")
+        def _topi_nn_group_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.group_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+            return s, [A, W, C]
+
         @register("topi_nn_conv2d_transpose_nchw")
         def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
             assert not kwargs, "Do not support kwargs in template function call"
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index e1db2c6fdf63..28d2eb258bea 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -2,10 +2,11 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw
+from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, group_conv2d_nchw
 from .conv2d_hwcn import schedule_conv2d_hwcn
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
+from .group_conv2d_nchw import schedule_conv2d_nchw_cuda
 from .reduction import schedule_reduce
 from .softmax import schedule_softmax
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py
new file mode 100644
index 000000000000..739691131284
--- /dev/null
+++ b/topi/python/topi/cuda/group_conv2d_nchw.py
@@ -0,0 +1,308 @@
+# pylint: disable=invalid-name
+"""The template for cuda group_conv2d_nchw"""
+import tvm
+from tvm import autotvm
+
+from .injective import _schedule_injective
+from .tensor_intrin import dp4a
+from ..nn.pad import pad
+from ..nn.util import get_pad_tuple
+from ..util import traverse_inline, get_const_tuple, get_const_int
+from .. import nn, generic
+
+
+@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['direct', 'int8'])
+def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
+                           out_dtype='float32'):
+    """Group convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    groups : int
+        number of groups
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        5-D with shape [batch, out_channel, out_height, out_width, out_channel_block]
+    """
+    ic_block_factor = 4
+    oc_block_factor = 4
+
+    pre_computed = len(kernel.shape) == 6
+    if not pre_computed:
+        batch, channels, height, width = get_const_tuple(data.shape)
+        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(
+            kernel.shape)
+
+        assert channels % groups == 0, "input channels must divide group size"
+        assert out_channels % groups == 0, "output channels must divide group size"
+        assert channels % ic_block_factor == 0, \
+            "Number of input channels per group must divide {}".format(ic_block_factor)
+        assert out_channels % 4 == 0, \
+            "Number of output channels per group must divide {}".format(oc_block_factor)
+
+        packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
+                                   ic_block_factor),
+                                  lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w],
+                                  name="packed_data")
+        packed_kernel = tvm.compute(
+            (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w,
+             oc_block_factor, ic_block_factor),
+            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block:
+            kernel[oc_chunk * oc_block_factor + oc_block,
+                   ic_chunk * ic_block_factor + ic_block, kh, kw],
+            name="packed_kernel")
+    else:
+        packed_data = data
+        packed_kernel = kernel
+
+    batch, ic_chunk, in_height, in_width, _ = get_const_tuple(
+        packed_data.shape)
+    oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
+        packed_kernel.shape)
+
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left, 0]
+    pad_after = [0, 0, pad_down, pad_right, 0]
+    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
+
+    # compute the output shape
+    out_height = (in_height - (kernel_h - 1) * dilation_h -
+                  1 + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - (kernel_w - 1) * dilation_w -
+                 1 + pad_left + pad_right) // stride_w + 1
+
+    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
+
+    icc = tvm.reduce_axis((0, ic_chunk // groups), name='ic_chunk')
+    icb = tvm.reduce_axis((0, ic_block_factor), name='ic_block')
+    kh = tvm.reduce_axis((0, kernel_h), name='kh')
+    kw = tvm.reduce_axis((0, kernel_w), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb:
+                       tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc,
+                                        oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb]
+                               .astype('int32') *
+                               packed_kernel[occ, icc,
+                                             kh, kw, ocb, icb]
+                               .astype('int32'),
+                               axis=[icc, kh, kw, icb]))
+
+    output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype),
+                         tag='group_conv2d_NCHWc_int8')
+    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
+        ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups
+    cfg.add_flop(num_flop)
+
+    return output
+
+
+_dp4a = dp4a('shared', 'shared', 'local')
+
+
+def schedule_group_conv2d_NCHWc_int8(cfg, s, output):
+    """Schedule group conv2d int8 NCHWc template"""
+    workload = output.op.attrs["workload"]
+    groups = get_const_int(workload[6])
+
+    conv = output.op.input_tensors[0]
+    packed_data, packed_kernel = conv.op.input_tensors
+
+    if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag:
+        pad_data = packed_data
+        packed_data = pad_data.op.input_tensors[0]
+    else:
+        pad_data = packed_data
+
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        # skip this part during tuning to make records accurate
+        # this part will be pre-computed during NNVM's pre-compute optimization pass
+        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
+        s[packed_kernel].pragma(
+            s[packed_kernel].op.axis[0], "debug_skip_region")
+    else:
+        if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
+                packed_kernel.name == 'packed_kernel':
+            # data and kernel are not pre-computed, schedule layout transform here
+            _schedule_injective(packed_data.op, s)
+            _schedule_injective(packed_kernel.op, s)
+
+    if pad_data != packed_data:
+        s[pad_data].compute_inline()
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [conv])
+    WW = s.cache_read(packed_kernel, 'shared', [conv])
+
+    s[conv].set_scope('local')
+
+    # handle bias
+    if output.op not in s.outputs:
+        s[output].compute_inline()
+        output = s.outputs[0].output(0)
+
+    oc_chunk = get_const_int(output.shape[1])
+    # tile and bind spatial axes
+    n, f, y, x, c = s[output].op.axis
+    cfg.define_split("tile_n", n, num_outputs=4)
+    cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2)
+    cfg.define_split("tile_f", cfg.axis(oc_chunk // groups), num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+
+    # this is the scope to attach global config inside this kernel
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    g, f = s[output].split(f, nparts=groups)
+    s[output].bind(n, tvm.thread_axis('blockIdx.z'))
+    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
+    bg, vg = cfg["tile_g"].apply(s, output, g)
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy,
+                      vx, tn, tf, ty, tx, ni, fi, yi, xi)
+    s[output].bind(bn, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y"))
+    s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vn, tvm.thread_axis("vthread"))
+    s[output].bind(vg, tvm.thread_axis("vthread"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
+    if cfg["fuse_yx"].val:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(tf, tvm.thread_axis("threadIdx.y"))
+        tyx = s[output].fuse(ty, tx)
+        s[output].bind(tyx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tyx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2]
+        n_ty = cfg["tile_f"].size[2]
+        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
+    else:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
+        n_ty = cfg["tile_y"].size[2]
+        n_tx = cfg["tile_x"].size[2]
+
+    # tile and bind reduction axes
+    n, f, y, x, c = s[conv].op.axis
+    rc, ry, rx, rc_block = s[conv].op.reduce_axis
+    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
+    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
+    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
+    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
+    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
+    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)
+
+    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
+    _, rc_block = s[conv].split(rc_block, factor=4)
+    s[conv].tensorize(rc_block, _dp4a)
+
+    s[AA].compute_at(s[conv], rxo)
+    s[WW].compute_at(s[conv], rxo)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        c = s[load].op.axis[-1]
+        c_outer, c = s[load].split(c, factor=4)
+        s[load].vectorize(c)
+        fused = s[load].op.axis[:-1] + [c_outer]
+        fused = s[load].fuse(*fused)
+
+        fused, tx = s[load].split(fused, factor=n_tx)
+        fused, ty = s[load].split(fused, factor=n_ty)
+        fused, tz = s[load].split(fused, factor=n_tz)
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # double buffer
+    cfg.define_knob('AA_double_buffer', [0, 1])
+    cfg.define_knob('WW_double_buffer', [0, 1])
+    if cfg['AA_double_buffer'].val:
+        s[AA].double_buffer()
+    if cfg['WW_double_buffer'].val:
+        s[WW].double_buffer()
+
+    # unroll
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
+                     cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', False)
+
+    return s
+
+
+@autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw,
+                                ["cuda", "gpu"], ["direct", "int8"])
+def schedule_conv2d_nchw_cuda(cfg, outs):
+    """TOPI schedule callback of group conv2d for cuda gpu
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for group conv2d.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "group_conv2d_NCHWc_int8":
+            schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index a48b85638fb1..0f4b51b81433 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -173,6 +173,25 @@ def schedule_depthwise_conv2d_nhwc(outs):
     """
     return _default_schedule(outs, False)
 
+
+@tvm.target.generic_func
+def schedule_group_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of group_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_bitserial_conv2d_nchw(outs):
     """Schedule for bitserial_conv2d_nchw
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 2b88886524bd..d4b9393c19dd 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -403,3 +403,80 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, di
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
+
+
+@tvm.target.generic_func
+def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
+    """Group convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    Filter : tvm.Tensor
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    groups : int
+        number of groups
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    if out_dtype is None:
+        out_dtype = Input.dtype
+    assert isinstance(stride, int) or len(stride) == 2
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
+    num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape)
+
+    assert in_channel % groups == 0, "input channels must divide group size"
+    assert num_filter % groups == 0, "output channels must divide group size"
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute the output shape
+    out_channel = num_filter
+    out_height = simplify(
+        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify(
+        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1)
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down, pad_right]
+    temp = pad(Input, pad_before, pad_after, name="pad_temp")
+    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
+    ry = tvm.reduce_axis((0, kernel_h), name='ry')
+    rx = tvm.reduce_axis((0, kernel_w), name='rx')
+    return tvm.compute(
+        (batch, out_channel, out_height, out_width),
+        lambda nn, ff, yy, xx: tvm.sum(
+            temp[nn, ff // (num_filter//groups) * (in_channel//groups) + rc,
+                 yy * stride_h + ry * dilation_h,
+                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
+            Filter[ff, rc, ry, rx].astype(out_dtype),
+            axis=[rc, ry, rx]), tag="conv2d_nchw")
diff --git a/topi/python/topi/testing/conv2d_nchw_python.py b/topi/python/topi/testing/conv2d_nchw_python.py
index 4a40d02d215c..7d2aa0d0fedf 100644
--- a/topi/python/topi/testing/conv2d_nchw_python.py
+++ b/topi/python/topi/testing/conv2d_nchw_python.py
@@ -4,8 +4,8 @@
 import scipy.signal
 
 
-def conv2d_nchw_python(a_np, w_np, stride, padding):
-    """Convolution operator in HWCN layout.
+def _conv2d_nchw_python(a_np, w_np, stride, padding):
+    """Convolution operator in NCHW layout.
 
     Parameters
     ----------
@@ -66,3 +66,36 @@ def conv2d_nchw_python(a_np, w_np, stride, padding):
                     apad, np.rot90(np.rot90(w_np[f, c])), mode='valid')
                 b_np[n, f] += out[::stride_h, ::stride_w]
     return b_np
+
+
+def conv2d_nchw_python(a_np, w_np, stride, padding, groups=1):
+    """Convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    a_np : numpy.ndarray
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    w_np : numpy.ndarray
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str or a list/tuple of two ints
+        Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width]
+
+    groups : int
+        Number of groups
+
+    Returns
+    -------
+    b_np : np.ndarray
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    a_slices = np.array_split(a_np, groups, axis=1)
+    w_slices = np.array_split(w_np, groups, axis=0)
+    b_slices = [_conv2d_nchw_python(a_slice, w_slice, stride, padding)
+                for a_slice, w_slice in zip(a_slices, w_slices)]
+    b_np = np.concatenate(b_slices, axis=1)
+    return b_np
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
index 763db5f86be2..f34f3b331fd1 100644
--- a/topi/tests/python/common.py
+++ b/topi/tests/python/common.py
@@ -1,5 +1,9 @@
 """Common utility for topi test"""
 
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+
+
 def get_all_backend():
     """return all supported target
 
@@ -10,3 +14,14 @@ def get_all_backend():
     """
     return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
             'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu']
+
+
+class NCHWcInt8Fallback(autotvm.FallbackContext):
+    def _query_inside(self, target, workload):
+        key = (target, workload)
+        if key in self.memory:
+            return self.memory[key]
+        cfg = FallbackConfigEntity()
+        cfg.template_key = 'int8'
+        self.memory[key] = cfg
+        return cfg
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
index fd5e91eed72d..272a72f82619 100644
--- a/topi/tests/python/test_topi_conv2d_int8.py
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -9,7 +9,7 @@
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
-from common import get_all_backend
+from common import get_all_backend, NCHWcInt8Fallback
 
 oc_block_factor = 4
 
@@ -88,17 +88,6 @@ def check_device(device):
         check_device(device)
 
 
-class NCHWcInt8Fallback(autotvm.FallbackContext):
-    def _query_inside(self, target, workload):
-        key = (target, workload)
-        if key in self.memory:
-            return self.memory[key]
-        cfg = FallbackConfigEntity()
-        cfg.template_key = 'int8'
-        self.memory[key] = cfg
-        return cfg
-
-
 def test_conv2d_nchw():
     with NCHWcInt8Fallback():
         # ResNet18 workloads where channels in / out are multiple of oc_block_factor
diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py
new file mode 100644
index 000000000000..c1ff656fcd93
--- /dev/null
+++ b/topi/tests/python/test_topi_group_conv2d.py
@@ -0,0 +1,215 @@
+"""Example code to do group convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend, NCHWcInt8Fallback
+
+
+def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
+        (batch, in_channel, in_size, num_filter,
+         kernel, stride, padding, dilation, groups))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
+    W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\
+                (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \
+            (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+
+oc_block_factor = 4
+
+
+def verify_group_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
+        (batch, in_channel, in_size, num_filter,
+         kernel, stride, padding, dilation, groups))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8')
+    W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W', dtype='int8')
+    bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias',
+                            dtype='int8')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
+    def get_ref_data():
+        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
+        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype)
+
+        # convert to NCHWc
+        _, _, out_height, out_width = c_np.shape
+        c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \
+                out_height, out_width)).transpose(0, 1, 3, 4, 2)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\
+                (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \
+            (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["cuda"]:
+        check_device(device)
+
+
+def test_group_conv2d_nchw():
+    # ResNeXt-50 workload
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 256, 56, 256, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 256, 28, 256, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 512, 28, 512, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 512, 14, 512, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
+
+    # bias, relu
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True,
+                             add_bias=True)
+
+    # dilation
+    verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
+
+    # batch size
+    verify_group_conv2d_nchw(2, 128, 56, 128, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(9, 128, 56, 128, 3, 1, 1, 1, 32)
+
+
+
+def test_group_conv2d_NCHWc_int8():
+    with NCHWcInt8Fallback():
+        # ResNeXt-50 workload
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 256, 56, 256, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 256, 28, 256, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 512, 28, 512, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 512, 14, 512, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
+
+        # bias, relu
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True,
+                                       add_bias=True)
+        # dilation
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
+
+        # batch size
+        verify_group_conv2d_NCHWc_int8(2, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
+
+
+if __name__ == "__main__":
+    test_group_conv2d_nchw()
+    test_group_conv2d_NCHWc_int8()

From 30aa36674cbaf87d1540a5ceec5197aa97104f57 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfu.chen@icloud.com>
Date: Mon, 12 Nov 2018 11:24:48 +0800
Subject: [PATCH 345/529] [VTA] Improved RPC for VTA (#2043)

* assign default port to 9091 as the documented

* bug fix in printing RuntimeError and add an additional search path

* pretty print rebuild runtime args

* PRC => RPC

* replace vta_config.json file path

`build/vta_config.json` => `vta/config/vta_config.json`

* undo the change in adding lib_search path

* search vta_config.py file in vta/config

* avoid exposing driver function calls, and use predefined `VTAMemGetPhyAddr` instead.

* rename `tests/hardware/pynq` => `metal_test`

* set config path back to `build` dir
---
 vta/python/vta/exec/rpc_server.py                  | 10 +++++-----
 vta/python/vta/libinfo.py                          |  3 ++-
 vta/tests/hardware/common/test_lib.cc              | 14 +++++++-------
 vta/tests/hardware/{pynq => metal_test}/Makefile   |  2 +-
 .../hardware/{pynq => metal_test}/metal_test.cc    |  4 +++-
 5 files changed, 18 insertions(+), 15 deletions(-)
 rename vta/tests/hardware/{pynq => metal_test}/Makefile (94%)
 rename vta/tests/hardware/{pynq => metal_test}/metal_test.cc (94%)

diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py
index 768f6a00d451..c318e2dd5178 100644
--- a/vta/python/vta/exec/rpc_server.py
+++ b/vta/python/vta/exec/rpc_server.py
@@ -87,8 +87,8 @@ def reconfig_runtime(cfg_json):
         ldflags = pkg.ldflags
         lib_name = dll_path
         source = pkg.lib_source
-        logging.info("Rebuild runtime: output=%s, cflags=%s, source=%s, ldflags=%s",
-                     dll_path, str(cflags), str(source), str(ldflags))
+        logging.info("Rebuild runtime:\n output=%s,\n cflags=%s,\n source=%s,\n ldflags=%s",
+                     dll_path, '\n\t'.join(cflags), '\n\t'.join(source), '\n\t'.join(ldflags))
         cc.create_shared(lib_name, source, cflags + ldflags)
         with open(cfg_path, "w") as outputfile:
             outputfile.write(pkg.cfg_json)
@@ -99,10 +99,10 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', type=str, default="0.0.0.0",
                         help='the hostname of the server')
-    parser.add_argument('--port', type=int, default=9090,
-                        help='The port of the PRC')
+    parser.add_argument('--port', type=int, default=9091,
+                        help='The port of the RPC')
     parser.add_argument('--port-end', type=int, default=9199,
-                        help='The end search port of the PRC')
+                        help='The end search port of the RPC')
     parser.add_argument('--key', type=str, default="",
                         help="RPC key used to identify the connection type.")
     parser.add_argument('--tracker', type=str, default="",
diff --git a/vta/python/vta/libinfo.py b/vta/python/vta/libinfo.py
index 6cda7dfdeb7d..f7de9c55b1a0 100644
--- a/vta/python/vta/libinfo.py
+++ b/vta/python/vta/libinfo.py
@@ -21,5 +21,6 @@ def find_libvta(optional=False):
     lib_path = [os.path.join(x, lib_name) for x in lib_search]
     lib_found = [x for x in lib_path if os.path.exists(x)]
     if not lib_found and not optional:
-        raise RuntimeError("Cannot find libvta: candidates are: " % str(lib_path))
+        raise RuntimeError('Cannot find the files.\n' +
+                           'List of candidates:\n' + str('\n'.join(lib_path)))
     return lib_found
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index 95b793ea3ba1..98b5f9a030b9 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -46,12 +46,12 @@ uint64_t vta(
   void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
 
   // Physical address pointers
-  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
-  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
-  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
-  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
-  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+  uint32_t insn_phy = insns ? VTAMemGetPhyAddr(insns) : 0;
+  uint32_t uop_phy = uops ? VTAMemGetPhyAddr(uops) : 0;
+  uint32_t input_phy = inputs ? VTAMemGetPhyAddr(inputs) : 0;
+  uint32_t weight_phy = weights ? VTAMemGetPhyAddr(weights) : 0;
+  uint32_t bias_phy = biases ? VTAMemGetPhyAddr(biases) : 0;
+  uint32_t output_phy = outputs ? VTAMemGetPhyAddr(outputs) : 0;
 
 #if VTA_DEBUG == 1
   printf("INFO - Starting FPGA!\n");
@@ -1453,4 +1453,4 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression
     printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
     return -1;
   }
-}
\ No newline at end of file
+}
diff --git a/vta/tests/hardware/pynq/Makefile b/vta/tests/hardware/metal_test/Makefile
similarity index 94%
rename from vta/tests/hardware/pynq/Makefile
rename to vta/tests/hardware/metal_test/Makefile
index 7a862e22eff9..4174b4e4726d 100644
--- a/vta/tests/hardware/pynq/Makefile
+++ b/vta/tests/hardware/metal_test/Makefile
@@ -11,7 +11,7 @@ OBJECTS = pynq_driver.o test_lib.o metal_test.o
 EXECUTABLE = vta
 
 # Include VTA config
-VTA_CONFIG = python ../../../make/vta_config.py
+VTA_CONFIG = python ../../../config/vta_config.py
 CFLAGS += `${VTA_CONFIG} --cflags`
 LDFLAGS += `${VTA_CONFIG} --ldflags`
 VTA_TARGET := $(shell ${VTA_CONFIG} --target)
diff --git a/vta/tests/hardware/pynq/metal_test.cc b/vta/tests/hardware/metal_test/metal_test.cc
similarity index 94%
rename from vta/tests/hardware/pynq/metal_test.cc
rename to vta/tests/hardware/metal_test/metal_test.cc
index 56be244baa79..48d719ff4b32 100644
--- a/vta/tests/hardware/pynq/metal_test.cc
+++ b/vta/tests/hardware/metal_test/metal_test.cc
@@ -10,7 +10,9 @@
 #include <string.h>
 #include <time.h>
 #include <vta/driver.h>
-#include "../../../src/pynq/pynq_driver.h"
+#ifdef VTA_TARGET_PYNQ
+#  include "../../../src/pynq/pynq_driver.h"
+#endif  // VTA_TARGET_PYNQ
 #include "../common/test_lib.h"
 
 int main(void) {

From 075f5956dd66a835a5c19edcaf8244dca3d3b175 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Mon, 12 Nov 2018 17:17:15 -0800
Subject: [PATCH 346/529] [TOPI] depthwise-conv2d in NCHW[x]c layout for x86
 (#2045)

---
 nnvm/python/nnvm/top/nn.py                    |  11 +-
 nnvm/src/top/nn/convolution.cc                |   4 +-
 topi/python/topi/generic/nn.py                |  17 ++
 topi/python/topi/nn/depthwise_conv2d.py       |  65 +++++-
 topi/python/topi/x86/__init__.py              |   1 +
 topi/python/topi/x86/conv2d.py                | 114 +++++-----
 topi/python/topi/x86/conv2d_avx_1x1.py        |   4 +-
 topi/python/topi/x86/conv2d_avx_common.py     |   4 +-
 topi/python/topi/x86/depthwise_conv2d.py      | 203 ++++++++++++++++++
 topi/python/topi/x86/util.py                  |  12 ++
 topi/tests/python/test_topi_conv2d_NCHWc.py   |  23 +-
 .../python/test_topi_depthwise_conv2d.py      | 116 ++++++++++
 tutorials/autotvm/tune_nnvm_x86.py            |  10 +-
 13 files changed, 517 insertions(+), 67 deletions(-)
 create mode 100644 topi/python/topi/x86/depthwise_conv2d.py
 create mode 100644 topi/python/topi/x86/util.py

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 34dd2303f1d7..2069a0a5ad50 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -4,7 +4,7 @@
 
 import tvm
 import topi
-from topi.util import get_const_int
+from topi.util import get_const_int, get_const_tuple
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
 from .registry import OpPattern
@@ -164,16 +164,22 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
+    out_channel = attrs.get_int("channels")
     groups = attrs.get_int("groups")
     layout = attrs.get_string("layout")
     out_layout = attrs.get_string("out_layout")
     out_dtype = attrs.get_string("out_dtype")
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+    _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+    in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
         out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
                                    layout, out_layout, out_dtype)
+    elif groups == in_channel and groups == out_channel:
+        out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+                                             dilation, layout, out_layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     else:
         raise ValueError("not support arbitrary group number > 1 for now")
@@ -187,9 +193,12 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
 def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
     """Schedule definition of conv2d NCHWc"""
     groups = attrs.get_int("groups")
+    out_channel = attrs.get_int("channels")
     with tvm.target.create(target):
         if groups == 1:
             return topi.generic.schedule_conv2d_NCHWc(outs)
+        elif groups == out_channel:
+            return topi.generic.schedule_depthwise_conv2d_NCHWc(outs)
         else:
             raise ValueError("not support group number > 1 for now")
 
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 22bda048a0a2..813947492117 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -82,7 +82,9 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
 
   wshape[kernel_layout.indexof('O')] *= param.groups;
 
-  NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
+  if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) {
+    NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
+  }
   if (param.use_bias) {
     static const Layout default_bias_layout("C");
     TShape bias_shape({param.channels});
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 0f4b51b81433..8c303e5be182 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -174,6 +174,23 @@ def schedule_depthwise_conv2d_nhwc(outs):
     return _default_schedule(outs, False)
 
 
+@tvm.target.generic_func
+def schedule_depthwise_conv2d_NCHWc(outs):
+    """Schedule for depthwise_conv2d_NCHWc
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_group_conv2d_nchw(outs):
     """Schedule for conv2d_nchw
diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py
index 78107d2bd1ce..b5f46b840c9c 100644
--- a/topi/python/topi/nn/depthwise_conv2d.py
+++ b/topi/python/topi/nn/depthwise_conv2d.py
@@ -1,6 +1,7 @@
-# pylint: disable=invalid-name, unused-variable, too-many-locals
+# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument
 """Depthwise convolution operators"""
 from __future__ import absolute_import as _abs
+from collections import namedtuple
 import tvm
 
 from .dilate import dilate
@@ -8,6 +9,27 @@
 from .util import get_pad_tuple
 from ..util import simplify
 
+# workload description of depthwise-conv2d
+Workload = namedtuple('Workload',
+                      ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter',
+                       'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+
+def _get_workload(data, kernel, stride, padding, out_dtype):
+    """ Get the workload structure. """
+    _, in_channel, height, width = [x.value for x in data.shape]
+    channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape]
+    out_channel = channel * channel_multiplier
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    if isinstance(stride, (tuple, list)):
+        HSTR, WSTR = stride
+    else:
+        HSTR, WSTR = stride, stride
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
+        "Do not support inputs with different data types now. ' \
+        '{} vs. {}".format(data.dtype, kernel.dtype)
+    return Workload(data.dtype, out_dtype, height, width, in_channel,
+                    out_channel, kh, kw, HPAD, WPAD, HSTR, WSTR)
+
 
 @tvm.target.generic_func
 def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
@@ -258,3 +280,44 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid
         tag='depthwise_conv2d_backward_weight_nhwc')
 
     return Weight_grad
+
+
+@tvm.target.generic_func
+def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
+                           layout, out_layout, out_dtype=None):
+    """Depthwise convolution NCHW[x]c forward operator.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    Filter : tvm.Tensor
+        4-D with shape [out_channel_chunk, filter_height, filter_width, out_channel_block]
+        In NCHWc depthwise convolution,
+        we group kernel's in_channel and channel_multiplier together then do the tiling.
+
+    stride : tuple of two ints
+        The spatial stride along height and width
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation: int or a list/tuple of two ints
+         dilation size, or [dilation_height, dilation_width]
+
+    layout : str
+        Input data layout
+
+    out_layout : str
+        Output data layout
+
+    out_dtype: str, optional
+        Output data type
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")
diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py
index c146419fcec9..9e0e94e6cd2d 100644
--- a/topi/python/topi/x86/__init__.py
+++ b/topi/python/topi/x86/__init__.py
@@ -9,3 +9,4 @@
 from .injective import *
 from .pooling import schedule_pool, schedule_global_pool
 from .bitserial_conv2d import schedule_bitserial_conv2d
+from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index afeb99e7051d..7e0b90f1db9b 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -7,36 +7,30 @@
 from .. import generic, tag
 from .. import nn
 from ..util import get_const_tuple
-from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload
+from ..nn.conv2d import conv2d, conv2d_NCHWc, \
+    conv2d_alter_layout, _get_workload as _get_conv2d_workload
 from ..nn.dilate import dilate
+from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
 from ..nn.pad import pad
 
 from . import conv2d_avx_1x1, conv2d_avx_common
 
-def _get_fp32_len():
-    fp32_vec_len = 8
-    target = tvm.target.current_target()
-    if target is not None:
-        for opt in target.options:
-            if opt == '-mcpu=skylake-avx512':
-                fp32_vec_len = 16
-    return fp32_vec_len
-
-
-def _get_default_config(cfg, workload):
+def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
     """
     Get default schedule config for the workload
-    Parameters
-    ----------
-    workload : topi.nn.conv2d.Workload
-        Convolution workload
     """
-    fp32_vec_len = _get_fp32_len()
-    is_kernel_1x1 = workload.hkernel == 1 and workload.wkernel == 1
-    if is_kernel_1x1:
-        conv2d_avx_1x1._fallback_schedule(cfg, workload, fp32_vec_len)
+    if is_depthwise:
+        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        from depthwise_conv2d import _fallback_schedule
+        _fallback_schedule(cfg, wkl)
     else:
-        conv2d_avx_common._fallback_schedule(cfg, workload, fp32_vec_len)
+        wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+        if is_kernel_1x1:
+            conv2d_avx_1x1._fallback_schedule(cfg, wkl)
+        else:
+            conv2d_avx_common._fallback_schedule(cfg, wkl)
 
 
 def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout):
@@ -74,10 +68,9 @@ def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out
     if layout == 'NCHW':
         _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout)
         if cfg.is_fallback:
-            wkl = _get_workload(data, kernel, strides, padding, out_dtype)
-            _get_default_config(cfg, wkl)
-        return _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout,
-                                      out_dtype)
+            _get_default_config(cfg, data, kernel, strides, padding, out_dtype)
+        return _declaration_conv_impl(cfg, data, kernel, strides,
+                                      padding, dilation, layout, out_dtype)
     elif layout == 'HWCN':
         return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':
@@ -295,44 +288,69 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     copy_inputs = [s for s in inputs]
     new_attrs = {k : attrs[k] for k in attrs.keys()}
     data, kernel = tinfo[0], tinfo[1]
-    # only optimize for NCHW, groups=1 conv
-    if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
-        return None
     batch_size, in_channel, height, width = get_const_tuple(data.shape)
-    out_channel, _, kh, kw = get_const_tuple(kernel.shape)
 
+    groups = attrs.get_int("groups")
+    out_channel = attrs.get_int("channels")
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     layout = attrs['layout']
+    kh, kw = attrs.get_int_tuple("kernel_size")
 
     dtype = data.dtype
     out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
+    is_depthwise = groups == in_channel and groups == out_channel
+
+    # only optimize for NCHW
+    if layout != 'NCHW':
+        return None
+    if groups != 1 and not is_depthwise:
+        return None
 
-    workload = autotvm.task.args_to_workload(
-        [data, kernel, strides, padding, layout, out_dtype], conv2d)
     dispatch_ctx = autotvm.task.DispatchContext.current
     target = tvm.target.current_target()
+    # query schedule and fallback if necessary
+    workload = autotvm.task.args_to_workload(
+        [data, kernel, strides, padding, out_dtype], depthwise_conv2d_nchw) \
+        if is_depthwise else \
+        autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, layout, out_dtype], conv2d)
     cfg = dispatch_ctx.query(target, workload)
     if cfg.is_fallback:
-        wkl = _get_workload(data, kernel, strides, padding, out_dtype)
-        _get_default_config(cfg, wkl)
+        _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)
 
     ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
     new_attrs['layout'] = 'NCHW%dc' % ic_bn
     new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
-    # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-    new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
 
-    # Store the same config for the altered operator (workload)
     new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
                                dtype=data.dtype)
-    new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
-                                 dtype=kernel.dtype)
-    new_workload = autotvm.task.args_to_workload(
-        [new_data, new_kernel, strides, padding, new_attrs['layout'],
-         new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
-    dispatch_ctx.update(target, new_workload, cfg)
+    if is_depthwise:
+        # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+        # in which out_channel = merge(channel, channel_multiplier)
+        kernel_sym = copy_inputs[1]
+        kernel_sym = sym.reshape(kernel_sym, shape=(out_channel//oc_bn, oc_bn, kh, kw))
+        kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
+        copy_inputs[1] = kernel_sym
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
+    else:
+        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
+        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
+        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
+                                     dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
 
+    dispatch_ctx.update(target, new_workload, cfg)
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 
@@ -354,13 +372,11 @@ def _declaration_conv_NCHWc(cfg, data, kernel, strides,
         oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
     num_filter = oc_chunk * oc_bn
 
-    # get workload and related schedule config
-    wkl = _get_workload(tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
-                        tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
-                                        dtype=kernel.dtype),
-                        strides, padding, out_dtype)
     if cfg.is_fallback:
-        _get_default_config(cfg, wkl)
+        _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
+                            tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
+                                            dtype=kernel.dtype),
+                            strides, padding, out_dtype)
 
     # output shape
     out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
@@ -386,7 +402,7 @@ def _declaration_conv_NCHWc(cfg, data, kernel, strides,
         n_elems = 4
         assert ic_bn % n_elems == 0
 
-        ic_outer = tvm.reduce_axis((0, wkl.in_filter//ic_bn), name='ic_outer')
+        ic_outer = tvm.reduce_axis((0, in_channel//ic_bn), name='ic_outer')
         ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner')
         ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
         return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index ce70ec83828b..d44e3899293d 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -8,8 +8,10 @@
 from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
+from .util import get_fp32_len
 
-def _fallback_schedule(cfg, wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index e52722ed54a7..1b8ee5fe9be4 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,8 +8,10 @@
 from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
+from .util import get_fp32_len
 
-def _fallback_schedule(cfg, wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py
new file mode 100644
index 000000000000..8f37a0316229
--- /dev/null
+++ b/topi/python/topi/x86/depthwise_conv2d.py
@@ -0,0 +1,203 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Depthwise Conv2D schedule on x86"""
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task import get_config
+from tvm.autotvm.task.space import SplitEntity
+from tvm.autotvm.task.nnvm_integration import deserialize_args
+from .. import generic, tag
+from ..nn.pad import pad
+from ..util import get_const_tuple
+from ..nn.util import get_pad_tuple
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, _get_workload
+
+from .util import get_fp32_len
+
+def _fallback_schedule(cfg, wkl):
+    """
+    Get default schedule for the workload
+    Parameters
+    ----------
+    cfg : tvm.autotvm.task.space.FallbackConfigEntity
+        Fallback config to be updated
+    wkl : topi.nn.depthwise_conv2d.Workload
+        Convolution workload
+    """
+    simd_width = get_fp32_len()
+
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if wkl.out_filter % bn == 0:
+            oc_bn = bn
+            break
+
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if wkl.in_filter % bn == 0:
+            ic_bn = bn
+            break
+
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
+
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+
+
+@autotvm.register_topi_compute(depthwise_conv2d_NCHWc, 'cpu', 'direct')
+def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
+                                layout, out_layout, out_dtype=None):
+    out_dtype = data.dtype if out_dtype is None else out_dtype
+    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
+    out_channel_chunk, filter_height, filter_width, out_channel_block \
+        = get_const_tuple(kernel.shape)
+
+    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    HSTR, WSTR = strides
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))
+
+    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    assert (dh, dw) == (1, 1), "Does not support dilation"
+
+    in_channel = in_channel_chunk * in_channel_block
+    out_channel = out_channel_chunk * out_channel_block
+    channel_multiplier = out_channel // in_channel
+
+    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
+    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1
+
+    # get workload and related schedule config
+    wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype),
+                        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
+                                        dtype=kernel.dtype),
+                        strides, padding, out_dtype)
+    if cfg.is_fallback:
+        _fallback_schedule(cfg, wkl)
+
+    # padding stage
+    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
+    if DOPAD:
+        pad_before = [0, 0, pad_top, pad_left, 0]
+        pad_after = [0, 0, pad_down, pad_right, 0]
+        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
+    else:
+        data_pad = data
+
+    # depthconv stage
+    kh = tvm.reduce_axis((0, filter_height), name='kh')
+    kw = tvm.reduce_axis((0, filter_width), name='kw')
+    Output = tvm.compute(
+        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
+        lambda b, oco, oh, ow, oci: tvm.sum(
+            (data_pad[b, (oco * out_channel_block + oci) // channel_multiplier // in_channel_block,
+                      oh*HSTR+kh, ow*WSTR+kw,
+                      ((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
+             .astype(out_dtype) *
+             kernel[oco, kh, kw, oci].astype(out_dtype)),
+            axis=[kh, kw]),
+        name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
+    return Output
+
+
+@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_NCHWc, 'cpu', ['direct'])
+def schedule_depthwise_conv2d_NCHWc(cfg, outs):
+    """CPU schedule for depthwise conv2d in NCHW[x]c layout"""
+    s = tvm.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+    def traverse(op):
+        """Traverse operators from computation graph"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(op.tag):
+            if op not in s.outputs:
+                s[op].compute_inline()
+            for tensor in op.input_tensors:
+                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
+                    traverse(tensor.op)
+        if 'depthwise_conv2d_NCHWc' in op.tag:
+            conv_out = op.output(0)
+            data = conv_out.op.input_tensors[0]
+            kernel = conv_out.op.input_tensors[1]
+            _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0])
+        scheduled_ops.append(op)
+    traverse(outs[0].op)
+    return s
+
+def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output):
+    tile_ow = cfg["tile_ow"].size[-1]
+    # schedule data
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        p = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(p)
+
+    C, O = conv_out, output
+    CC = s.cache_write(C, 'global')
+
+    _, ic_chunk, oh, ow, ic_block = s[C].op.axis
+    ow_chunk, ow_block = s[C].split(ow, factor=tile_ow)
+    s[C].reorder(ic_chunk, oh, ow_chunk, ow_block, ic_block)
+    parallel_axis = s[C].fuse(ic_chunk, oh)
+    s[C].parallel(parallel_axis)
+    s[CC].compute_at(s[C], ow_chunk)
+
+    _, ic_chunk, oh, ow, ic_block = s[CC].op.axis
+    kh, kw = s[CC].op.reduce_axis
+    ow_chunk, ow_block = s[CC].split(ow, factor=tile_ow)
+    s[CC].reorder(ic_chunk, oh, kh, kw, ow_block, ic_block)
+    s[CC].vectorize(ic_block)
+    s[CC].unroll(ow_block)
+
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        ow_chunk, ow_block = s[O].split(ow, factor=tile_ow)
+        s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+        parallel_axis = s[O].fuse(oc_chunk, oh)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
+    return s
+
+
+@autotvm.task.register("topi_x86_depthwise_conv2d_NCHWc_from_nchw")
+def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
+    assert not kwargs, "Do not support kwargs in template function call"
+    data, kernel, strides, padding, dilation, dtype = deserialize_args(args)
+
+    batch, in_channel, height, width = get_const_tuple(data.shape)
+    filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    out_height = (height - kh + 2 * ph) // sh + 1
+    out_width = (width - kw + 2 * pw) // sw + 1
+    out_channel = filter_channel * channel_multiplier
+
+    # get config here
+    cfg = get_config()
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+
+    # change shape with the value in config
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
+    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
+    new_data = tvm.placeholder(new_data_shape, data.dtype)
+    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
+
+    data_layout = "NCHW%dc" % ic_bn
+    out_layout = "NCHW%dc" % oc_bn
+
+    C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation,
+                                    data_layout, out_layout, dtype)
+    s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
+    return s, [new_data, new_kernel, C]
diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py
new file mode 100644
index 000000000000..678ff8e24cff
--- /dev/null
+++ b/topi/python/topi/x86/util.py
@@ -0,0 +1,12 @@
+"""Common x86 related utilities"""
+from __future__ import absolute_import as _abs
+import tvm
+
+def get_fp32_len():
+    fp32_vec_len = 8
+    target = tvm.target.current_target()
+    if target is not None:
+        for opt in target.options:
+            if opt == '-mcpu=skylake-avx512':
+                fp32_vec_len = 16
+    return fp32_vec_len
diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py
index 38e6ad6d9e7c..a3af43c8d810 100644
--- a/topi/tests/python/test_topi_conv2d_NCHWc.py
+++ b/topi/tests/python/test_topi_conv2d_NCHWc.py
@@ -13,27 +13,22 @@
 def _transform_data(data, bn):
     # NCHW -> NCHW[x]c
     batch_size, channel, height, width = data.shape
-    data = np.transpose(data, (0, 2, 3, 1))
-    data = np.reshape(data, (batch_size, height, width, channel//bn, bn))
-    data = np.transpose(data, (0, 3, 1, 2, 4))
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
     return data
 
 def _transform_kernel(kernel, ic_bn, oc_bn):
     # OIHW -> OIHW[x]i[x]o
     out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.transpose(kernel, (1, 2, 3, 0))
-    kernel = np.reshape(kernel, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
-    kernel = np.transpose(kernel, (1, 2, 3, 4, 0))
-    kernel = np.reshape(kernel, (kh, kw, out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn))
-    kernel = np.transpose(kernel, (2, 4, 0, 1, 5, 3))
+    kernel = np.reshape(kernel, (out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 4, 5, 3, 1))
     return kernel
 
 def _transform_bias(bias, bn):
     # [num_filter, 1, 1] -> [num_filter//bn, 1, 1, bn]
     num_filter, h, w = bias.shape
-    bias = np.transpose(bias, (1, 2, 0))
-    bias = np.reshape(bias, (h, w, num_filter//bn, bn))
-    bias = np.transpose(bias, (2, 0, 1, 3))
+    bias = np.reshape(bias, (num_filter//bn, bn, h, w))
+    bias = np.transpose(bias, (0, 2, 3, 1))
     return bias
 
 def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,
@@ -86,6 +81,7 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
+                                     (dilation, dilation),
                                      layout='NCHW%dc'%ic_block,
                                      out_layout="NCHW%dc"%oc_block,
                                      out_dtype=dtype)
@@ -117,7 +113,7 @@ def check_device(device):
             check_device(device)
 
 
-if __name__ == "__main__":
+def test_conv2d_NCHWc():
     # ResNet18 workloads
     verify_conv2d_NCHWc(1,   3, 224,  64, 7, 2, 3)
     verify_conv2d_NCHWc(1,  64,  56,  64, 3, 1, 1)
@@ -204,3 +200,6 @@ def check_device(device):
     verify_conv2d_NCHWc(1, 2048,  10, 126, 3, 1, 1)
     verify_conv2d_NCHWc(1,  512,   5, 126, 3, 1, 1)
     verify_conv2d_NCHWc(1,  256,   3, 126, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_NCHWc()
\ No newline at end of file
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index a5dd6d328f07..98c93dff9993 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -203,6 +203,115 @@ def get_ref_data():
         with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
             check_device(device)
 
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
+    return data
+
+def _transform_kernel(kernel, bn):
+    # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+    channel, channel_multiplier, kh, kw = kernel.shape
+    out_channel = channel * channel_multiplier
+    kernel = np.reshape(kernel, (out_channel//bn, bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 3, 1))
+    return kernel
+
+def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
+    in_width = in_height
+    filter_channel = in_channel
+    filter_width = filter_height
+    stride_h = stride_w = stride
+
+    assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation."
+    pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+    padding_args = (pad_h, pad_w)
+
+    out_channel = filter_channel * channel_multiplier
+    # for testing functionality,
+    # we choose arbitrary block size that can divide the channel,
+    # regardless of the performance.
+    oc_block = 1
+    for bn in range(16, 0, -1):
+        if out_channel % bn == 0:
+            oc_block = bn
+            break
+
+    ic_block = 1
+    for bn in range(oc_block, 0, -1):
+        if in_channel % bn == 0:
+            ic_block = bn
+            break
+
+    # placeholder
+    Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input')
+    Filter = tvm.placeholder((out_channel//oc_block, filter_height, filter_width, oc_block), name='Filter')
+    in_layout = "NCHW%dc" % ic_block
+    out_layout = "NCHW%dc" % oc_block
+    dtype = 'float32'
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter,
+                                                             (stride_h, stride_w),
+                                                             padding_args,
+                                                             (dilation, dilation),
+                                                             in_layout,
+                                                             out_layout, dtype)
+            # TODO: add scale_shift implement for NCHWc and add test here
+            Relu = topi.nn.relu(DepthwiseConv2d)
+            # schedule
+            s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
+            s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu)
+        # build the kernels
+        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
+        f2 = tvm.build(s2, [Input, Filter, Relu], device)
+
+        # Prepare pod type for test data closure
+        input_shape = (batch, in_channel, in_height, in_width)
+        filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width)
+
+        # Use memoize, pickle the test data for next time use.
+        @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc")
+        def get_ref_data():
+            input_np = np.random.uniform(size=input_shape).astype(dtype)
+            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
+            # correctness with scipy
+            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
+                input_np, filter_np, stride, padding)
+            relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
+            return (_transform_data(input_np, ic_block),
+                    _transform_kernel(filter_np, oc_block),
+                    _transform_data(depthwise_conv2d_scipy, oc_block),
+                    _transform_data(relu_scipy, oc_block))
+
+        # Get the test data
+        (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data()
+
+        input_tvm = tvm.nd.array(input_np, ctx)
+        filter_tvm = tvm.nd.array(filter_np, ctx)
+        depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
+                                                     dtype=DepthwiseConv2d.dtype), ctx)
+        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
+        # launch kernel 1 (depthwise_conv2d)
+        f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
+        # launch kernel 2 (depthwise_conv2d + relu)
+        f2(input_tvm, filter_tvm, relu_tvm)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+
+    # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend.
+    for device in ["llvm"]:
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
+
 
 def test_depthwise_conv2d():
     # mobilenet workloads
@@ -233,5 +342,12 @@ def test_depthwise_conv2d():
     # disabled because it uses too large shared memory on cuda
     # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
 
+    # NCHW[x]c
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID")
+
+
 if __name__ == "__main__":
     test_depthwise_conv2d()
diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py
index 18f1117dc68a..9f8692c3981e 100644
--- a/tutorials/autotvm/tune_nnvm_x86.py
+++ b/tutorials/autotvm/tune_nnvm_x86.py
@@ -117,7 +117,15 @@ def tune_kernels(tasks,
         prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # converting conv2d tasks to conv2d_NCHWc tasks
-        task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=tsk.args,
+        op_name = tsk.workload[0]
+        if op_name == 'conv2d':
+            func_create = 'topi_x86_conv2d_NCHWc'
+        elif op_name == 'depthwise_conv2d_nchw':
+            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
+        else:
+            raise ValueError("Tuning {} is not supported on x86".format(op_name))
+
+        task = autotvm.task.create(func_create, args=tsk.args,
                                    target=target, template_key='direct')
         task.workload = tsk.workload
 

From fe6510c3e073b8058aa95417aad27efab918256e Mon Sep 17 00:00:00 2001
From: "Bob.Liu" <bofangliu@tuputech.com>
Date: Tue, 13 Nov 2018 12:53:39 +0800
Subject: [PATCH 347/529] [FRONTEND][ONNX]add Pad, ReduceMax, ReduceMin,
 ReduceMean and ReduceSum OP (#2061)

* add Pad,ReduceMax,ReduceMin,ReduceMean,ReduceSum for onnx frontend

* fixed pylint error and warning for frontend.onnx file

* add implement v2 for Pad in onnx frontend

* compatible with python 3.x

* disable too-many-lines pylint check in frontend onnx

* use random values instead in onnx frontend testing
---
 nnvm/python/nnvm/frontend/onnx.py             |  71 +++++++----
 .../python/frontend/onnx/test_forward.py      | 116 ++++++++++++++++++
 2 files changed, 161 insertions(+), 26 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 34ab5cab7c06..097909de1a8d 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -1,4 +1,4 @@
-# pylint: disable=import-self, invalid-name, unused-argument
+# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines
 """ONNX: Open Neural Network Exchange frontend."""
 from __future__ import absolute_import as _abs
 import numpy as np
@@ -31,10 +31,9 @@ def get_converter(cls, opset):
             max([i for i, v in enumerate(versions) if v == opset]) - 1]
         if hasattr(cls, '_impl_v{}'.format(version)):
             return getattr(cls, '_impl_v{}'.format(version))
-        else:
-            raise NotImplementedError(
-                'opset version {} of {} not implemented'.format(
-                    version, cls.__name__))
+        raise NotImplementedError(
+            'opset version {} of {} not implemented'.format(
+                version, cls.__name__))
 
 
 class Elemwise(OnnxOpConverter):
@@ -200,22 +199,44 @@ class Mul(Elemwise):
 
 
 class Pad(OnnxOpConverter):
+    """ Operator converter for Pad.
+    """
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        # get number of channels
-        channels = _infer_channels(inputs[1], params, True)
-        attr['channels'] = channels
-        groups = attr.pop('group')
-        attr['groups'] = groups
+        pad_width = []
+        pads = attr.pop('paddings')
+        dims = int(len(pads) / 2)
+        for i in range(dims):
+            pad_width.append((pads[i], pads[i+dims]))
+        attr['pad_width'] = pad_width
+
+        return AttrCvt(
+            op_name='pad',
+            transforms={
+                'value': 'pad_value',
+            },
+            ignores=['mode'],
+            custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant',
+                          'split mode != constant'))(inputs, attr, params)
+
+    @classmethod
+    def _impl_v2(cls, inputs, attr, params):
+        pad_width = []
+        pads = attr.pop('pads')
+        dims = int(len(pads) / 2)
+        for i in range(dims):
+            pad_width.append((pads[i], pads[i+dims]))
+        attr['pad_width'] = pad_width
+
         return AttrCvt(
             op_name='pad',
             transforms={
                 'value': 'pad_value',
-                'pads': 'pad_width'
             },
-            custom_check=lambda attrs: attrs.get('mode') == 'constant')(
-                inputs, attr, params)
+            ignores=['mode'],
+            custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant',
+                          'split mode != constant'))(inputs, attr, params)
 
 
 class ParametricSoftPlus(OnnxOpConverter):
@@ -368,8 +389,7 @@ def _impl(attr):
         kernel = attr['kernel_shape']
         if len(kernel) == 2:
             return prefix + '2d' + surfix
-        else:
-            raise NotImplementedError("Only 2d kernel supported.")
+        raise NotImplementedError("Only 2d kernel supported.")
 
     return _impl
 
@@ -659,14 +679,13 @@ def _impl_v1(cls, inputs, attr, params):
                 transforms={'value': 'fill_value'},
                 ignores=['dtype'])(inputs, attr)
             return _sym.cast(out, dtype=attr['dtype'].decode("utf-8"))
-        else:
-            if 'extra_shape' in attr:
-                shape = shape + attr.pop('extra_shape')
+        if 'extra_shape' in attr:
+            shape = shape + attr.pop('extra_shape')
 
-            return AttrCvt(
-                op_name='full',
-                transforms={'value': 'fill_value'},
-                extras={'shape':shape})(inputs, attr)
+        return AttrCvt(
+            op_name='full',
+            transforms={'value': 'fill_value'},
+            extras={'shape':shape})(inputs, attr)
 
 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -758,10 +777,10 @@ def _get_convert_map(opset):
         'LRN': LRN.get_converter(opset),
 
         # defs/reduction
-        'ReduceMax': AttrCvt('max', {'axes', 'axis'}),
-        'ReduceMin': AttrCvt('min', {'axes', 'axis'}),
-        'ReduceSum': AttrCvt('sum', {'axes', 'axis'}),
-        # 'ReduceMean'
+        'ReduceMax': AttrCvt('max', {'axes': 'axis'}),
+        'ReduceMin': AttrCvt('min', {'axes': 'axis'}),
+        'ReduceSum': AttrCvt('sum', {'axes': 'axis'}),
+        'ReduceMean': AttrCvt('mean', {'axes': 'axis'}),
         # 'ReduceProd'
         # 'ReduceLogSumExp'
         'ArgMax': ArgMax.get_converter(opset),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 41b1703db215..022dc4a0fd7b 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -712,6 +712,117 @@ def test_constantfill():
     verify_constantfill(False, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
     verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5, 4, 5, 6), 10, 'float32', extra_shape=(4, 5, 6))
 
+
+def verify_pad(indata, pads, value=0.0):
+    indata = np.array(indata).astype(np.float32)
+    #  numpy expect result
+    len_dim = len(pads) // 2
+    np_pads = [(pads[i], pads[i+len_dim]) for i in range(len_dim)]
+    outdata = np.pad(indata, pad_width=np_pads, mode='constant', constant_values=value)
+    #  onnx graph
+    node = helper.make_node(
+        'Pad',
+        inputs=['input'],
+        outputs=['output'],
+        mode='constant',
+        pads=pads,
+        value=value
+    )
+    graph = helper.make_graph([node],
+                              'pad_test',
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='pad_test')
+    #  tvm result
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
+    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_pad():
+    verify_pad(np.random.randn(2, 2).astype(np.float32), [0, 1, 0, 0], 0.0)
+    verify_pad(np.random.randn(2, 3).astype(np.float32), [1, 0, 0, 1], 0.0)
+    verify_pad(np.random.randn(3, 2).astype(np.float32), [0, 0, 1, 0], 5.0)
+
+def verify_reduce_x(name, indata, axis, keepdims):
+    indata = np.array(indata).astype(np.float32)
+    #  numpy expect result
+    if name == 'ReduceMax':
+        outdata = np.maximum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMin':
+        outdata = np.minimum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceSum':
+        outdata = np.sum(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMean':
+        outdata = np.mean(indata, axis=axis, keepdims=keepdims == 1)
+    else:
+        raise Exception('unsupport op: {}'.format(name))
+    if len(np.asarray(outdata).shape) == 0:
+        outdata = np.asarray([outdata])
+    #  onnx graph
+    if axis is None:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                keepdims=keepdims)
+    else:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                axis=axis, keepdims=keepdims)
+    graph = helper.make_graph([node],
+                              '{}_test'.format(name),
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='{}_test'.format(name))
+    #  tvm result
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
+    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_reduce_max():
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_min():
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_sum():
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_mean():
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
 def verify_split(indata, outdatas, split, axis=0):
     indata = np.array(indata).astype(np.float32)
     outdatas = [np.array(o).astype(np.float32) for o in outdatas]
@@ -772,4 +883,9 @@ def test_split():
     test_forward_arg_min_max()
     test_softmax()
     test_constantfill()
+    test_pad()
+    test_reduce_max()
+    test_reduce_min()
+    test_reduce_sum()
+    test_reduce_mean()
     test_split()

From 528f684f5afd1090877df1974b0b1f265f745ed7 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 14 Nov 2018 02:36:36 +0800
Subject: [PATCH 348/529] [RELAY] Fix type info after mutation in simplify
 inference (#2093)

---
 src/relay/pass/simplify_inference.cc          | 27 ++++++++++++++-----
 .../relay/test_pass_simplify_inference.py     |  6 ++---
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/relay/pass/simplify_inference.cc b/src/relay/pass/simplify_inference.cc
index 785b486ddc06..6acf4e65b1ac 100644
--- a/src/relay/pass/simplify_inference.cc
+++ b/src/relay/pass/simplify_inference.cc
@@ -15,7 +15,8 @@ Expr BatchNormToInferUnpack(const Attrs attrs,
                             Expr gamma,
                             Expr beta,
                             Expr moving_mean,
-                            Expr moving_var) {
+                            Expr moving_var,
+                            Type tdata) {
   const auto param = attrs.as<BatchNormAttrs>();
   Expr epsilon = MakeConstantScalar(Float(32), static_cast<float>(param->epsilon));
   Expr var_add_eps = Add(moving_var, epsilon);
@@ -32,9 +33,11 @@ Expr BatchNormToInferUnpack(const Attrs attrs,
   }
 
   int axis = param->axis;
-  const auto* tdata = data->type_as<TensorTypeNode>();
-  scale = ExpandBiasToMatchAxis(scale, tdata->shape.size(), {axis});
-  shift = ExpandBiasToMatchAxis(shift, tdata->shape.size(), {axis});
+  auto ttype = tdata.as<TensorTypeNode>();
+  CHECK(ttype);
+  auto ndim = ttype->shape.size();
+  scale = ExpandBiasToMatchAxis(scale, ndim, {axis});
+  shift = ExpandBiasToMatchAxis(shift, ndim, {axis});
 
   Expr out = Multiply(data, scale);
   out = Add(out, shift);
@@ -54,14 +57,26 @@ class InferenceSimplifier : public ExprMutator {
     }
     if (const auto* call = new_n->tuple.as<CallNode>()) {
       if (call->op.same_as(batch_norm)) {
-        return BatchNormToInferUnpack(call->attrs,
-          call->args[0], call->args[1], call->args[2], call->args[3], call->args[4]);
+        return BatchNormToInferUnpack(call->attrs, call->args[0], call->args[1], call->args[2],
+                                      call->args[3], call->args[4], ty_map_.at(call->args[0]));
       } else if (call->op.same_as(dropout)) {
         return call->args[0];
       }
     }
     return new_e;
   }
+
+  Expr VisitExpr_(const CallNode* n) {
+    static const Op& batch_norm = Op::Get("nn.batch_norm");
+    auto new_n = ExprMutator::VisitExpr_(n);
+    if (n->op.same_as(batch_norm)) {
+      ty_map_[new_n.as<CallNode>()->args[0]] = n->args[0]->checked_type();
+    }
+    return new_n;
+  }
+
+ private:
+  std::unordered_map<Expr, Type, NodeHash, NodeEqual> ty_map_;
 };
 
 Expr SimplifyInference(const Expr& e) {
diff --git a/tests/python/relay/test_pass_simplify_inference.py b/tests/python/relay/test_pass_simplify_inference.py
index 9830b83dc6e5..7585a88063ab 100644
--- a/tests/python/relay/test_pass_simplify_inference.py
+++ b/tests/python/relay/test_pass_simplify_inference.py
@@ -30,12 +30,12 @@ def check(dim, axis, nstep):
             y1, _, _ = rly.nn.batch_norm(y1 + rly.const(1, 'float32'),
                 gamma, beta, moving_mean, moving_var, epsilon=eps, axis=axis)
             y1 = rly.nn.dropout(y1)
-            y1 = rly.ir_pass.infer_type(y1)
-            y1 = simplify_inference(y1)
-
             y2 = simple_bn(y2 + rly.const(1, 'float32'),
                            gamma, beta, moving_mean, moving_var,
                            epsilon=eps, axis=axis, shape=ttype1.shape)
+        y1 = rly.ir_pass.infer_type(y1)
+        y1 = simplify_inference(y1)
+
         assert rly.ir_pass.graph_equal(y1, y2)
 
     check(2, 1, 1)

From b4cd00bf01f1175fdb3351af24ffcbb2a59f09c0 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 13 Nov 2018 13:31:45 -0800
Subject: [PATCH 349/529] [RELAY][PASS] General OpFusion. (#2090)

---
 include/tvm/relay/expr.h                      |  10 +
 include/tvm/runtime/packed_func.h             |   2 +
 python/tvm/relay/base.py                      |   9 +-
 python/tvm/relay/build_module.py              |   4 +-
 python/tvm/relay/expr.py                      |   3 +-
 python/tvm/relay/ir_pass.py                   |   7 +-
 src/common/arena.h                            |  58 +-
 src/relay/backend/compile_engine.cc           |  23 +
 src/relay/ir/text_printer.cc                  |  26 +-
 src/relay/pass/fold_scale_axis.cc             |  18 +-
 src/relay/pass/fuse_ops.cc                    | 747 +++++++++++++++++-
 src/relay/pass/pass_util.h                    |  27 +
 src/relay/pass/type_infer.cc                  |  23 +-
 src/relay/pass/type_solver.cc                 |   6 +-
 src/relay/pass/type_solver.h                  |  51 +-
 src/relay/pass/util.cc                        |  18 +
 tests/python/relay/test_ir_text_printer.py    |   1 +
 .../python/relay/test_pass_fold_scale_axis.py |   8 +
 tests/python/relay/test_pass_fuse_ops.py      |  98 ++-
 19 files changed, 1026 insertions(+), 113 deletions(-)
 create mode 100644 src/relay/pass/pass_util.h

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 1a5470489ce2..2319f8baec00 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -429,6 +429,16 @@ inline const TTypeNode* ExprNode::type_as() const {
   return node;
 }
 
+/*!
+ * \brief Print node as text format.
+ * \param node The node to be printed.
+ * \param annotate An optional callback function for attaching
+ *        additional comment block to an expr.
+ * \return The text representation.
+ */
+std::string RelayPrint(
+    const NodeRef& node,
+    runtime::TypedPackedFunc<std::string(Expr)> annotate = nullptr);
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 59ad52ccf3fd..f25785d39eeb 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -161,6 +161,8 @@ class TypedPackedFunc<R(Args...)> {
   using TSelf = TypedPackedFunc<R(Args...)>;
   /*! \brief default constructor */
   TypedPackedFunc() {}
+  /*! \brief constructor from null */
+  TypedPackedFunc(std::nullptr_t null) {}  // NOLINT(*)
   /*!
    * \brief construct by wrap a PackedFunc
    *
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index 5a92eb57d209..012315b40f51 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -22,15 +22,20 @@ def register_relay_node(type_key=None):
 
 
 class RelayNode(NodeBase):
-    def astext(self):
+    """Base class of all relay node."""
+    def astext(self, annotate=None):
         """Get the text format of the expression.
 
         Returns
         -------
         text : str
             The text format of the expression.
+
+        annotate: Optional[relay.Expr->str]
+            Optional annotate function to provide additional
+            information in the comment block.
         """
-        return _expr._text_print(self)
+        return _expr.RelayPrint(self, annotate)
 
 
 @register_relay_node
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index c48ec90e9e12..0f33e86ab5cd 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -173,11 +173,13 @@ def build(func,
     else:
         tophub_context = autotvm.util.EmptyContext()
 
+    cfg = BuildConfig.current
+
     with tophub_context:
         func = optimize(func)
         # Fuse ops before running code gen
         func = ir_pass.infer_type(func)
-        func = ir_pass.fuse_ops(func)
+        func = ir_pass.fuse_ops(func, cfg.opt_level)
         # Graph code generation
         func = ir_pass.infer_type(func)
         graph_gen = _graph_gen.GraphRuntimeCodegen(mod=None, target=target)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 43cff0bac57a..f82ea09a102a 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -6,7 +6,6 @@
 import numpy as _np
 from .base import RelayNode, register_relay_node
 from . import _make
-from . import _expr
 from . import ty as _ty
 from .._ffi import base as _base
 from .. import nd as _nd
@@ -477,7 +476,7 @@ def astext(self):
         text : str
             The text format of the tuple expression.
         """
-        return _expr._text_print(self.tuple_value)
+        return self.tuple_value.astext()
 
     def __getitem__(self, index):
         if index >= len(self):
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 274761f0a27b..b1a76d6fae6f 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -259,7 +259,7 @@ def structural_hash(value):
         raise TypeError(msg)
 
 
-def fuse_ops(expr):
+def fuse_ops(expr, opt_level=1):
     """Fuse operators in expr together.
 
     Parameters
@@ -267,9 +267,12 @@ def fuse_ops(expr):
     expr : tvm.relay.Expr
         The input expression.
 
+    opt_level : int
+        The level of fuse optimization.
+
     Returns
     -------
     transformed_expr : tvm.relay.Expr
         Transformed expression, containing fused result.
     """
-    return _ir_pass.FuseOps(expr)
+    return _ir_pass.FuseOps(expr, opt_level)
diff --git a/src/common/arena.h b/src/common/arena.h
index e8d4b2e23e37..c5da093a70b8 100644
--- a/src/common/arena.h
+++ b/src/common/arena.h
@@ -38,11 +38,29 @@ class Arena {
   /*!
    * \brief Allocate a space from Arena for type T
    * \param T the data type to be allocated
+   * \note The space of T is not initialized.
    */
   template<typename T>
-  T* Alloc() {
+  T* allocate_() {
     return static_cast<T*>(Alloc(sizeof(T), alignof(T)));
   }
+  /*!
+   * \brief Create a new instance of type T.
+   * \param args The constructor argument.
+   * \tparam T the type to be created.
+   * \tparam Args Arguments to the constructor.
+   *
+   * \return The allocated object.
+   * \note The type T must be simple type, or only contain
+   *  memory allocated from the same arena.
+   *  Otherwise the destructor needs to be called explicitly.
+   */
+  template<typename T, typename... Args>
+  T* make(Args&&... args) {
+    T* ptr = allocate_<T>();
+    new (ptr) T(std::forward<Args>(args)...);
+    return ptr;
+  }
 
  private:
   // page size 16 KB
@@ -87,6 +105,44 @@ class Arena {
   }
 };
 
+/*!
+ * \brief Link list node
+ * \tparam T the content data type
+ */
+template<typename T>
+struct LinkNode {
+  /*! \brief The content value */
+  T value;
+  /*! \brief pointer to the next location */
+  LinkNode<T>* next{nullptr};
+};
+/*!
+ * \brief LinkedList structure
+ * \tparam T the content data type
+ * \note This is a simple data structure that can be used together with the arena.
+ * \sa LinkNode
+ */
+template<typename T>
+struct LinkedList {
+  /*! \brief Head pointer */
+  LinkNode<T>* head{nullptr};
+  /*! \brief Tail pointer */
+  LinkNode<T>* tail{nullptr};
+  /*!
+   * \brief Push a new node to the end of the linked list.
+   * \param node The node to be pushed.
+   */
+  void Push(LinkNode<T>* node) {
+    node->next = nullptr;
+    if (this->tail != nullptr) {
+      this->tail->next = node;
+      this->tail = node;
+    } else {
+      head = tail = node;
+    }
+  }
+};
+
 }  // namespace common
 }  // namespace tvm
 #endif  // TVM_COMMON_ARENA_H_
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 38e3f6c2a7b8..dc094e00e05b 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -109,6 +109,29 @@ class ScheduleGetter :
     return {};
   }
 
+  Array<Tensor> VisitExpr_(const ConstantNode* op) final {
+    CHECK(op->is_scalar());
+    void* data = op->data->data;
+    DataType dtype = TVMType2Type(op->data->dtype);
+    Tensor value = tvm::compute({}, [&](const Array<tvm::Var>&) {
+        if (dtype == Int(32)) {
+          return make_const(dtype, static_cast<const int32_t*>(data)[0]);
+        } else if (dtype == Int(64)) {
+          return make_const(dtype, static_cast<const int64_t*>(data)[0]);
+        } else if (dtype == Float(32)) {
+          return make_const(dtype, static_cast<const float*>(data)[0]);
+        } else if (dtype == Float(64)) {
+          return make_const(dtype, static_cast<const double*>(data)[0]);
+        } else if (dtype == Bool()) {
+          return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
+        } else {
+          LOG(FATAL) << "not handled";
+          return tvm::Expr();
+        }
+      });
+    return {value};
+  }
+
   Array<Tensor> VisitExpr_(const CallNode* call_node) final {
     static auto fcompute =
         Op::GetAttr<FTVMCompute>("FTVMCompute");
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index f28db371706e..93ed76bed3c2 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -125,6 +125,8 @@ class TextPrinter :
     public TypeFunctor<void (const Type&, std::ostream& os)>,  // NOLINT(*)
     public AttrFunctor<void (const NodeRef&, std::ostream& os)> { // NOLINT(*)
  public:
+  explicit TextPrinter(runtime::TypedPackedFunc<std::string(Expr)> annotate)
+      : annotate_(annotate) {}
   /*!
    * \brief Print a node to string.
    * \param node.
@@ -279,11 +281,11 @@ class TextPrinter :
 
   TextValue VisitExpr_(const CallNode* op) final {
     // possibly through meta-data
-    TextValue call_op = GetValue(op->op);
     std::vector<TextValue> args;
     for (Expr arg : op->args) {
       args.emplace_back(GetValue(arg));
     }
+    TextValue call_op = GetValue(op->op);
     TextValue id = this->AllocTempVar();
     this->PrintIndent();
 
@@ -532,7 +534,9 @@ class TextPrinter :
    */
   void PrintOptionalInfo(const Expr& expr) {
     // additional information in comment.
-    if (expr->checked_type_.defined()) {
+    if (annotate_ != nullptr) {
+      stream_ << " # " << annotate_(expr);
+    } else if (expr->checked_type_.defined()) {
       stream_ << " # ty=";
       this->PrintType(expr->checked_type(), stream_);
     }
@@ -678,7 +682,10 @@ class TextPrinter :
       name = "%" + name;
     }
     TextValue val(GetUniqueName(name));
-    CHECK(!memo_.count(var)) << "Duplicated variable " << var;
+    // still print if ir is malformed, but show the error.
+    if (memo_.count(var)) {
+      memo_[var] = TextValue(val.name + "-malformed-ir");
+    }
     memo_[var] = val;
     return val;
   }
@@ -686,6 +693,8 @@ class TextPrinter :
  private:
   class AttrPrinter;
   friend class AttrPrinter;
+  /*! \brief additional comment function */
+  runtime::TypedPackedFunc<std::string(Expr)> annotate_;
   /*! \brief meta data context */
   TextMetaDataContext meta_;
   /*! \brief Check whether scope is still valid */
@@ -776,12 +785,15 @@ void TextPrinter::PrintCallAttrs(const Expr& op,
   os << ", " << meta_.GetMetaNode(attrs);
 }
 
-std::string RelayPrint(const NodeRef& node) {
-  return TextPrinter().Print(node);
+std::string RelayPrint(const NodeRef& node,
+                       runtime::TypedPackedFunc<std::string(Expr)> annotate) {
+  return TextPrinter(annotate).Print(node);
 }
 
-TVM_REGISTER_API("relay._expr._text_print")
-.set_body_typed<std::string(const NodeRef&)>(RelayPrint);
+TVM_REGISTER_API("relay._expr.RelayPrint")
+.set_body_typed<std::string(
+    const NodeRef&,
+    runtime::TypedPackedFunc<std::string(Expr)>)>(RelayPrint);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index e757118f33f2..038f34df5760 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -10,6 +10,7 @@
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/expr_functor.h>
 #include "pattern_util.h"
+#include "pass_util.h"
 #include "../op/nn/layout.h"
 
 namespace tvm {
@@ -580,23 +581,6 @@ using FBackwardTransform = TypedPackedFunc<
 //----------------------------------------------
 // Generic Visitors for FScaleAxisBackward
 //----------------------------------------------
-/*!
- * \brief Get reference counter of each internal ExprNode in body.
- * \param body The body expression.
- * \return The reference count mapping.
- */
-std::unordered_map<const Node*, size_t>
-GetExprRefCount(const Expr& body) {
-  class ExprRefCounter : private ExprVisitor {
-   public:
-    std::unordered_map<const Node*, size_t>
-    Get(const Expr& body) {
-      this->VisitExpr(body);
-      return std::move(this->visit_counter_);
-    }
-  };
-  return ExprRefCounter().Get(body);
-}
 
 class BackwardPrep : private ExprVisitor {
  public:
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 2bd16a4f840f..2503bd5f53fa 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -9,13 +9,686 @@
 #include <tvm/ir_operator.h>
 #include <tvm/relay/pass.h>
 #include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include "../../common/arena.h"
+
 
 namespace tvm {
 namespace relay {
 
-// Simple fuser that only makes each operator function as primitive.
-class SimpleFuser : public ExprMutator {
+/*
+  Note on Fusing algorithm:
+
+  The main challenge of genenral fusor is to handle possible diamond shape branches,
+  in the following graph, conv2d can be fused to elemwise add.
+
+            conv2d
+            /  |  \
+           /   |   \
+         op    op   op
+          \    |    /
+           \   |   /
+          elemwise add
+               |
+
+  However, at the point of conv2d we do not necessarily know that all its future path
+  will merge at the elemwise add. The new fusor algorithm applies post-dominator analysis.
+  The immediate post-dominator of a node defined by the closest node where all the future path goes into.
+  In the above case, the elemwise add is the post-dominator of conv2d. The general algorithm is as follows:
+
+  - Construct a DAG of dataflow graph for dominator analysis
+  - Construct a post-dominator tree which gives immediate post dominator of each node.
+  - Run fusion algorithm with the given post-dominator information.
+
+  Note that, because we run analysis on a DAG, we use a single pass post-dominator
+  tree construction algorithm via LCA, which is simpler than the full version that handles cycles.
+
+  The fusion algorithm traverses from each node and checks if it can be fused to its
+  immediate post dominator. It has to check the following things:
+
+  - CheckPath: check all the path between a node and its immediate post-dominator
+               satiesfies the fuse condition.
+  - Note that these intermediate node can already be fused with another nodes, the algorithm
+      will still run correctly.
+  - CommitFuse: mark all the nodes between source and post-dominator as the same group.
+  - We use an Union-Find data structure to manage the groups.
+*/
+using common::LinkNode;
+using common::LinkedList;
+
+/*!
+ * \brief Indexed data flow graph in forward direction.
+ *  This is a temporary data structure used for operator fusion analysis.
+ *
+ *  This data structure only captures the dataflow fragement and
+ *  could ignore blocks like let by simply ordering each dataflow block
+ *  and mark the output node as extern_ref;
+ */
+class IndexedForwardGraph {
+ public:
+  struct Node;
+  /*!
+   * The forward edge in the dataflow graph.
+   */
+  struct Edge {
+    /*! \brief The corresponding node */
+    Node* node{nullptr};
+    /*! \brief The respective pattern of this op */
+    OpPatternKind pattern{kOpaque};
+  };
+  /*! \brief A node in the graph. */
+  struct Node {
+    /*! \brief weak reference to the corresponding edge. */
+    const tvm::Node* ref{nullptr};
+    /*! \brief The index of the node in topological order. */
+    size_t index{0};
+    /*! \brief Whether this node is referenced by external source */
+    bool extern_ref{false};
+    /*! \brief The general pattern in the node */
+    OpPatternKind pattern{kOpaque};
+    /*! \brief The outputs of the node. */
+    LinkedList<Edge> outputs;
+  };
+  /*! \brief The node map that maps node to graph */
+  std::unordered_map<const tvm::Node*, Node*> node_map;
+  /*! \brief All the nodes in post DFS order */
+  std::vector<Node*> post_dfs_order;
+
+  /*! \brief Dump the graph into string. */
+  void DebugDump() {
+    std::ostringstream os;
+    for (size_t i = 0; i < post_dfs_order.size(); ++i) {
+      Node* node = post_dfs_order[i];
+      os << "node[" << i << "], "
+         << GetRef<NodeRef>(node->ref)
+         << " outputs=[";
+      for (auto* link = node->outputs.head; link != nullptr; link = link->next) {
+        os << link->value.node->index << ", ";
+      }
+      os << "]\n";
+    }
+    LOG(INFO) << os.str();
+  }
+  /*!
+   * \brief create a indexed forward graph.
+   * \param arena The arena used for data allocation.
+   * \param body The body of the expression to create a graph.
+   */
+  static IndexedForwardGraph Create(common::Arena* arena, const Expr& body);
+
+ private:
+  class Creator;
+};
+
+// Creator of post dominator tree of the dataflow
+class IndexedForwardGraph::Creator : private ExprVisitor {
+ public:
+  explicit Creator(common::Arena* arena)
+      : arena_(arena) {}
+
+  IndexedForwardGraph Prepare(const Expr& body) {
+    this->Update(body, nullptr, kOpaque);
+    this->VisitExpr(body);
+    return std::move(graph_);
+  }
+
+ private:
+  /*! \brief allocator of all the internal node object */
+  common::Arena* arena_;
+  // The output.
+  IndexedForwardGraph graph_;
+  // attribute equal comparator
+  AttrsEqual attr_equal_;
+  // Update the message stored at the node.
+  void Update(const Expr& node,
+              IndexedForwardGraph::Node* parent,
+              OpPatternKind pattern) {
+    const tvm::Node* key = node.get();
+    IndexedForwardGraph::Node* current;
+    auto it = graph_.node_map.find(key);
+    if (it != graph_.node_map.end()) {
+      current = it->second;
+    } else {
+      current = arena_->make<IndexedForwardGraph::Node>();
+      graph_.node_map[key] = current;
+    }
+    if (parent != nullptr) {
+      auto* link = arena_->make<LinkNode<IndexedForwardGraph::Edge> >();
+      link->value.node = parent;
+      link->value.pattern = pattern;
+      current->outputs.Push(link);
+    } else {
+      current->extern_ref = true;
+    }
+  }
+  void AddNode(const tvm::Node* key) {
+    auto it = graph_.node_map.find(key);
+    CHECK(it != graph_.node_map.end())
+        << "Cannot find node " << GetRef<NodeRef>(key);
+    IndexedForwardGraph::Node* node = it->second;
+    CHECK(node->ref == nullptr);
+    node->ref = key;
+    node->index = graph_.post_dfs_order.size();
+    graph_.post_dfs_order.push_back(node);
+  }
+
+  // Post order tree
+  void VisitExpr_(const FunctionNode* op) {
+    for (auto param : op->params) {
+      this->Update(param, nullptr, kOpaque);
+    }
+    this->Update(op->body, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+  }
+
+  void VisitExpr_(const ConstantNode* op) {
+    this->AddNode(op);
+    Node* node = graph_.node_map.at(op);
+    DataType dtype = TVMType2Type(op->data->dtype);
+    // This rule must be consistent with code generator.
+    bool is_simple_const = (
+        dtype == Int(32) ||
+        dtype == Int(64) ||
+        dtype == Float(32) ||
+        dtype == Float(64) ||
+        dtype == Bool());
+    if (op->is_scalar() && is_simple_const) {
+      node->pattern = kElemWise;
+    } else {
+      // for now, mark non-scalar constant
+      // as opaque, we will not choose to fuse it.
+      node->pattern = kOpaque;
+    }
+  }
+
+  void VisitExpr_(const CallNode* call) {
+    CHECK(graph_.node_map.count(call));
+    Node* node = graph_.node_map.at(call);
+    static auto fpattern =
+        Op::GetAttr<TOpPattern>("TOpPattern");
+    // setup pattern.
+    OpPatternKind op_pattern = kOpaque;
+    if (const OpNode* opnode = call->op.as<OpNode>()) {
+      op_pattern = static_cast<OpPatternKind>(fpattern[GetRef<Op>(opnode)]);
+    }
+    node->pattern = op_pattern;
+    const auto* rtype = call->checked_type().as<TensorTypeNode>();
+    // pass the message back to all the children it references.
+    for (size_t i = 0; i < call->args.size(); ++i) {
+      const auto* arg_type =
+          call->args[i]->checked_type().as<TensorTypeNode>();
+      // specifically check if result type
+      OpPatternKind edge_pattern = op_pattern;
+      if (edge_pattern == kBroadcast &&
+          arg_type != nullptr &&
+          rtype != nullptr &&
+          attr_equal_(rtype->shape, arg_type->shape)) {
+        edge_pattern = kElemWise;
+      }
+      this->Update(call->args[i], node, edge_pattern);
+    }
+    ExprVisitor::VisitExpr_(call);
+    this->AddNode(call);
+  }
+
+  void VisitExpr_(const TupleNode* op) {
+    for (const Expr& field : op->fields) {
+      this->Update(field, nullptr, kOpaque);
+    }
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) {
+    CHECK(graph_.node_map.count(op));
+    Node* node = graph_.node_map.at(op);
+    this->Update(op->tuple, node, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const VarNode* op) {
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const LetNode* op) {
+    // do not fuse through let.
+    this->Update(op->var, nullptr, kOpaque);
+    this->Update(op->value, nullptr, kOpaque);
+    this->Update(op->body, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const IfNode* op) {
+    // do not fuse through if.
+    this->Update(op->cond, nullptr, kOpaque);
+    this->Update(op->true_branch, nullptr, kOpaque);
+    this->Update(op->false_branch, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+};
+
+IndexedForwardGraph IndexedForwardGraph::Create(
+    common::Arena* arena, const Expr& body) {
+  return Creator(arena).Prepare(body);
+}
+
+/*!
+ * \brief Dominator tree that represent domination or
+ *  post domination relation of the node.
+ */
+class DominatorTree {
  public:
+  /*!
+   * \brief A node in the dominator tree.
+   */
+  struct Node {
+    /*! \brief The node in the tree */
+    IndexedForwardGraph::Node* gnode{nullptr};
+    /*! \brief parent of the tree */
+    Node* parent{nullptr};
+    /*! \brief current depth*/
+    int depth{0};
+    /*! \brief aggregated pattern to parent */
+    OpPatternKind pattern{kOpaque};
+  };
+  // index -> node.
+  std::vector<Node*> nodes;
+  /*!
+   * \brief compute a post dominator relation for a given dataflow graph.
+   * \param arena The arena used for node allocation.
+   * \param graph The graph to be analyze.
+   * \return The dominator tree of the graph.
+   * \note This algorithm makes use of the fact that graph is DAG,
+   *       and runs a single pass algorithm via LCA.
+   */
+  static DominatorTree PostDom(common::Arena* arena,
+                               const IndexedForwardGraph& graph);
+
+ private:
+  // Combine pattern together.
+  static OpPatternKind CombinePattern(
+      OpPatternKind lhs, OpPatternKind rhs) {
+    if (lhs > rhs) return lhs;
+    return rhs;
+  }
+  /*!
+   * \brief Find the least common acenstor of the two nodes.
+   * \param lhs The left node.
+   * \param rhs The right node.
+   * \param edge_pattern
+   *        The combined edge pattern across all the parents.
+   * \return The least common acenstor of thw two.
+   */
+  static Node* LeastCommonAcenstor(
+      Node* lhs,
+      Node* rhs,
+      OpPatternKind* edge_pattern) {
+    while (lhs != rhs) {
+      if (lhs == nullptr) return nullptr;
+      if (rhs == nullptr) return nullptr;
+      if (lhs->depth < rhs->depth) {
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], rhs->pattern);
+        rhs = rhs->parent;
+      } else if (rhs->depth < lhs->depth) {
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], lhs->pattern);
+        lhs = lhs->parent;
+      } else {
+        lhs = lhs->parent;
+        rhs = rhs->parent;
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], lhs->pattern);
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], rhs->pattern);
+      }
+    }
+    return lhs;
+  }
+};
+
+DominatorTree DominatorTree::PostDom(common::Arena* arena,
+                                     const IndexedForwardGraph& graph) {
+  DominatorTree tree;
+  tree.nodes.resize(graph.post_dfs_order.size(), nullptr);
+  // reverse topo order
+  for (size_t i = graph.post_dfs_order.size(); i != 0; --i) {
+    size_t index = i - 1;
+    Node* tnode = arena->make<Node>();
+    auto* gnode = graph.post_dfs_order[index];
+    tnode->gnode = gnode;
+    if (gnode->extern_ref) {
+      tnode->depth = 1;
+      tnode->parent = nullptr;
+      tnode->pattern = kOpaque;
+    } else {
+      // find the LCAs of all outputs.
+      OpPatternKind pattern = kElemWise;
+      Node* parent = nullptr;
+      for (auto link = gnode->outputs.head; link != nullptr; link= link->next) {
+        size_t oindex = link->value.node->index;
+        CHECK_LT(oindex, tree.nodes.size());
+        Node* onode = tree.nodes[oindex];
+        CHECK(onode != nullptr);
+        if (parent != nullptr) {
+          parent = LeastCommonAcenstor(parent, onode, &pattern);
+        } else {
+          parent = onode;
+        }
+        pattern = CombinePattern(pattern, link->value.pattern);
+      }
+      CHECK(parent != nullptr);
+      tnode->depth = parent->depth + 1;
+      tnode->parent = parent;
+      tnode->pattern = pattern;
+    }
+    tree.nodes[index] = tnode;
+  }
+  return tree;
+}
+
+/*!
+ * \brief A partition of the graph marked by union find data structure.
+ */
+class GraphPartitioner {
+ public:
+  explicit GraphPartitioner(common::Arena* arena, int opt_level)
+      : arena_(arena), opt_level_(opt_level) {}
+  /*!
+   * \brief Group as a union find data structure.
+   */
+  struct Group {
+    /*! \brief The parent in the union find data structure. */
+    Group* parent{nullptr};
+    /*! \brief The pattern of the group */
+    OpPatternKind pattern;
+    /*! \brief reference to the root node. */
+    const tvm::Node* root_ref{nullptr};
+    /*!
+     * \brief Reference to the master node,
+     * this field is not nullptr only if pattern is kOutEWiseFusable.
+     */
+    const tvm::Node* master_ref{nullptr};
+    /*!
+     * \brief Find the group root, perform path compression
+     * \return The root type node.
+     */
+    Group* FindRoot() {
+      // fast path
+      if (this->parent == nullptr) return this;
+      // slow path with path compression.
+      Group* root = this;
+      while (root->parent != nullptr) {
+        root = root->parent;
+      }
+      for (Group* p = this; p != root;) {
+        Group* parent = p->parent;
+        p->parent = root;
+        p = parent;
+      }
+      return root;
+    }
+  };
+  /*!
+   * \brief Partition a graph.
+   * \return group assignments of each node.
+   */
+  std::vector<Group*> Partition(const IndexedForwardGraph& graph);
+
+ private:
+  /*! \brief The internal arena for temporary space. */
+  common::Arena* arena_;
+  /*! \brief optimization level for fuse operation. */
+  int opt_level_;
+  /*! \brief The internal groups. */
+  std::vector<Group*> groups_;
+  /*! \brief internal field used for deduplication */
+  std::unordered_set<IndexedForwardGraph::Node*> visited_;
+  // Internal implelementation of CheckPath
+  template<typename F>
+  bool CheckPath_(IndexedForwardGraph::Node* src,
+                  IndexedForwardGraph::Node* sink,
+                  F fcond) {
+    if (visited_.count(src)) return true;
+    visited_.insert(src);
+    Group* gnode =  groups_[src->index];
+    CHECK(gnode != nullptr);
+    gnode = gnode->FindRoot();
+    if (!fcond(gnode->pattern, src == sink)) return false;
+    if (src == sink) return true;
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      if (!CheckPath_(link->value.node, sink, fcond)) return false;
+    }
+    return true;
+  }
+  /*!
+   * \brief Check all the node between src and sink satisfies fcond.
+   *
+   * src and sink are not checked.
+   *
+   * \param src The source node.
+   * \param sink The termination node.
+   * \param fcond The condition to be checked.
+   * \tparam F the condition function.
+   * \note sink must be a post-dominator of src.
+   */
+  template<typename F>
+  bool CheckPath(IndexedForwardGraph::Node* src,
+                 IndexedForwardGraph::Node* sink,
+                 F fcond) {
+    CHECK(!src->extern_ref);
+    visited_.clear();
+    CHECK(src != sink);
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      if (!CheckPath_(link->value.node, sink, fcond)) return false;
+    }
+    return true;
+  }
+  // Combine two patterns together.
+  static OpPatternKind CombinePattern(
+      OpPatternKind lhs, OpPatternKind rhs) {
+    if (lhs > kBroadcast && rhs > kBroadcast) {
+      LOG(FATAL) << "Cannot merge two complex group together";
+    }
+    if (lhs > rhs) return lhs;
+    return rhs;
+  }
+  /*!
+   * \brief Merge the child group to the parent.
+   * \param child The child group.
+   * \param parent The parent group.
+   */
+  void MergeFromTo(Group* child, Group* parent) {
+    child = child->FindRoot();
+    parent = parent->FindRoot();
+    if (child == parent) return;
+    child->parent = parent;
+    // update master ref and pattern
+    if (child->master_ref != nullptr) {
+      CHECK(parent->master_ref == nullptr);
+      parent->master_ref = child->master_ref;
+      parent->pattern = CombinePattern(
+          child->pattern, parent->pattern);
+    }
+  }
+  // Internal implelementation of CommitFuse
+  void CommitFuse_(IndexedForwardGraph::Node* src,
+                   IndexedForwardGraph::Node* sink,
+                   Group* target) {
+    if (src == sink) return;
+    if (visited_.count(src)) return;
+    visited_.insert(src);
+    Group* gnode = groups_[src->index];
+    CHECK(gnode != nullptr);
+    // merge the current group to the parent if possible.
+    MergeFromTo(gnode, target);
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      CommitFuse_(link->value.node, sink, target);;
+    }
+  }
+  /*!
+   * \brief Commit fusion operation.
+   * \param src The source node.
+   * \param sink The termination node.
+   * \tparam group the group to be committed.
+   * \note sink must be a post-dominator of src.
+   */
+  void CommitFuse(IndexedForwardGraph::Node* src,
+                  IndexedForwardGraph::Node* sink) {
+    Group* target = groups_[sink->index];
+    visited_.clear();
+    CHECK(src != sink);
+    CommitFuse_(src, sink, target);
+  }
+
+  // Initialize the groups.
+  void InitGroups(const IndexedForwardGraph& graph) {
+    groups_.resize(graph.post_dfs_order.size());
+    for (size_t nid = 0; nid < groups_.size(); ++nid) {
+      const auto* graph_node = graph.post_dfs_order[nid];
+      auto* group_node = arena_->make<Group>();
+      group_node->pattern = graph_node->pattern;
+      group_node->root_ref = graph_node->ref;
+      // set master ref if necessary.
+      if (group_node->pattern == kOutEWiseFusable) {
+        group_node->master_ref = graph_node->ref;
+      }
+      groups_[nid] = group_node;
+    }
+  }
+
+  // execute the fusion algorithm.
+  void RunFuse(const IndexedForwardGraph& graph,
+               const DominatorTree& post_dom_tree,
+               int phase) {
+    for (size_t nid = 0; nid < groups_.size(); ++nid) {
+      // the group of current node has been specified already.
+      auto* graph_node = graph.post_dfs_order[nid];
+      auto* dom_node = post_dom_tree.nodes[nid];
+      Group* group_node = groups_[nid];
+      CHECK(group_node != nullptr);
+      // no actions for opaque nodes
+      if (group_node->pattern == kOpaque) continue;
+      // no actions needed if the current node have no dominator
+      if (dom_node->parent == nullptr) continue;
+      CHECK(!graph_node->extern_ref);
+      // Skip if current node is already fused to the parent.
+      size_t dom_parent_gindex = dom_node->parent->gnode->index;
+      if (groups_[dom_parent_gindex] != nullptr &&
+          group_node->FindRoot() == groups_[dom_parent_gindex]->FindRoot()) {
+        continue;
+      }
+      // Try to fuse current node to its post-dominator.
+      if (group_node->pattern == kOutEWiseFusable) {
+        if (phase != 0) continue;
+        // Path for OutEWiseFusable: conv2d
+        // Check if the dominator relation is elemwise.
+        if (dom_node->parent != nullptr && dom_node->pattern == kElemWise) {
+          CHECK(dom_node->parent->gnode != nullptr);
+          // The fuse can be executed if all the intermediate ops are still broadcast.
+          auto fcond = [](OpPatternKind kind, bool is_sink) {
+            return kind <= kBroadcast;
+          };
+          if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+            CommitFuse(graph_node, dom_node->parent->gnode);
+          }
+        }
+      } else if (group_node->pattern <= kBroadcast) {
+        // The fuse can be executed if all the intermediate ops are still broadcast.
+        auto fcond = [](OpPatternKind kind, bool is_sink) {
+          if (!is_sink) {
+            return kind <= kBroadcast;
+          } else {
+            return (kind <= kBroadcast ||
+                    kind == kCommReduce ||
+                    kind == kOutEWiseFusable);
+          }
+        };
+        if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+          CommitFuse(graph_node, dom_node->parent->gnode);
+        }
+      } else if (group_node->pattern == kInjective) {
+        // defer injective fusion to second phase.
+        // so conv2d always finishes fusing.
+        if (phase != 1) continue;
+        // Check if all path are injective.
+        auto fcond = [](OpPatternKind kind, bool is_sink) {
+          return kind <= kInjective;
+        };
+        if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+          CommitFuse(graph_node, dom_node->parent->gnode);
+        }
+      } else {
+        // do nothing.
+        CHECK(group_node->pattern == kCommReduce);
+      }
+    }
+  }
+};
+
+std::vector<GraphPartitioner::Group*>
+GraphPartitioner::Partition(const IndexedForwardGraph& graph) {
+  this->InitGroups(graph);
+  if (opt_level_ == 0) return std::move(groups_);
+  // get post dominator tree
+  auto post_dom_tree = DominatorTree::PostDom(arena_, graph);
+  // run fusion algorithm.
+  for (int phase = 0; phase < 2; ++phase) {
+    this->RunFuse(graph, post_dom_tree, phase);
+  }
+  return std::move(groups_);
+}
+
+class FuseMutator : private ExprMutator {
+ public:
+  // Run the transform
+  Expr Transform(const Expr& body, int fuse_opt_level) {
+    // setup the group map.
+    auto graph = IndexedForwardGraph::Create(&arena_, body);
+    auto groups = GraphPartitioner(&arena_, fuse_opt_level).Partition(
+        graph);
+    for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) {
+      CHECK(graph.post_dfs_order[nid]->ref != nullptr);
+      gmap_[graph.post_dfs_order[nid]->ref] = groups[nid];
+    }
+    // The following line can be used for debug.
+    // this->DebugDumpGroup(body);
+    return this->Mutate(body);
+  }
+
+
+ private:
+  /*! \brief Temporary information from each group. */
+  struct GroupInfo {
+   public:
+    // The parameters of the function.
+    Array<Var> params;
+    // The arguments to call the functions.
+    Array<Expr> arguments;
+    // Get a new parameter or allocate an old one
+    Var GetOrAllocParam(const Expr& expr, const Type& type) {
+      // run linear scan as most fused groups contain only a few inputs.
+      for (size_t i = 0; i < arguments.size(); ++i) {
+        if (expr.same_as(arguments[i])) return params[i];
+      }
+      // create a new parameter.
+      std::ostringstream os;
+      os << "p" << params.size();
+      auto var = VarNode::make(os.str(), type);
+      params.push_back(var);
+      arguments.push_back(expr);
+      return var;
+    }
+  };
+  /*! \brief Internal arena. */
+  common::Arena arena_;
+  /*! \brief The group assignment map. */
+  std::unordered_map<const Node*, GraphPartitioner::Group*> gmap_;
+  /* \brief Internal group information map. */
+  std::unordered_map<GraphPartitioner::Group*, GroupInfo> ginfo_;
   // Skip primitive function.
   Expr VisitExpr_(const FunctionNode* fn_node) {
     NodeRef res = FunctionGetAttr(GetRef<Function>(fn_node), "Primitive");
@@ -26,48 +699,74 @@ class SimpleFuser : public ExprMutator {
       return ExprMutator::VisitExpr_(fn_node);
     }
   }
-
+  // Transform calls.
   Expr VisitExpr_(const CallNode* call) {
     if (call->op.as<OpNode>()) {
-      // Placeholder fusion algorithm which abstracts
-      // single definitions into functions only.
-      Array<Var> params;
-      Array<Expr> inner_args;
-      Array<Expr> args;
-
-      int param_number = 0;
+      // If it is a primitive op call
+      // then we must have a group assignment for it already.
+      CHECK(gmap_.count(call));
+      auto* ret_group = gmap_.at(call)->FindRoot();
+      Array<Expr> new_args;
       for (auto arg : call->args) {
-        std::ostringstream os;
-        os << "p" << param_number++;
         auto type = arg->checked_type();
-        auto var = VarNode::make(os.str(), type);
-        params.push_back(var);
-        inner_args.push_back(var);
-        args.push_back(this->Mutate(arg));
+        CHECK(gmap_.count(arg.get()))
+            << "cannot find group of " << arg;
+        auto* arg_group = gmap_.at(arg.get())->FindRoot();
+        Expr new_arg = this->Mutate(arg);
+
+        if (ret_group != arg_group) {
+          Var param = ginfo_[ret_group].GetOrAllocParam(new_arg, type);
+          new_args.push_back(param);
+        } else {
+          new_args.push_back(new_arg);
+        }
+      }
+      auto new_call = CallNode::make(
+          call->op, new_args, call->attrs, call->type_args);
+
+      if (ret_group->root_ref == call) {
+        // This is the root of the group
+        // create the new call node.
+        const GroupInfo& ginfo = ginfo_[ret_group];
+        auto func = FunctionNode::make(
+            ginfo.params, new_call, call->checked_type(), {});
+        func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
+        return CallNode::make(func, ginfo.arguments, Attrs());
+      } else {
+        // This is an intermediate node of a fused function
+        // simply return the new call.
+        return new_call;
       }
-      auto body = CallNode::make(call->op, inner_args, call->attrs);
-      auto func = FunctionNode::make(
-          params, body, call->checked_type(), {});
-      func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
-      return CallNode::make(func, args, Attrs());
     } else {
       return ExprMutator::VisitExpr_(call);
     }
   }
+  // Debug function, dump the group assignment in text.
+  void DebugDumpGroup(const Expr& body) {
+    std::string text = RelayPrint(body, [this](const Expr& expr) -> std::string {
+        auto it = gmap_.find(expr.get());
+        if (it == gmap_.end()) return "";
+        std::ostringstream os;
+        auto *group = it->second->FindRoot();
+        os << "group=" << group;
+        return os.str();
+      });
+    LOG(INFO) << "Dump of group info:\n" << text;
+  }
 };
 
 
-Expr FuseOps(const Expr& expr) {
+Expr FuseOps(const Expr& expr, int fuse_opt_level) {
   // First we convert all chains of fusable ops into
   // abstracted functions which we mark as primtive
   // then we convert these primtive functions into
   // new operators.
-  return SimpleFuser().Mutate(expr);
+  return FuseMutator().Transform(expr, fuse_opt_level);
 }
 
 TVM_REGISTER_API("relay._ir_pass.FuseOps")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = FuseOps(args[0]);
+    *ret = FuseOps(args[0], args[1]);
 });
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/pass_util.h b/src/relay/pass/pass_util.h
new file mode 100644
index 000000000000..bf52297e8930
--- /dev/null
+++ b/src/relay/pass/pass_util.h
@@ -0,0 +1,27 @@
+/*!
+ *  Copyright (c) 2018 by Contributors.
+ *
+ * \file tvm/relay/pass/pass_util.h
+ * \brief Utilities for writing
+ */
+#ifndef TVM_RELAY_PASS_PASS_UTIL_H_
+#define TVM_RELAY_PASS_PASS_UTIL_H_
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/attrs/transform.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Get reference counter of each internal ExprNode in body.
+ * \param body The body expression.
+ * \return The reference count mapping.
+ */
+std::unordered_map<const Node*, size_t>
+GetExprRefCount(const Expr& body);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_PASS_UTIL_H_
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index b224a099aee1..5cabfbdabc49 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -442,6 +442,9 @@ class TypeInferencer::Resolver : public ExprMutator {
     VarNode* new_var =(
         std::is_base_of<VarNode, T>::value ?
         static_cast<VarNode*>(new_e.node_.get()) : nullptr);
+    FunctionNode* new_fn =(
+        std::is_base_of<FunctionNode, T>::value ?
+        static_cast<FunctionNode*>(new_e.node_.get()) : nullptr);
 
     // check if we need update the new_e
     bool need_update_type = !checked_type.same_as(new_e->checked_type_);
@@ -454,7 +457,17 @@ class TypeInferencer::Resolver : public ExprMutator {
         update_missing_type_annotation_ &&
         !new_var->type_annotation.defined());
 
-    if (!need_update_type && !need_update_var && !need_update_call) return new_e;
+    bool need_update_fn = (
+        std::is_base_of<FunctionNode, T>::value &&
+        update_missing_type_annotation_ &&
+        !new_fn->ret_type.defined());
+
+    if (!need_update_type &&
+        !need_update_var &&
+        !need_update_call &&
+        !need_update_fn) {
+      return new_e;
+    }
 
     if (!new_e.node_.unique()) {
       // Copy on write optimization
@@ -467,6 +480,9 @@ class TypeInferencer::Resolver : public ExprMutator {
       new_var = (
           std::is_base_of<VarNode, T>::value ?
           static_cast<VarNode*>(new_e.node_.get()) : nullptr);
+      new_fn = (
+          std::is_base_of<FunctionNode, T>::value ?
+          static_cast<FunctionNode*>(new_e.node_.get()) : nullptr);
     }
 
     // attach the information.
@@ -483,6 +499,11 @@ class TypeInferencer::Resolver : public ExprMutator {
     if (need_update_var) {
       new_var->type_annotation = checked_type;
     }
+    if (need_update_fn) {
+      auto* fn_type = checked_type.as<FuncTypeNode>();
+      CHECK(fn_type != nullptr);
+      new_fn->ret_type = fn_type->ret_type;
+    }
     return new_e;
   }
 
diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index 3ca161d23f72..e1efcbbdd0b9 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -85,18 +85,18 @@ Type TypeSolver::Unify(const Type& dst, const Type& src) {
 void TypeSolver::AddConstraint(const TypeConstraint& constraint) {
   if (auto *op = constraint.as<TypeRelationNode>()) {
     // create a new relation node.
-    RelationNode* rnode = make<RelationNode>();
+    RelationNode* rnode = arena_.make<RelationNode>();
     rnode->rel = GetRef<TypeRelation>(op);
     rel_nodes_.push_back(rnode);
     // populate the type information.
     for (size_t i = 0; i < op->args.size(); ++i) {
       // insert link to the type list
-      LinkNode<TypeNode*>* tlink = make<LinkNode<TypeNode*> >();
+      LinkNode<TypeNode*>* tlink = arena_.make<LinkNode<TypeNode*> >();
       TypeNode* tnode = GetTypeNode(op->args[i]);
       tlink->value = tnode;
       rnode->type_list.Push(tlink);
       // insert type->relation node
-      LinkNode<RelationNode*>* rlink = make<LinkNode<RelationNode*> >();
+      LinkNode<RelationNode*>* rlink = arena_.make<LinkNode<RelationNode*> >();
       rlink->value = rnode;
       tnode->rel_list.Push(rlink);
     }
diff --git a/src/relay/pass/type_solver.h b/src/relay/pass/type_solver.h
index 30f82f980a75..2f311c9b9810 100644
--- a/src/relay/pass/type_solver.h
+++ b/src/relay/pass/type_solver.h
@@ -16,6 +16,8 @@
 namespace tvm {
 namespace relay {
 
+using common::LinkNode;
+using common::LinkedList;
 /*!
  * \brief Interface of type solver used in type inference.
  *
@@ -69,41 +71,6 @@ class TypeSolver {
   // Internally the solver maintains a bipartite graph of Relation and Types.
   // All the object in the structure is managed by a arena allocator
   // which releases the memory upon distruction of the type solver.
-  /*!
-   * \brief Link list node
-   * \tparam T the content data type
-   */
-  template<typename T>
-  struct LinkNode {
-    /*! \brief The content value */
-    T value;
-    /*! \brief pointer to the next location */
-    LinkNode<T>* next{nullptr};
-  };
-  /*!
-   * \brief LinkedList structure
-   * \tparam T the content data type
-   */
-  template<typename T>
-  struct LinkedList {
-    /*! \brief Head pointer */
-    LinkNode<T>* head{nullptr};
-    /*! \brief Tail pointer */
-    LinkNode<T>* tail{nullptr};
-    /*!
-     * \brief Push a new node to the end of the linked list.
-     * \param node The node to be pushed.
-     */
-    void Push(LinkNode<T>* node) {
-      node->next = nullptr;
-      if (this->tail != nullptr) {
-        this->tail->next = node;
-        this->tail = node;
-      } else {
-        head = tail = node;
-      }
-    }
-  };
   /*!
    * \brief type node struct
    *  TypeNode implements a union-find data structure(via parent)
@@ -164,18 +131,6 @@ class TypeSolver {
   common::Arena arena_;
   /*! \brief Reporter that reports back to self */
   TypeReporter reporter_;
-  /*!
-   * \brief Create function to create a new node ptr via arena
-   * \tparam The type parameter
-   * \return The node pointer.
-   */
-  template<typename T>
-  T* make() {
-    T* ptr = arena_.Alloc<T>();
-    // call constructor
-    new (ptr) T();
-    return ptr;
-  }
   /*!
    * \brief GetTypeNode that is corresponds to t.
    * if it do not exist, create a new one.
@@ -186,7 +141,7 @@ class TypeSolver {
     if (it != tmap_.end()) {
       return it->second->FindRoot();
     } else {
-      TypeNode* n = make<TypeNode>();
+      TypeNode* n = arena_.make<TypeNode>();
       type_nodes_.push_back(n);
       n->resolved_type = t;
       tmap_[t] = n;
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index 51ef0377868f..ebc4e6fc16e6 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -129,5 +129,23 @@ TVM_REGISTER_API("relay._ir_pass.free_type_vars")
     }
   });
 
+/*!
+ * \brief Get reference counter of each internal ExprNode in body.
+ * \param body The body expression.
+ * \return The reference count mapping.
+ */
+std::unordered_map<const Node*, size_t>
+GetExprRefCount(const Expr& body) {
+  class ExprRefCounter : private ExprVisitor {
+   public:
+    std::unordered_map<const Node*, size_t>
+    Get(const Expr& body) {
+      this->VisitExpr(body);
+      return std::move(this->visit_counter_);
+    }
+  };
+  return ExprRefCounter().Get(body);
+}
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index dd790a6d7d87..d12804d512f0 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -33,6 +33,7 @@ def test_env():
     text = env.astext()
     assert "def @myf" in text
     assert "%1 = add(%0, %0) # ty=float32" in text
+    show(env.astext(annotate=lambda x: str(x.checked_type.dtype)))
     show(text)
 
 
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index 1b57bdce0e0c..a5a7a05a974c 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -46,6 +46,8 @@ def check(shape, channels):
         weight = relay.var("weight", type_dict["weight"])
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
         y1_expected = expected(x, weight, in_bias, in_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
         assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
 
     check((2, 4, 10, 10), 2)
@@ -113,6 +115,8 @@ def check(shape, channels):
         type_dict = {x.name_hint:x.checked_type for x in y1.params}
         weight = relay.var("weight", type_dict["weight"])
         y1_expected = expected(x, weight, in_bias, in_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
         assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
 
     check((2, 4, 10, 3), 3)
@@ -194,6 +198,8 @@ def check(shape, channels):
         weight = relay.var("weight", type_dict["weight"])
         y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
         y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
         assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
 
     check((2, 4, 10, 10), 8)
@@ -255,6 +261,8 @@ def check(shape, channels):
         weight = relay.var("weight", type_dict["weight"])
         y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
         y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
         assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
 
     check((2, 4, 10, 10), 8)
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index 2bbc1dce9693..19bec20ac4af 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -3,15 +3,103 @@
 
 def test_fuse_simple():
     """Simple testcase."""
-    x = relay.var("x", shape=(10, 20))
-    y = relay.add(x, x)
-    z = relay.exp(y)
+    def before():
+        x = relay.var("x", shape=(10, 20))
+        y = relay.add(x, relay.const(1, "float32"))
+        z = relay.exp(y)
+        return relay.Function([x], z)
+
+    def expected():
+        x = relay.var("p", shape=(10, 20))
+        y = relay.add(x, relay.const(1, "float32"))
+        z = relay.exp(y)
+        f1 = relay.Function([x], z)
+        x = relay.var("x", shape=(10, 20))
+        y = relay.Call(f1, [x])
+        return relay.Function([x], y)
+
+    z = before()
     z = relay.ir_pass.infer_type(z)
-    zz = relay.ir_pass.fuse_ops(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
     zz = relay.ir_pass.fuse_ops(zz)
     zz = relay.ir_pass.infer_type(zz)
-    zz.astext()
+    after = relay.ir_pass.infer_type(expected())
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+
+def test_conv2d_fuse():
+    """Test fusion case of conv2d"""
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"),
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            channels=16)
+        # this is the next dominator.
+        y1 = relay.add(relay.const(1, "float32"), y)
+        y = relay.add(y, y1)
+        # second path
+        z2 = relay.nn.conv2d(y, relay.var("w2"),
+                             kernel_size=(1, 1),
+                             padding=(0,0),
+                             channels=16)
+        z3 = relay.nn.conv2d(y, relay.var("w3"),
+                             kernel_size=(3, 3),
+                             padding=(1,1),
+                             channels=16)
+        # add can only be fused to z1
+        z = relay.add(z2, z3)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def expected(dshape):
+        # segment 1
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        y = relay.nn.conv2d(x, w,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            channels=16)
+        y1 = relay.add(relay.const(1, "float32"), y)
+        y = relay.add(y, y1)
+        f1 = relay.Function([x, w], y)
+        # segment 2
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        z2 = relay.nn.conv2d(x, w,
+                             kernel_size=(3, 3),
+                             padding=(1,1),
+                             channels=16)
+        f2 = relay.Function([x, w], z2)
+        # segment 3
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        offset = relay.var("p2", shape=dshape)
+        z3 = relay.nn.conv2d(x, w,
+                             kernel_size=(1, 1),
+                             padding=(0, 0),
+                             channels=16)
+        z3 = relay.add(z3, offset)
+        f3 = relay.Function([x, w, offset], z3)
+        # compose
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f1, [x, relay.var("w1")])
+        z2 = relay.Call(f2, [y, relay.var("w3")])
+        z3 = relay.Call(f3, [y, relay.var("w2"), z2])
+        z = z3
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
 
 
 if __name__ == "__main__":
     test_fuse_simple()
+    test_conv2d_fuse()

From 0d1ba8c6f6f02511d9f352191f418c0baad09854 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 13 Nov 2018 13:32:38 -0800
Subject: [PATCH 350/529] [RELAY][OP] strided_slice (#2094)

---
 docs/langref/relay_op.rst                     |   2 +
 include/tvm/relay/attrs/transform.h           |  15 ++
 nnvm/src/top/tensor/transform.cc              |  30 +++-
 python/tvm/_ffi/node_generic.py               |   2 +
 python/tvm/relay/op/__init__.py               |   1 +
 python/tvm/relay/op/_transform.py             |   8 +
 python/tvm/relay/op/transform.py              |  27 +++
 src/api/api_lang.cc                           |   6 +-
 src/relay/ir/text_printer.cc                  |   6 +-
 src/relay/op/tensor/transform.cc              | 168 ++++++++++++++++++
 tests/python/relay/test_op_level4.py          |  38 +++-
 topi/include/topi/transform.h                 |  55 ++++--
 topi/python/topi/testing/__init__.py          |   1 +
 .../topi/testing/strided_slice_python.py      |  32 ++++
 topi/tests/python/test_topi_transform.py      |  17 +-
 15 files changed, 371 insertions(+), 37 deletions(-)
 create mode 100644 python/tvm/relay/op/_transform.py
 create mode 100644 topi/python/topi/testing/strided_slice_python.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 405f071e3283..e99ac3c97f73 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -123,6 +123,7 @@ This level enables additional math and transform operators.
    tvm.relay.min
    tvm.relay.mean
    tvm.relay.prod
+   tvm.relay.strided_slice
 
 
 **Level 5: Vision/Image Operators**
@@ -227,6 +228,7 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.min
 .. autofunction:: tvm.relay.mean
 .. autofunction:: tvm.relay.prod
+.. autofunction:: tvm.relay.strided_slice
 
 
 
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index cb87d358e966..4d2008628d3a 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -123,6 +123,21 @@ struct SplitAttrs : public tvm::AttrsNode<SplitAttrs> {
   }
 };
 
+/*! \brief Attributes for StridedSlice operator */
+struct StridedSliceAttrs : public tvm::AttrsNode<StridedSliceAttrs> {
+  Array<Integer> begin;
+  Array<Integer> end;
+  Array<Integer> strides;
+
+  TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") {
+    TVM_ATTR_FIELD(begin)
+        .describe("Indices for begin of slice, begin index is also inclusive");
+    TVM_ATTR_FIELD(end)
+        .describe("Indices for end of slice, end index is also inclusive");
+    TVM_ATTR_FIELD(strides).set_default(Array<Integer>({}))
+        .describe("Stride values of the slice");
+  }
+};
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 4d08bf761326..2f42727d6083 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -980,23 +980,25 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const StridedSliceParam& param = nnvm::get<StridedSliceParam>(attrs.parsed);
-    Array<Expr> begin;
-    Array<Expr> end;
-    Array<Expr> stride;
+    Array<Integer> begin;
+    Array<Integer> end;
+    Array<Integer> stride;
 
     for (int64_t i : param.begin) {
-        begin.push_back(tvm::make_const(tvm::Int(32), i));
+      begin.push_back(static_cast<int>(i));
     }
 
     for (int64_t i : param.end) {
-        end.push_back(tvm::make_const(tvm::Int(32), i));
+      end.push_back(static_cast<int>(i));
     }
 
     for (int64_t i : param.stride) {
-        stride.push_back(tvm::make_const(tvm::Int(32), i));
+      stride.push_back(static_cast<int>(i));
     }
 
-    return Array<Tensor>{ topi::strided_slice(inputs[0], begin, end, stride) };
+    return Array<Tensor>{
+      topi::strided_slice(inputs[0], begin, end, stride)
+    };
 })
 .set_support_level(1);
 
@@ -1210,6 +1212,15 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+// Adapter function to make int array.
+Array<Integer> GetIntArray(Array<Expr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+        << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
 NNVM_REGISTER_OP(slice_like)
 .describe(R"code(Slice the first input respect to the second input.
 )code" NNVM_ADD_FILELINE)
@@ -1261,7 +1272,10 @@ NNVM_REGISTER_OP(slice_like)
       }
     }
     return Array<Tensor>{
-      topi::strided_slice(inputs[0], begin_idx, end_idx, strides)
+      topi::strided_slice(inputs[0],
+                          GetIntArray(begin_idx),
+                          GetIntArray(end_idx),
+                          GetIntArray(strides))
     };
 })
 .set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
diff --git a/python/tvm/_ffi/node_generic.py b/python/tvm/_ffi/node_generic.py
index b7230f29da59..e86453499faa 100644
--- a/python/tvm/_ffi/node_generic.py
+++ b/python/tvm/_ffi/node_generic.py
@@ -56,6 +56,8 @@ def convert_to_node(value):
         return _api_internal._Map(*vlist)
     elif isinstance(value, NodeGeneric):
         return value.asnode()
+    elif value is None:
+        return None
     else:
         raise ValueError("don't know how to convert type %s to node" % type(value))
 
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 9b581486608b..30aef433d7c6 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -13,6 +13,7 @@
 
 # operator registry
 from . import _tensor
+from . import _transform
 from ..expr import Expr
 from ..base import register_relay_node
 
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
new file mode 100644
index 000000000000..7867336d033f
--- /dev/null
+++ b/python/tvm/relay/op/_transform.py
@@ -0,0 +1,8 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+from . import op as _reg
+from .op import schedule_injective
+
+# strided_slice
+_reg.register_schedule("strided_slice", schedule_injective)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 909b175f08ca..e43a4a573e54 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -334,3 +334,30 @@ def split(data, indices_or_sections, axis=0):
     else:
         ret_size = len(indices_or_sections) + 1
     return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size)
+
+
+def strided_slice(data, begin, end, strides=None):
+    """Strided slice of an array..
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source array to be sliced.
+
+    begin: list of int
+        The indices to begin with in the slicing.
+
+    end: list of int
+        Indicies indicating end of the slice.
+
+    strides: list of int, optional
+        Specifies the stride values, it can be negative in that case,
+        the input tensor will be reversed in that particular axis.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    strides = strides or []
+    return _make.strided_slice(data, list(begin), list(end), list(strides))
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc
index 75365da5bf50..3525e23b8b20 100644
--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -47,7 +47,11 @@ TVM_REGISTER_API("_Array")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     std::vector<NodePtr<Node> > data;
     for (int i = 0; i < args.size(); ++i) {
-      data.push_back(args[i].node_sptr());
+      if (args[i].type_code() != kNull) {
+        data.push_back(args[i].node_sptr());
+      } else {
+        data.push_back(NodePtr<Node>(nullptr));
+      }
     }
     auto node = make_node<ArrayNode>();
     node->data = std::move(data);
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 93ed76bed3c2..bfc5f0db52b7 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -403,7 +403,11 @@ class TextPrinter :
    * \param os The output type.
    */
   void PrintAttr(const NodeRef& value, std::ostream& os) {  // NOLINT(*)
-    this->VisitAttr(value, os);
+    if (value.defined()) {
+      this->VisitAttr(value, os);
+    } else {
+      os << "None";
+    }
   }
   //------------------------------------
   // Overload of Attr printing functions
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 20e0e3adbfd3..98ac1c30b66c 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -7,6 +7,7 @@
 #include <tvm/relay/attrs/transform.h>
 #include <tvm/ir_operator.h>
 #include <tvm/ir.h>
+#include <topi/transform.h>
 #include <vector>
 #include "../op_common.h"
 
@@ -890,6 +891,173 @@ RELAY_REGISTER_OP("broadcast_to_like")
 .set_support_level(10)
 .add_type_rel("BroadCastToLike", BroadCastToLikeRel);
 
+
+// strided_slice
+TVM_REGISTER_NODE_TYPE(StridedSliceAttrs);
+bool StridedSliceRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const StridedSliceAttrs *param = attrs.as<StridedSliceAttrs>();
+  CHECK(param != nullptr);
+
+  auto dshape = data->shape;
+  auto num_axis = dshape.size();
+
+  std::vector<int64_t> stride_vec;
+  for (Integer i : param->strides) {
+    CHECK(i.defined());
+    stride_vec.push_back(i->value);
+  }
+  for (size_t i = stride_vec.size(); i < num_axis; ++i) {
+    stride_vec.push_back(1);
+  }
+  const int64_t max_range = std::numeric_limits<int64_t>::max();
+
+  std::vector<int64_t> begin_vec;
+  for (size_t i = 0; i < param->begin.size(); ++i) {
+    if (!param->begin[i].defined()) {
+      // value=None
+      begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+    } else {
+      begin_vec.push_back(param->begin[i]->value);
+    }
+  }
+  for (size_t i = begin_vec.size(); i < num_axis; ++i) {
+    begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+  }
+
+  std::vector<int64_t> end_vec;
+  for (size_t i = 0; i < param->end.size(); ++i) {
+    // allow end to be None
+    if (!param->end[i].defined()) {
+      end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+    } else {
+      end_vec.push_back(param->end[i]->value);
+    }
+  }
+  for (size_t i = end_vec.size(); i < num_axis; ++i) {
+    end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+  }
+
+  std::vector<IndexExpr> oshape(dshape.size());
+  for (size_t i = 0; i < num_axis; ++i) {
+    int64_t stride_v = stride_vec[i];
+    int64_t begin_v = begin_vec[i];
+    int64_t end_v = end_vec[i];
+
+    if ((stride_v == 1 &&
+         begin_v == 0 &&
+         end_v == max_range) ||
+        (stride_v == -1 &&
+         begin_v == max_range &&
+         end_v == 0)) {
+      // Quick path, do not slice this dimension.
+      oshape[i] = dshape[i];
+      continue;
+    }
+    // Normal path, require the shape to be concrete integer.
+    // Require concrete integer as symbolic inference of min/max
+    // can get complicated and not very helpful.
+    const int64_t* p_dim_size = as_const_int(dshape[i]);
+    CHECK(p_dim_size)
+        << "strided_slice requires sliced dimension to be concrete int";
+    int64_t dim_size = p_dim_size[0];
+    begin_v = (begin_v < 0) ? dim_size + begin_v : begin_v;
+    end_v = (end_v < 0) ? dim_size + end_v : end_v;
+
+    int64_t slice_range, step;
+    if (stride_v < 0) {
+      if (end_v < -1) end_v = -1;
+      CHECK_LT(end_v, begin_v)
+          << "strided_slice get empty slice at axis " << i;
+      begin_v = std::min(dim_size - 1, begin_v);
+      slice_range = begin_v - end_v;
+      step = -stride_v;
+    } else {
+      if (begin_v < 0) begin_v = 0;
+      CHECK_GE(stride_v, 0);
+      CHECK_LT(begin_v, end_v)
+          << "strided_slice get empty slice at axis " << i;
+      end_v = std::min(dim_size, end_v);
+      slice_range = end_v - begin_v;
+      step = stride_v;
+    }
+    oshape[i] = make_const(dshape[i].type(), (slice_range + step - 1) / step);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create StridedSlice operator used by frontend FFI.
+Expr MakeStridedSlice(Expr data,
+                      Array<Integer> begin,
+                      Array<Integer> end,
+                      Array<Integer> strides) {
+  auto attrs = make_node<StridedSliceAttrs>();
+  attrs->begin = std::move(begin);
+  attrs->end = std::move(end);
+  attrs->strides = std::move(strides);
+  static const Op& op = Op::Get("strided_slice");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+Array<Tensor> StridedSliceCompute(const Attrs& attrs,
+                                  const Array<Tensor>& inputs,
+                                  const Type& out_type,
+                                  const Target& target) {
+  const StridedSliceAttrs *param = attrs.as<StridedSliceAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{
+    topi::strided_slice(inputs[0], param->begin, param->end, param->strides)
+  };
+}
+
+
+TVM_REGISTER_API("relay.op._make.strided_slice")
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
+      runtime::detail::unpack_call<Expr, 4>(MakeStridedSlice, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("strided_slice")
+    .describe(R"code(Strided slice of an array.
+
+Examples::
+
+  x = [[  1.,   4.,   7.,  10.],
+       [  2.,   5.,   8.,  11.],
+       [  3.,   6.,   9.,  12.]]
+
+  strided_slice(x, begin=[0, 1], end=[2, 4], stride=[1, 1]) = [[ 4.,  7.,  10.],
+                                                               [ 5.,  8.,  11.]]
+
+  x = [[[ 1.,  2.],
+        [ 3.,  4.]],
+
+       [[ 5.,  6.],
+        [ 7.,  8.]]]
+
+  strided_slice(x, begin=[0, 0], end=[2, 2]) = [[[ 1.,  2.],
+                                                 [ 3.,  4.]],
+
+                                                [[ 5.,  6.],
+                                                 [ 7.,  8.]]]
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(4)
+.set_attrs_type_key("relay.attrs.StridedSliceAttrs")
+.add_type_rel("StridedSlice", StridedSliceRel)
+.set_attr<FTVMCompute>("FTVMCompute", StridedSliceCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
 // Split
 TVM_REGISTER_NODE_TYPE(SplitAttrs);
 
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 6fd70c386567..dd12dc7cff3a 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -2,7 +2,7 @@
 import numpy as np
 from tvm import relay
 from tvm.relay.testing import ctx_list
-
+import topi.testing
 
 def test_binary_op():
     def check_binary_op(opfunc, ref):
@@ -142,7 +142,43 @@ def test_reduce_functions():
         verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128))
         verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1))
 
+
+def test_strided_slice():
+    def verify(dshape, begin, end, strides, output, test_ref=True):
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.strided_slice(x, begin=begin, end=end, strides=strides)
+        func = relay.Function([x], z)
+        func = relay.ir_pass.infer_type(func)
+        text = func.astext()
+        assert "begin=" in text
+        assert "end=" in text
+        if output:
+            assert func.body.checked_type == relay.ty.TensorType(output, "float32")
+        if not test_ref:
+            return
+        x_data = np.random.uniform(size=dshape).astype("float32")
+        ref_res = topi.testing.strided_slice_python(
+            x_data, begin, end, strides)
+        for target, ctx in ctx_list():
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False)
+    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
+    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3))
+    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
+    verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2))
+    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
+
+
 if __name__ == "__main__":
+    test_strided_slice()
     test_binary_op()
     test_cmp_type()
     test_binary_int_broadcast()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 7fc408c2c79c..cb09f1cb419e 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <iterator>
 #include <algorithm>
+#include <limits>
 
 #include "topi/tags.h"
 #include "topi/detail/ravel_unravel.h"
@@ -403,31 +404,51 @@ inline Array<Tensor> split(const Tensor& x,
 * \return A Tensor whose op member is the split operation
 */
 inline Tensor strided_slice(const Tensor& x,
-                            const Array<Expr>& begin,
-                            const Array<Expr>& end,
-                            const Array<Expr>& strides,
+                            const Array<Integer>& begin,
+                            const Array<Integer>& end,
+                            const Array<Integer>& strides,
                             std::string name = "tensor",
                             std::string tag = kInjective) {
   size_t src_tensor_dim = static_cast<size_t>(x->shape.size());
-  std::vector<int64_t> begin_vec = GetConstInt64Values(begin, "begin");
-  std::vector<int64_t> end_vec = GetConstInt64Values(end, "end");
-  std::vector<int64_t> stride_vec = GetConstInt64Values(strides, "strides");
-  // in case user has not provided begin indices for all the axes,
-  // then inflate it with default value = 0
-  for (size_t i = begin_vec.size(); i < src_tensor_dim; ++i) {
-    begin_vec.push_back(0);
-  }
-  // in case user has not provided end indices for all the axes,
-  // then inflate it with default value = input_tensor.shape[axis]
-  for (size_t i = end_vec.size(); i < src_tensor_dim; ++i) {
-    end_vec.push_back(GetConstInt(x->shape[i]));
+  // Setup the ranges.
+  // NOTE: this code duplicates the shape inference logic relay.op
+  // Consider to refactor in the future.
+  std::vector<int64_t> stride_vec;
+  for (Integer i : strides) {
+    CHECK(i.defined());
+    stride_vec.push_back(i->value);
   }
-  // in case user has not provided stride values,
-  // then inflate it with default value = 1
   for (size_t i = stride_vec.size(); i < src_tensor_dim; ++i) {
     stride_vec.push_back(1);
   }
+  const int64_t max_range = std::numeric_limits<int64_t>::max();
+
+  std::vector<int64_t> begin_vec;
+  for (size_t i = 0; i < begin.size(); ++i) {
+    if (!begin[i].defined()) {
+      // value=None
+      begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+    } else {
+      begin_vec.push_back(begin[i]->value);
+    }
+  }
+  for (size_t i = begin_vec.size(); i < src_tensor_dim; ++i) {
+    begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+  }
 
+  std::vector<int64_t> end_vec;
+  for (size_t i = 0; i < end.size(); ++i) {
+    // allow end to be None
+    if (!end[i].defined()) {
+      end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+    } else {
+      end_vec.push_back(end[i]->value);
+    }
+  }
+  for (size_t i = end_vec.size(); i < src_tensor_dim; ++i) {
+    end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+  }
+  // Compute
   Array<Expr> out_shape;
   Array<Expr> begin_expr;
   Array<Expr> strides_expr;
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index 8a3269ba83ae..c496e08c1835 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -19,3 +19,4 @@
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
 from .gather_nd_python import gather_nd_python
+from .strided_slice_python import strided_slice_python
diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py
new file mode 100644
index 000000000000..4407b3bec1c7
--- /dev/null
+++ b/topi/python/topi/testing/strided_slice_python.py
@@ -0,0 +1,32 @@
+"""gather_nd in python"""
+
+def strided_slice_python(data, begin, end, strides):
+    """Python version of strided slice operator.
+
+    Parameters
+    ----------
+    data : numpy.ndarray
+        Input data
+
+    begin : list
+        Begining of the slices.
+
+    end : list
+        End of the slices.
+
+    strides : list
+        The stride of each slice.
+
+    Returns
+    -------
+    result : numpy.ndarray
+        The sliced result.
+    """
+    strides = [] if strides is None else strides
+    slices = []
+    for i in range(len(data.shape)):
+        slices.append(slice(
+            begin[i] if i < len(begin) else None,
+            end[i] if i < len(end) else None,
+            strides[i] if i < len(strides) else None))
+    return data[tuple(slices)]
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 75e4d3b675b0..dc3c3fb70b24 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -249,13 +249,11 @@ def check_device(device):
     for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
-def verify_strided_slice(in_shape, begin, end, stride=None):
-    stride = stride if stride else [1, 1, 1]
+def verify_strided_slice(in_shape, begin, end, strides=None):
     A = tvm.placeholder(shape=in_shape, name="A")
-    B = topi.strided_slice(A, begin, end, stride) + 1
-    def test_forward(x, begin, end, stride):
-        return x[begin[0]:end[0]:stride[0],
-                    begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
+    strides = [1,1,1] if strides is None else strides
+    B = topi.strided_slice(A, begin, end, strides) + 1
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -267,7 +265,8 @@ def check_device(device):
 
         foo = tvm.build(s, [A, B], device, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = test_forward(x_np, begin, end, stride)
+        out_npy = topi.testing.strided_slice_python(
+            x_np, begin, end, strides) + 1
         data_nd = tvm.nd.array(x_np, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
         foo(data_nd, out_nd)
@@ -298,7 +297,7 @@ def check_device(device):
             shape_size = shape_size * src_shape[i]
         data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
         out_npys = topi.testing.gather_nd_python(data_npy, indices_src)
-        
+
         data_nd = tvm.nd.array(data_npy, ctx)
         indices_nd = tvm.nd.array(indices_src, ctx)
         out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
@@ -412,6 +411,7 @@ def test_gather_nd():
                          indices_dtype)
 
 if __name__ == "__main__":
+    test_strided_slice()
     test_concatenate()
     test_tranpose()
     test_expand_dims()
@@ -421,5 +421,4 @@ def test_gather_nd():
     test_flip()
     test_expand_like()
     test_take()
-    test_strided_slice()
     test_gather_nd()

From c5482665e8274af5a2f6fedb55ded8aa79031184 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Tue, 13 Nov 2018 14:57:16 -0800
Subject: [PATCH 351/529] [Relay][OP]NMS (#1929)

---
 include/tvm/relay/attrs/vision.h       | 16 +++++++
 python/tvm/relay/op/vision/__init__.py |  1 +
 python/tvm/relay/op/vision/nms.py      | 36 +++++++++++++++
 src/relay/op/vision/multibox_op.cc     |  3 +-
 src/relay/op/vision/nms.cc             | 62 ++++++++++++++++++++++++++
 tests/python/relay/test_op_level5.py   | 31 ++++++++++++-
 6 files changed, 146 insertions(+), 3 deletions(-)
 create mode 100644 python/tvm/relay/op/vision/nms.py
 create mode 100644 src/relay/op/vision/nms.cc

diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
index 60ee4cb88e43..5408582c8356 100644
--- a/include/tvm/relay/attrs/vision.h
+++ b/include/tvm/relay/attrs/vision.h
@@ -40,6 +40,22 @@ struct MultiBoxPriorAttrs : public tvm::AttrsNode<MultiBoxPriorAttrs> {
   }
 };
 
+/*! \brief Attributes used in non_maximum_suppression operators */
+struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{
+  double overlap_threshold;
+  bool force_suppress;
+  int topk;
+
+  TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") {
+      TVM_ATTR_FIELD(overlap_threshold).set_default(0.5)
+        .describe("Non-maximum suppression threshold.");
+      TVM_ATTR_FIELD(force_suppress).set_default(false)
+        .describe("Suppress all detections regardless of class_id.");
+      TVM_ATTR_FIELD(topk).set_default(-1)
+        .describe("Keep maximum top k detections before nms, -1 for no limit.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_VISION_H_
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
index b3010d2d5310..9ecd8a84770a 100644
--- a/python/tvm/relay/op/vision/__init__.py
+++ b/python/tvm/relay/op/vision/__init__.py
@@ -3,3 +3,4 @@
 from __future__ import absolute_import as _abs
 
 from .multibox import *
+from .nms import *
diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py
new file mode 100644
index 000000000000..8035e3030b17
--- /dev/null
+++ b/python/tvm/relay/op/vision/nms.py
@@ -0,0 +1,36 @@
+"""Non-maximum suppression operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def nms(data,
+        valid_count,
+        overlap_threshold=0.5,
+        force_suppress=False,
+        topk=-1):
+    """Non-maximum suppression operator for object detection.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        3-D tensor with shape [batch_size, num_anchors, 6].
+        The last dimension should be in format of
+        [class_id, score, box_left, box_top, box_right, box_bottom].
+
+    valid_count : relay.Expr
+        1-D tensor for valid number of boxes.
+
+    overlap_threshold : float, optional
+        Non-maximum suppression threshold.
+
+    force_suppress : bool, optional
+        Suppress all detections regardless of class_id.
+
+    topk : int, optional
+        Keep maximum top k detections before nms, -1 for no limit.
+
+    Returns
+    -------
+    out : relay.Expr
+        3-D tensor with shape [batch_size, num_anchors, 6].
+    """
+    return _make.nms(data, valid_count, overlap_threshold, force_suppress, topk)
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
index ce069a78186b..e347e544e4f9 100644
--- a/src/relay/op/vision/multibox_op.cc
+++ b/src/relay/op/vision/multibox_op.cc
@@ -5,7 +5,6 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/vision.h>
-#include <vector>
 
 namespace tvm {
 namespace relay {
@@ -66,7 +65,7 @@ RELAY_REGISTER_OP("vision.multibox_prior")
 .set_attrs_type_key("relay.attrs.MultiBoxPriorAttrs")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
-.set_support_level(4)
+.set_support_level(5)
 .add_type_rel("MultiBoxPrior", MultiboxPriorRel);
 
 }  // namespace relay
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc
new file mode 100644
index 000000000000..3e3f73bc6cb4
--- /dev/null
+++ b/src/relay/op/vision/nms.cc
@@ -0,0 +1,62 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nms.cc
+ * \brief Non-maximum suppression operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/vision.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(NMSAttrs);
+
+bool NMSRel(const Array<Type>& types,
+            int num_inputs,
+            const Attrs& attrs,
+            const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* valid_count = types[1].as<TensorTypeNode>();
+  const auto& dshape = data->shape;
+  const auto& vshape = valid_count->shape;
+  CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
+  CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D.";
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeNMS(Expr data,
+             Expr valid_count,
+             double overlap_threshold,
+             bool force_suppress,
+             int topk) {
+  auto attrs = make_node<NMSAttrs>();
+  attrs->overlap_threshold = overlap_threshold;
+  attrs->force_suppress = force_suppress;
+  attrs->topk = topk;
+  static const Op& op = Op::Get("vision.nms");
+  return CallNode::make(op, {data, valid_count}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.vision._make.nms")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 5>(MakeNMS, args, rv);
+});
+
+
+RELAY_REGISTER_OP("vision.nms")
+.describe(R"doc("Non-maximum suppression."
+)doc" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.")
+.set_support_level(5)
+.add_type_rel("NMS", NMSRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 4e554cd0cf81..0bd7a4816a1b 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -18,7 +18,6 @@ def test_resize_infer_type():
     assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
 
 
-
 def test_multibox_prior():
     sizes = (0.3, 1.5, 0.7)
     ratios = (1.3, 2.4)
@@ -44,6 +43,36 @@ def test_multibox_prior():
         (1, h * w, 4), "float32")
 
 
+def test_nms():
+    num_anchors = 60
+
+    overlap_threshold = 0.5
+    force_suppress = True
+    nms_topk = 10
+
+    n = tvm.var("n")
+    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
+    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))
+
+    z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, nms_topk)
+
+    assert "overlap_threshold" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(
+        (n, num_anchors, 6), "float32")
+
+    n = tvm.var("n")
+    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
+    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))
+
+    z = relay.vision.nms(x0, x1)
+
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(
+        (n, num_anchors, 6), "float32")
+
+
 if __name__ == "__main__":
     test_resize_infer_type()
     test_multibox_prior()
+    test_nms()

From 9731dff0657eb4e1d2d624d8389f8dfccdfee1a1 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Tue, 13 Nov 2018 18:49:03 -0800
Subject: [PATCH 352/529] [Jenkinsfile] Build NNPACK and run tests in `ci-cpu`
 (#2095)

---
 Jenkinsfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index f63e7d0f396e..adc9e12ca74b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -131,6 +131,8 @@ stage('Build') {
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
+           echo set\\(USE_NNPACK ON\\) >> config.cmake
+           echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
            """
@@ -140,6 +142,8 @@ stage('Build') {
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_rust.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_unittest.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_integration.sh"
         }
       }
     }

From e3bfedc7770919447beac118e81963ae0bf20a4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Tue, 13 Nov 2018 18:49:43 -0800
Subject: [PATCH 353/529] Fix error in fuse_ops.cc (#2098)

---
 src/relay/pass/fuse_ops.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 2503bd5f53fa..9d19d2e6ca49 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -322,7 +322,7 @@ class DominatorTree {
    *        The combined edge pattern across all the parents.
    * \return The least common acenstor of thw two.
    */
-  static Node* LeastCommonAcenstor(
+  static Node* LeastCommonAncestor(
       Node* lhs,
       Node* rhs,
       OpPatternKind* edge_pattern) {
@@ -338,12 +338,12 @@ class DominatorTree {
             edge_pattern[0], lhs->pattern);
         lhs = lhs->parent;
       } else {
-        lhs = lhs->parent;
-        rhs = rhs->parent;
         edge_pattern[0] = CombinePattern(
             edge_pattern[0], lhs->pattern);
         edge_pattern[0] = CombinePattern(
             edge_pattern[0], rhs->pattern);
+        lhs = lhs->parent;
+        rhs = rhs->parent;
       }
     }
     return lhs;
@@ -374,7 +374,7 @@ DominatorTree DominatorTree::PostDom(common::Arena* arena,
         Node* onode = tree.nodes[oindex];
         CHECK(onode != nullptr);
         if (parent != nullptr) {
-          parent = LeastCommonAcenstor(parent, onode, &pattern);
+          parent = LeastCommonAncestor(parent, onode, &pattern);
         } else {
           parent = onode;
         }

From 80ddfc179d00a9c8c4af0df6d05a559233d6c1ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
 <lolisa@marisa.moe>
Date: Tue, 13 Nov 2018 19:58:28 -0800
Subject: [PATCH 354/529] Update fuse_ops.cc (#2102)

---
 src/relay/pass/fuse_ops.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 9d19d2e6ca49..54dcdf1e8fc6 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -320,7 +320,7 @@ class DominatorTree {
    * \param rhs The right node.
    * \param edge_pattern
    *        The combined edge pattern across all the parents.
-   * \return The least common acenstor of thw two.
+   * \return The least common ancestor of thw two.
    */
   static Node* LeastCommonAncestor(
       Node* lhs,
@@ -380,8 +380,7 @@ DominatorTree DominatorTree::PostDom(common::Arena* arena,
         }
         pattern = CombinePattern(pattern, link->value.pattern);
       }
-      CHECK(parent != nullptr);
-      tnode->depth = parent->depth + 1;
+      tnode->depth = parent ? parent->depth + 1 : 1;
       tnode->parent = parent;
       tnode->pattern = pattern;
     }

From 28d4c1c344502c30b26474519a181355417b833b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 14 Nov 2018 08:56:40 -0800
Subject: [PATCH 355/529] [RELAY][PASS] Bind, FoldConstant (#2100)

---
 include/tvm/relay/expr_functor.h              |  11 ++
 include/tvm/relay/op_attr_types.h             |  10 ++
 include/tvm/relay/pass.h                      |  16 ++
 python/tvm/relay/__init__.py                  |   2 +-
 .../relay/backend/graph_runtime_codegen.py    |  14 +-
 python/tvm/relay/build_module.py              |  56 ++++--
 python/tvm/relay/expr.py                      |  22 +++
 python/tvm/relay/ir_pass.py                   |  16 ++
 src/relay/ir/expr_functor.cc                  |  71 +++++++-
 src/relay/ir/op.cc                            |   2 -
 src/relay/ir/type_functor.cc                  | 159 ++++++++++++++++++
 src/relay/ir/type_functor.h                   | 126 +++-----------
 src/relay/op/op_common.h                      |   3 +-
 src/relay/pass/fold_constant.cc               | 120 +++++++++++++
 src/relay/pass/pass_util.h                    |  17 ++
 src/relay/pass/type_infer.cc                  |   4 +-
 src/relay/pass/type_subst.cc                  |  39 -----
 src/relay/pass/type_subst.h                   |  19 ---
 src/relay/pass/util.cc                        |   1 -
 .../relay/test_backend_graph_runtime.py       |  23 ++-
 tests/python/relay/test_ir_bind.py            |  23 +++
 tests/python/relay/test_pass_fold_constant.py |  75 +++++++++
 22 files changed, 648 insertions(+), 181 deletions(-)
 create mode 100644 src/relay/ir/type_functor.cc
 create mode 100644 src/relay/pass/fold_constant.cc
 delete mode 100644 src/relay/pass/type_subst.cc
 delete mode 100644 src/relay/pass/type_subst.h
 create mode 100644 tests/python/relay/test_ir_bind.py
 create mode 100644 tests/python/relay/test_pass_fold_constant.py

diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 85a6b502d845..1681f9b87d2f 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -182,6 +182,17 @@ class ExprMutator
   std::unordered_map<Expr, Expr, NodeHash, NodeEqual> memo_;
 };
 
+/*
+ * \brief Bind function parameters or free variables.
+ *
+ * Parameter binding can only happen if expr is a Function.
+ * binds cannot change internal arguments of internal functions.
+ *
+ * \param expr The function to be binded.
+ * \param binds The map of arguments to
+ */
+Expr Bind(const Expr& expr, const tvm::Map<Var, Expr>& binds);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_FUNCTOR_H_
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index 2c9fa2808f85..f80d51772ae2 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -38,6 +38,16 @@ enum OpPatternKind {
 /*! \brief the operator pattern */
 using TOpPattern = int;
 
+/*!
+ * \brief Whether operator is stateful or contain internal state.
+ *
+ * All the primitive ops we registered so far are pure.
+ * This attribute is left for potential future compatible reasons.
+ * We can always work around the stateful ops by adding an additional
+ * handle argument and return it.
+ */
+using TOpIsStateful = bool;
+
 /*!
  * \brief Computation description interface.
  *
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 5ff60c7035d3..3ca81ebd027d 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -143,6 +143,22 @@ tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
  */
 Expr DeadCodeElimination(const Expr& e);
 
+/*!
+ * \brief Fold constant expressions.
+ * \param expr the expression to be optimized.
+ * \return The optimized expression.
+ */
+Expr FoldConstant(const Expr& expr);
+
+/*!
+ * \brief Fuse operations into expr into seperate functions.
+ * \param expr The expression.
+ * \param fuse_opt_level Optimization level.
+ * \return The optimized expression.
+ */
+Expr FuseOps(const Expr& expr, int fuse_opt_level);
+
+
 /*! \brief A hashing structure in the style of std::hash. */
 struct StructuralHash {
   /*! \brief Hash a Relay type.
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 19f3a55d491a..92e1e72fdac2 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -54,7 +54,7 @@
 # helper functions
 var = expr.var
 const = expr.const
-
+bind = expr.bind
 
 # pylint: disable=unused-argument
 @register_func("relay.debug")
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
index 9bd03945c847..4bbab957ab1d 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -102,6 +102,7 @@ def __init__(self, mod, target):
         self.target = target
         self.nodes = []
         self.var_map = {}
+        self.params = {}
         self.compile_engine = compile_engine.get()
         self.lowered_funcs = set()
         self._name_map = {}
@@ -162,8 +163,12 @@ def visit_tuple_getitem(self, op):
         assert isinstance(vtuple, tuple)
         return vtuple[op.index]
 
-    def visit_constant(self, _):
-        raise RuntimeError("constant not supported")
+    def visit_constant(self, op):
+        index = len(self.params)
+        name = "p%d" % index
+        self.params[name] = op.data
+        node = InputNode(name, {})
+        return self.add_node(node, op.checked_type)
 
     def visit_function(self, _):
         raise RuntimeError("function not supported")
@@ -312,6 +317,9 @@ def codegen(self, func):
 
         lowered_funcs : List[tvm.LoweredFunc]
             The lowered functions.
+
+        params : Dict[str, tvm.nd.NDArray]
+            Additional constant parameters.
         """
         # First we convert all the parameters into input nodes.
         for param in func.params:
@@ -324,7 +332,7 @@ def codegen(self, func):
         self.heads = self.visit(func.body)
         graph_json = self._get_json()
         lowered_funcs = list(self.lowered_funcs)
-        return graph_json, lowered_funcs
+        return graph_json, lowered_funcs, self.params
 
     def _get_unique_name(self, name):
         if name not in self._name_map:
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 0f33e86ab5cd..557e4edac681 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -6,6 +6,7 @@
 from .. import nd as _nd, target as _target, autotvm
 from ..contrib import graph_runtime as _graph_rt
 from . import ir_pass
+from . import expr
 from .backend import interpreter as _interpreter
 from .backend import graph_runtime_codegen as _graph_gen
 
@@ -13,6 +14,7 @@
 OPT_PASS_LEVEL = {
     "SimplifyInference": 0,
     "OpFusion": 1,
+    "FoldConstant": 2,
     "FoldScaleAxis": 3,
 }
 
@@ -95,7 +97,27 @@ def build_config(**kwargs):
     return BuildConfig(**kwargs)
 
 
-def optimize(func):
+def _bind_params_by_name(func, params):
+    """Bind parameters of function by its name."""
+    name_dict = {}
+    for arg in func.params:
+        name = arg.name_hint
+        if name in name_dict:
+            name_dict[name] = None
+        else:
+            name_dict[name] = arg
+    bind_dict = {}
+    for k, v in params.items():
+        if k not in name_dict:
+            continue
+        arg = name_dict[k]
+        if arg is None:
+            raise ValueError("Multiple args in the function have name %s" % k)
+        bind_dict[arg] = expr.const(v)
+    return expr.bind(func, bind_dict)
+
+
+def optimize(func, params=None):
     """Perform target invariant optimizations.
 
     Parameters
@@ -103,6 +125,10 @@ def optimize(func):
     func : tvm.relay.Function
         The input to optimization.
 
+    params : Optional[Dict[str, tvm.nd.NDArray]]
+        Input parameters to the graph that do not change
+        during inference time. used for constant folding.
+
     Returns
     -------
     opt_func : tvm.relay.Function
@@ -110,7 +136,11 @@ def optimize(func):
     """
     cfg = BuildConfig.current
 
-    if cfg.pass_enabled("FoldScaleAxis"):
+    # bind expressions
+    if params:
+        func = _bind_params_by_name(func, params)
+
+    if cfg.pass_enabled("SimplifyInference"):
         func = ir_pass.infer_type(func)
         func = ir_pass.simplify_inference(func)
 
@@ -119,6 +149,10 @@ def optimize(func):
         func = ir_pass.backward_fold_scale_axis(func)
         func = ir_pass.infer_type(func)
         func = ir_pass.forward_fold_scale_axis(func)
+
+    if cfg.pass_enabled("FoldConstant"):
+        func = ir_pass.fold_constant(func)
+
     return func
 
 
@@ -147,8 +181,7 @@ def build(func,
 
     params : dict of str to NDArray
         Input parameters to the graph that do not change
-        during inference time. Used for pre-compute
-        folding optimization.
+        during inference time. Used for constant folding.
 
     Returns
     -------
@@ -176,14 +209,14 @@ def build(func,
     cfg = BuildConfig.current
 
     with tophub_context:
-        func = optimize(func)
+        func = optimize(func, params)
         # Fuse ops before running code gen
         func = ir_pass.infer_type(func)
         func = ir_pass.fuse_ops(func, cfg.opt_level)
         # Graph code generation
         func = ir_pass.infer_type(func)
         graph_gen = _graph_gen.GraphRuntimeCodegen(mod=None, target=target)
-        graph_json, lowered_funcs = graph_gen.codegen(func)
+        graph_json, lowered_funcs, params = graph_gen.codegen(func)
         mod = _tvm_build_module(lowered_funcs, target=target, target_host=target_host)
     return graph_json, mod, params
 
@@ -210,21 +243,22 @@ def __init__(self, mod, ctx, target):
         self.target = target
 
     def _make_executor(self, func):
+        graph_json, mod, params = build(func, target=self.target)
+        gmodule = _graph_rt.create(graph_json, mod, self.ctx)
+        if params:
+            gmodule.set_input(*params)
         def _graph_wrapper(*args):
-            graph_json, mod, params = build(func, target=self.target)
-            assert params is None
-            gmodule = _graph_rt.create(graph_json, mod, self.ctx)
             # Create map of inputs.
             for i, arg in enumerate(args):
                 gmodule.set_input(i, arg)
             # Run the module, and fetch the output.
             gmodule.run()
-            return gmodule.get_output(0)
+            # make a copy so multiple invocation won't hurt perf.
+            return gmodule.get_output(0).copyto(_nd.cpu(0))
 
         return _graph_wrapper
 
 
-
 def create_executor(kind="debug",
                     mod=None,
                     ctx=None,
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index f82ea09a102a..d71db0036f20 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -6,6 +6,7 @@
 import numpy as _np
 from .base import RelayNode, register_relay_node
 from . import _make
+from . import _expr
 from . import ty as _ty
 from .._ffi import base as _base
 from .. import nd as _nd
@@ -577,3 +578,24 @@ def const(value, dtype=None):
     if not isinstance(value, _nd.NDArray):
         raise ValueError("value has to be scalar or NDArray")
     return Constant(value)
+
+
+def bind(expr, binds):
+    """Bind an free variables in expr or function arguments.
+
+    We can bind parameters expr if it is a function.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    binds : Union[Map[tvm.relay.Var, tvm.relay.Expr], Map[str, tvm.relay.Expr]]
+        The specific bindings.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The expression or function after binding.
+    """
+    return _expr.Bind(expr, binds)
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index b1a76d6fae6f..9d59980f6127 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -259,6 +259,22 @@ def structural_hash(value):
         raise TypeError(msg)
 
 
+def fold_constant(expr):
+    """Fold the constant expression in expr.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        The transformed expression.
+    """
+    return _ir_pass.FoldConstant(expr)
+
+
 def fuse_ops(expr, opt_level=1):
     """Fuse operators in expr together.
 
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 08f903a26d3e..5e3ee1761c38 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -6,8 +6,8 @@
  * ExprMutator uses memoization and self return in order to amortize
  * the cost of using functional updates.
  */
-
 #include <tvm/relay/expr_functor.h>
+#include "type_functor.h"
 
 namespace tvm {
 namespace relay {
@@ -228,5 +228,74 @@ void ExprVisitor::VisitExpr_(const TupleGetItemNode* op) {
 
 void ExprVisitor::VisitType(const Type& t) { return; }
 
+// Implement bind.
+class ExprBinder : public ExprMutator {
+ public:
+  explicit ExprBinder(const tvm::Map<Var, Expr>& args_map)
+    : args_map_(args_map) {
+  }
+
+  Expr VisitExpr_(const LetNode* op) final {
+    CHECK(!args_map_.count(op->var))
+        << "Cannot bind an internel variable in let";
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  Expr VisitExpr_(const FunctionNode* op) final {
+    for (Var param : op->params) {
+      CHECK(!args_map_.count(param))
+          << "Cannnot bind an internal function parameter";
+    }
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  Expr VisitExpr_(const VarNode* op) final {
+    auto id = GetRef<Var>(op);
+    auto it = args_map_.find(id);
+    if (it != args_map_.end()) {
+      return (*it).second;
+    } else {
+      return id;
+    }
+  }
+
+ private:
+  const tvm::Map<Var, Expr>& args_map_;
+};
+
+Expr Bind(const Expr& expr, const tvm::Map<Var, Expr>& args_map) {
+  if (const FunctionNode* func = expr.as<FunctionNode>()) {
+    Expr new_body = ExprBinder(args_map).Mutate(func->body);
+    Array<Var> new_params;
+    for (Var param : func->params) {
+      if (!args_map.count(param)) {
+        new_params.push_back(param);
+      }
+    }
+    if (new_body.same_as(func->body) &&
+        new_params.size() == func->params.size()) {
+      return expr;
+    }
+    return FunctionNode::make(new_params,
+                              new_body,
+                              func->ret_type,
+                              func->type_params,
+                              func->attrs);
+  } else {
+    return ExprBinder(args_map).Mutate(expr);
+  }
+}
+
+
+TVM_REGISTER_API("relay._expr.Bind")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    NodeRef input = args[0];
+    if (input->derived_from<ExprNode>()) {
+      *ret = Bind(Downcast<Expr>(input), args[1]);
+    } else {
+      CHECK(input->derived_from<TypeNode>());
+      *ret = Bind(Downcast<Type>(input), args[1]);
+    }
+  });
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
index 25651286ed9e..d0ae57bb01e1 100644
--- a/src/relay/ir/op.cc
+++ b/src/relay/ir/op.cc
@@ -11,8 +11,6 @@
 #include <memory>
 #include <mutex>
 
-#include "./../pass/type_subst.h"
-
 namespace dmlc {
 // enable registry
 DMLC_REGISTRY_ENABLE(::tvm::relay::OpRegistry);
diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc
new file mode 100644
index 000000000000..fc0daa3cb9c6
--- /dev/null
+++ b/src/relay/ir/type_functor.cc
@@ -0,0 +1,159 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_functor.cc
+ * \brief Implementations of type functors.
+ */
+#include "type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+void TypeVisitor::VisitType_(const TypeVarNode* op) {
+}
+
+void TypeVisitor::VisitType_(const TensorTypeNode* op) {
+}
+
+void TypeVisitor::VisitType_(const IncompleteTypeNode* op) {
+}
+
+void TypeVisitor::VisitType_(const FuncTypeNode* op) {
+  for (auto type_param : op->type_params) {
+    this->VisitType(type_param);
+  }
+
+  for (auto type_cs : op->type_constraints) {
+    this->VisitType(type_cs);
+  }
+
+  for (auto arg_type : op->arg_types) {
+    this->VisitType(arg_type);
+  }
+  this->VisitType(op->ret_type);
+}
+
+void TypeVisitor::VisitType_(const TupleTypeNode* op) {
+  for (const Type& t : op->fields) {
+    this->VisitType(t);
+  }
+}
+
+void TypeVisitor::VisitType_(const TypeRelationNode* op) {
+  for (const Type& t : op->args) {
+    this->VisitType(t);
+  }
+}
+
+
+// Type Mutator.
+Array<Type> TypeMutator::MutateArray(Array<Type> arr) {
+  // The array will do copy on write
+  // If no changes are made, the original array will be returned.
+  for (size_t i = 0; i < arr.size(); ++i) {
+    Type ty = arr[i];
+    Type new_ty = VisitType(ty);
+    if (!ty.same_as(new_ty)) {
+      arr.Set(i, new_ty);
+    }
+  }
+  return arr;
+}
+
+Type TypeMutator::VisitType_(const TypeVarNode* op) {
+  return GetRef<TypeVar>(op);
+}
+
+Type TypeMutator::VisitType_(const TensorTypeNode* op) {
+  // TODO(tvm-team) recursively visit to replace Var
+  return GetRef<Type>(op);
+}
+
+Type TypeMutator::VisitType_(const IncompleteTypeNode* op) {
+  return GetRef<Type>(op);
+}
+
+Type TypeMutator::VisitType_(const FuncTypeNode* op) {
+  bool changed = false;
+  Array<TypeVar> type_params;
+  for (auto type_param : op->type_params) {
+    auto new_type_param = VisitType(type_param);
+    changed = changed || !new_type_param.same_as(type_param);
+    if (const TypeVarNode* tin = new_type_param.as<TypeVarNode>()) {
+      type_params.push_back(GetRef<TypeVar>(tin));
+    } else {
+      LOG(FATAL) << new_type_param << std::endl;
+    }
+  }
+
+  Array<TypeConstraint> type_constraints;
+  for (auto type_cs : op->type_constraints) {
+    auto new_type_cs = VisitType(type_cs);
+    changed = changed || !new_type_cs.same_as(type_cs);
+    if (const TypeConstraintNode* tin =
+        new_type_cs.as_derived<TypeConstraintNode>()) {
+      type_constraints.push_back(GetRef<TypeConstraint>(tin));
+    } else {
+      LOG(FATAL) << new_type_cs << std::endl;
+    }
+  }
+
+  Array<Type> new_args = MutateArray(op->arg_types);
+  changed = changed || new_args.same_as(op->arg_types);
+
+  Type new_ret_type = VisitType(op->ret_type);
+  changed = changed || new_ret_type.same_as(op->ret_type);
+
+  if (!changed) return GetRef<Type>(op);
+  return FuncTypeNode::make(new_args,
+                            new_ret_type,
+                            type_params,
+                            type_constraints);
+}
+
+Type TypeMutator::VisitType_(const TupleTypeNode* op) {
+  Array<Type> new_fields = MutateArray(op->fields);
+  if (new_fields.same_as(op->fields)) {
+    return GetRef<Type>(op);
+  } else {
+    return TupleTypeNode::make(new_fields);
+  }
+}
+
+Type TypeMutator::VisitType_(const TypeRelationNode* type_rel) {
+  Array<Type> new_args = MutateArray(type_rel->args);
+  if (new_args.same_as(type_rel->args)) {
+    return GetRef<Type>(type_rel);
+  } else {
+    return TypeRelationNode::make(type_rel->func,
+                                  new_args,
+                                  type_rel->num_inputs,
+                                  type_rel->attrs);
+  }
+}
+
+// Implements bind.
+class TypeBinder : public TypeMutator {
+ public:
+  explicit TypeBinder(const tvm::Map<TypeVar, Type>& args_map)
+    : args_map_(args_map) {}
+
+  Type VisitType_(const TypeVarNode* op) override {
+    auto id = GetRef<TypeVar>(op);
+    auto it = args_map_.find(id);
+    if (it != args_map_.end()) {
+      return (*it).second;
+    } else {
+      return id;
+    }
+  }
+
+ private:
+  const tvm::Map<TypeVar, Type>& args_map_;
+};
+
+Type Bind(const Type& type, const tvm::Map<TypeVar, Type>& args_map) {
+  return TypeBinder(args_map).VisitType(type);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type_functor.h b/src/relay/ir/type_functor.h
index f51c8c746eb9..e8dfd2b7cd7c 100644
--- a/src/relay/ir/type_functor.h
+++ b/src/relay/ir/type_functor.h
@@ -91,113 +91,39 @@ class TypeFunctor<R(const Type& n, Args...)> {
 };
 
 /*!
- * \brief A type visitor for vistiors which make use of internal
- * mutable state.
- *
- * We recursively visit each type contained inside the visitor.
+ * \brief A type visitor that recursively visit types.
  */
-class TypeVisitor :
-    public ::tvm::relay::TypeFunctor<void(const Type& n)> {
+class TypeVisitor : public TypeFunctor<void(const Type& n)> {
  public:
-  void VisitType_(const TypeVarNode* op) override {}
-
-  void VisitType_(const FuncTypeNode* op) override {
-    for (auto type_param : op->type_params) {
-      this->VisitType(type_param);
-    }
-
-    for (auto type_cs : op->type_constraints) {
-      this->VisitType(type_cs);
-    }
-
-    for (auto arg_type : op->arg_types) {
-      this->VisitType(arg_type);
-    }
-    this->VisitType(op->ret_type);
-  }
-
-  void VisitType_(const TensorTypeNode* op) override {}
-
-  void VisitType_(const TupleTypeNode* op) override {
-    for (const Type& t : op->fields) {
-      this->VisitType(t);
-    }
-  }
-
-  void VisitType_(const TypeRelationNode* op) override {
-    for (const Type& t : op->args) {
-      this->VisitType(t);
-    }
-  }
-
-  void VisitType_(const IncompleteTypeNode* op) override {}
+  void VisitType_(const TypeVarNode* op) override;
+  void VisitType_(const IncompleteTypeNode* op) override;
+  void VisitType_(const TensorTypeNode* op) override;
+  void VisitType_(const FuncTypeNode* op) override;
+  void VisitType_(const TupleTypeNode* op) override;
+  void VisitType_(const TypeRelationNode* op) override;
 };
 
-// A functional visitor for rebuilding an AST in place.
-struct TypeMutator : TypeFunctor<Type(const Type& n)> {
-  Type VisitType_(const TensorTypeNode* op) override {
-    // TODO(@jroesch): maybe we should recursively visit
-    return TensorTypeNode::make(op->shape, op->dtype);
-  }
-
-  Type VisitType_(const TypeVarNode* op) override {
-    return GetRef<TypeVar>(op);
-  }
-
-  Type VisitType_(const FuncTypeNode* op) override {
-    Array<TypeVar> type_params;
-    for (auto type_param : op->type_params) {
-      auto new_type_param = VisitType(type_param);
-      if (const TypeVarNode* tin = new_type_param.as<TypeVarNode>()) {
-        type_params.push_back(GetRef<TypeVar>(tin));
-      } else {
-        CHECK(false) << new_type_param << std::endl;
-      }
-    }
-
-    Array<TypeConstraint> type_constraints;
-    for (auto type_cs : op->type_constraints) {
-      auto new_type_cs = VisitType(type_cs);
-      if (const TypeConstraintNode* tin =
-          new_type_cs.as_derived<TypeConstraintNode>()) {
-        type_constraints.push_back(GetRef<TypeConstraint>(tin));
-      } else {
-        CHECK(false) << new_type_cs << std::endl;
-      }
-    }
-
-    std::vector<Type> args;
-    for (auto arg_type : op->arg_types) {
-      args.push_back(VisitType(arg_type));
-    }
-
-    return FuncTypeNode::make(tvm::Array<Type>(args), VisitType(op->ret_type),
-                              type_params, type_constraints);
-  }
+// Mutator that transform a type to another one.
+class TypeMutator : public TypeFunctor<Type(const Type& n)> {
+ public:
+  Type VisitType_(const TypeVarNode* op) override;
+  Type VisitType_(const TensorTypeNode* op) override;
+  Type VisitType_(const IncompleteTypeNode* op) override;
+  Type VisitType_(const FuncTypeNode* op) override;
+  Type VisitType_(const TupleTypeNode* op) override;
+  Type VisitType_(const TypeRelationNode* type_rel) override;
 
-  Type VisitType_(const TupleTypeNode* op) override {
-    std::vector<Type> new_fields;
-    for (const Type& t : op->fields) {
-      new_fields.push_back(this->VisitType(t));
-    }
-    return TupleTypeNode::make(new_fields);
-  }
+ private:
+  Array<Type> MutateArray(Array<Type> arr);
+};
 
-  Type VisitType_(const TypeRelationNode* type_rel) override {
-    std::vector<Type> new_args;
-    for (const Type& t : type_rel->args) {
-      new_args.push_back(this->VisitType(t));
-    }
-    return TypeRelationNode::make(type_rel->func,
-                                  new_args,
-                                  type_rel->num_inputs,
-                                  type_rel->attrs);
-  }
+/*!
+ * \brief Bind free type variables in the type.
+ * \param type The type to be updated.
+ * \param args_map The binding map.
+ */
+Type Bind(const Type& type, const Map<TypeVar, Type>& args_map);
 
-  Type VisitType_(const IncompleteTypeNode* op) override {
-    return GetRef<Type>(op);
-  }
-};
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_IR_TYPE_FUNCTOR_H_
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
index 6f8dce3875ae..4c814bc1614f 100644
--- a/src/relay/op/op_common.h
+++ b/src/relay/op/op_common.h
@@ -71,7 +71,8 @@ std::vector<T> AsVector(const Array<T> &array) {
     .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
     .add_argument("rhs", "Tensor", "The right hand side tensor.") \
     .add_type_rel("Broadcast", BroadcastRel)                      \
-    .set_attr<TOpPattern>("TOpPattern", kBroadcast)
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc
new file mode 100644
index 000000000000..a5d514b76556
--- /dev/null
+++ b/src/relay/pass/fold_constant.cc
@@ -0,0 +1,120 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file constant_folding.cc
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/interpreter.h>
+
+namespace tvm {
+namespace relay {
+
+using FInterpreter = runtime::TypedPackedFunc<Value(Expr)>;
+
+
+// TODO(tvm-team) consider combine dead-code with constant folder.
+// or make a more powerful partial evaluator.
+class ConstantFolder : public ExprMutator {
+ public:
+  explicit ConstantFolder(FInterpreter executor)
+      : executor_(executor) {
+  }
+
+  Expr VisitExpr_(const LetNode* op) final {
+    Expr value = this->Mutate(op->value);
+    if (value.as<ConstantNode>()) {
+      memo_[op->var] = value;
+      return this->Mutate(op->body);
+    } else {
+      Var var = Downcast<Var>(this->Mutate(op->var));
+      Expr body = this->Mutate(op->body);
+      if (var.same_as(op->var) &&
+          value.same_as(op->value) &&
+          body.same_as(op->body)) {
+        return GetRef<Expr>(op);
+      } else {
+        return LetNode::make(var, value, body);
+      }
+    }
+  }
+
+  Expr VisitExpr_(const CallNode* call) final {
+    static auto op_stateful = Op::GetAttr<TOpIsStateful>("TOpIsStateful");
+    Expr res = ExprMutator::VisitExpr_(call);
+    call = res.as<CallNode>();
+    // We don't constant fold function with zero arguments.
+    // This is a heuristic that is useful.
+    // For example it is harmful to fold ones(shape=(4, 5)).
+    if (call->args.size() == 0) return res;
+    const OpNode* op = call->op.as<OpNode>();
+    if (op == nullptr) return res;
+    // skip stateful ops.
+    if (op_stateful.get(GetRef<Op>(op), false)) return res;
+    bool all_const_args = true;
+    for (Expr arg : call->args) {
+      if (arg.as<ConstantNode>() == nullptr) {
+        all_const_args = false;
+      }
+    }
+    if (all_const_args) {
+      return ConstEvaluate(res);
+    } else {
+      return res;
+    }
+  }
+
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    Expr res = ExprMutator::VisitExpr_(op);
+    op = res.as<TupleGetItemNode>();
+    if (const auto* tuple = op->tuple.as<TupleNode>()) {
+      return tuple->fields[op->index];
+    } else {
+      return res;
+    }
+  }
+
+ private:
+  // Internal interepreter.
+  FInterpreter executor_;
+  // Convert value to expression.
+  Expr ValueToExpr(Value value) {
+    if (const auto* val = value.as<TensorValueNode>()) {
+      return ConstantNode::make(val->data);
+    } else if (const auto* val = value.as<TupleValueNode>()) {
+      Array<Expr> fields;
+      for (Value field : val->fields) {
+        fields.push_back(ValueToExpr(field));
+      }
+      return TupleNode::make(fields);
+    } else {
+      LOG(FATAL) << "Cannot handle " << value->type_key();
+      return Expr();
+    }
+  }
+  // Constant evaluate a expression.
+  Expr ConstEvaluate(Expr expr) {
+    expr = InferType(expr, Module(nullptr));
+    expr = FuseOps(expr, 0);
+    expr = InferType(expr, Module(nullptr));
+    return ValueToExpr(executor_(expr));
+  }
+};
+
+
+Expr FoldConstant(const Expr& expr) {
+  DLContext ctx;
+  ctx.device_type = kDLCPU;
+  ctx.device_id = 0;
+  Target target = Target::create("llvm");
+  return ConstantFolder(CreateInterpreter(
+      Module(nullptr), ctx, target)).Mutate(expr);
+}
+
+TVM_REGISTER_API("relay._ir_pass.FoldConstant")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = FoldConstant(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/pass_util.h b/src/relay/pass/pass_util.h
index bf52297e8930..d42494409b53 100644
--- a/src/relay/pass/pass_util.h
+++ b/src/relay/pass/pass_util.h
@@ -22,6 +22,23 @@ namespace relay {
 std::unordered_map<const Node*, size_t>
 GetExprRefCount(const Expr& body);
 
+/*!
+ * \brief Substitute var with subst.
+ * \param type The type to be substituted.
+ * \param tvar The type variable to be substituted.
+ * \param subst The target of substitution.
+ * \return The substituted result.
+ */
+Type TypeSubst(const Type& type, const TypeVar& tvar, const Type& subst);
+
+/*!
+ * \brief Substitute type vars in type.
+ * \param type The type to be substituted.
+ * \param subst_map The map of substitution.
+ * \return The substituted result.
+ */
+Type TypeSubst(const Type& type, const tvm::Map<TypeVar, Type>& subst_map);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PASS_UTIL_H_
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 5cabfbdabc49..13da159e99a8 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -24,7 +24,7 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/pass.h>
 #include "type_solver.h"
-#include "type_subst.h"
+#include "../ir/type_functor.h"
 
 namespace tvm {
 namespace relay {
@@ -278,7 +278,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
     Type inst_ty = FuncTypeNode::make(fn_ty->arg_types,
                                       ret_type, {},
                                       fn_ty->type_constraints);
-    inst_ty = TypeSubst(inst_ty, subst_map);
+    inst_ty = Bind(inst_ty, subst_map);
     return Downcast<FuncType>(inst_ty);
   }
 
diff --git a/src/relay/pass/type_subst.cc b/src/relay/pass/type_subst.cc
deleted file mode 100644
index 76507058f059..000000000000
--- a/src/relay/pass/type_subst.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file type_subst.cc
- * \brief Function for substituting a concrete type in place of a type ID
- */
-#include "./type_subst.h"
-#include "../ir/type_functor.h"
-
-namespace tvm {
-namespace relay {
-
-struct TypeSubstV : TypeMutator {
-  tvm::Map<TypeVar, Type> subst_map;
-
-  explicit TypeSubstV(tvm::Map<TypeVar, Type> subst_map)
-    : subst_map(subst_map) {}
-
-  Type VisitType_(const TypeVarNode* op) override {
-    auto id = GetRef<TypeVar>(op);
-    if (subst_map.find(id) != subst_map.end()) {
-      return this->subst_map[id];
-    } else {
-      return id;
-    }
-  }
-};
-
-Type TypeSubst(const Type& type, const TypeVar& target, const Type& subst) {
-  TypeSubstV ty_sub({ {target, subst} });
-  return ty_sub.VisitType(type);
-}
-
-Type TypeSubst(const Type& type, tvm::Map<TypeVar, Type> subst_map) {
-  TypeSubstV ty_sub(subst_map);
-  return ty_sub.VisitType(type);
-}
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/pass/type_subst.h b/src/relay/pass/type_subst.h
deleted file mode 100644
index 808e3536ae30..000000000000
--- a/src/relay/pass/type_subst.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file src/tvm/relay/pass/type_subst.h
- * \brief Utility functions for substituting types.
- */
-#ifndef TVM_RELAY_PASS_TYPE_SUBST_H_
-#define TVM_RELAY_PASS_TYPE_SUBST_H_
-
-#include <tvm/relay/expr.h>
-
-namespace tvm {
-namespace relay {
-
-Type TypeSubst(const Type& type, const TypeVar& target, const Type& subst);
-Type TypeSubst(const Type& type, tvm::Map<TypeVar, Type> subst_map);
-
-}  // namespace relay
-}  // namespace tvm
-#endif  // TVM_RELAY_PASS_TYPE_SUBST_H_
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index ebc4e6fc16e6..8f7179deea53 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -13,7 +13,6 @@ namespace tvm {
 namespace relay {
 
 // FreeTypeVar
-
 class FreeTypeVarTVisitor : public TypeVisitor {
  public:
   FreeTypeVarTVisitor(
diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
index 7f857b72ad1c..7b610f82f6a5 100644
--- a/tests/python/relay/test_backend_graph_runtime.py
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -1,6 +1,8 @@
 import numpy as np
 
+import tvm
 from tvm import relay
+from tvm.contrib import graph_runtime
 from tvm.relay.ir_pass import infer_type
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay.op import add
@@ -27,7 +29,7 @@ def check_rts(expr, args, expected_result, mod=None):
     graph = relay.create_executor('graph', mod=mod)
     eval_result = intrp.evaluate(expr)(*args)
     rts_result = graph.evaluate(expr)(*args)
-    np.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
+    tvm.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
 
 def test_add_op_scalar():
     """
@@ -71,7 +73,26 @@ def test_add_op_broadcast():
     y_data = np.random.rand(1, 5).astype('float32')
     check_rts(func, [x_data, y_data], x_data + y_data)
 
+
+def test_with_params():
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(1, 5))
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(1, 5).astype('float32')
+    params = {"y": y_data}
+    graph, lib, params = relay.build(func, "llvm", params=params)
+    mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    mod.set_input(**params)
+    mod.set_input(x=x_data)
+    mod.run()
+    res = mod.get_output(0).asnumpy()
+    ref_res = y_data + x_data
+    tvm.testing.assert_allclose(res, ref_res)
+
+
 if __name__ == "__main__":
+    test_with_params()
     test_add_op_scalar()
     test_add_op_tensor()
     test_add_op_broadcast()
diff --git a/tests/python/relay/test_ir_bind.py b/tests/python/relay/test_ir_bind.py
new file mode 100644
index 000000000000..8377bb9fb953
--- /dev/null
+++ b/tests/python/relay/test_ir_bind.py
@@ -0,0 +1,23 @@
+""" test bind function."""
+import tvm
+from tvm import relay
+
+
+def test_bind_params():
+    x = relay.var("x")
+    y = relay.var("y")
+    z = relay.add(x, y)
+    f = relay.Function([x, y], z)
+    fbinded = relay.bind(f, {x : relay.const(1, "float32")})
+    fexpected =relay.Function(
+        [y],
+        relay.add(relay.const(1, "float32"),  y))
+    assert relay.ir_pass.alpha_equal(fbinded, fexpected)
+
+    zbinded = relay.bind(z, {y: x})
+    zexpected = relay.add(x, x)
+    assert relay.ir_pass.alpha_equal(zbinded, zexpected)
+
+
+if __name__ == "__main__":
+    test_bind_params()
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
new file mode 100644
index 000000000000..4d9e397be975
--- /dev/null
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -0,0 +1,75 @@
+import numpy as np
+from tvm import relay
+
+
+def test_fold_const():
+    c_data = np.array([1, 2, 3]).astype("float32")
+    def before():
+        c = relay.const(c_data)
+        x = relay.var("x")
+        y = relay.add(c, c)
+        y = relay.multiply(y, relay.const(2, "float32"))
+        y = relay.add(x, y)
+        z = relay.add(y, c)
+        return relay.Function([x], z)
+
+    def expected():
+        x = relay.var("x")
+        c_folded = (c_data + c_data) * 2
+        y = relay.add(x, relay.const(c_folded))
+        z = relay.add(y, relay.const(c_data))
+        return relay.Function([x], z)
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.alpha_equal(zz, zexpected)
+
+
+def test_fold_let():
+    c_data = np.array(1).astype("float32")
+    def before():
+        sb = relay.ScopeBuilder()
+        x = relay.var("x")
+        t1 = sb.let("t1", relay.const(c_data))
+        t2 = sb.let("t2", relay.add(t1, t1))
+        t3 = sb.let("t3", relay.add(t2, x))
+        sb.ret(t3)
+        return relay.Function([x], sb.get())
+
+    def expected():
+        sb = relay.ScopeBuilder()
+        x = relay.var("x")
+        c_folded = (c_data + c_data)
+        t3 = sb.let("t3", relay.add(relay.const(c_folded), x))
+        sb.ret(t3)
+        return relay.Function([x], sb.get())
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
+def test_fold_tuple():
+    c_data = np.array(1).astype("float32")
+    def before():
+        c = relay.const(c_data)
+        x = relay.var("x")
+        y = relay.Tuple([x, c])
+        z = relay.add(y[1], c)
+        z = relay.add(z, y[0])
+        return relay.Function([x], z)
+
+    def expected():
+        c = relay.const(c_data + c_data)
+        x = relay.var("x")
+        z = relay.add(c, x)
+        return relay.Function([x], z)
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
+if __name__ == "__main__":
+    test_fold_const()
+    test_fold_let()
+    test_fold_tuple()

From 654e8c5b6b6a5b6e29d493f44729488b78217923 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 14 Nov 2018 16:42:23 -0800
Subject: [PATCH 356/529] [RELAY][PASS] FuseOps, fix input fusion rule for
 conv2d (#2110)

---
 src/relay/pass/fuse_ops.cc               | 35 ++++++++++++++----------
 tests/python/relay/test_pass_fuse_ops.py |  9 +++++-
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 54dcdf1e8fc6..cb5f86f4b525 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -464,14 +464,15 @@ class GraphPartitioner {
     return true;
   }
   /*!
-   * \brief Check all the node between src and sink satisfies fcond.
+   * \brief Check all the node and edge pattern
+   *  between src and sink satisfies fcond.
    *
-   * src and sink are not checked.
+   * src is not checked.
    *
    * \param src The source node.
    * \param sink The termination node.
    * \param fcond The condition to be checked.
-   * \tparam F the condition function.
+   * \tparam F the condition function, with signature
    * \note sink must be a post-dominator of src.
    */
   template<typename F>
@@ -596,18 +597,24 @@ class GraphPartitioner {
           }
         }
       } else if (group_node->pattern <= kBroadcast) {
-        // The fuse can be executed if all the intermediate ops are still broadcast.
-        auto fcond = [](OpPatternKind kind, bool is_sink) {
-          if (!is_sink) {
-            return kind <= kBroadcast;
-          } else {
-            return (kind <= kBroadcast ||
-                    kind == kCommReduce ||
-                    kind == kOutEWiseFusable);
+        // Pre-condition: can only be fused to parent which is injective or reduction.
+        if (dom_node->parent != nullptr &&
+            (dom_node->pattern <= kInjective ||
+             dom_node->pattern == kCommReduce)) {
+          // Check if all the intermediate ops are still broadcast.
+          // The final terminal node can already be fused to a OutEWiseFusable group.
+          auto fcond = [](OpPatternKind kind, bool is_sink) {
+            if (!is_sink) {
+              return kind <= kBroadcast;
+            } else {
+              return (kind <= kBroadcast ||
+                      kind == kCommReduce ||
+                      kind == kOutEWiseFusable);
+            }
+          };
+          if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+            CommitFuse(graph_node, dom_node->parent->gnode);
           }
-        };
-        if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
-          CommitFuse(graph_node, dom_node->parent->gnode);
         }
       } else if (group_node->pattern == kInjective) {
         // defer injective fusion to second phase.
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index 19bec20ac4af..27806791c399 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -29,10 +29,12 @@ def expected():
 
 
 
+
 def test_conv2d_fuse():
     """Test fusion case of conv2d"""
     def before(dshape):
         x = relay.var("x", shape=dshape)
+        x = relay.add(x, relay.const(1, "float32"))
         y = relay.nn.conv2d(x, relay.var("w1"),
                             kernel_size=(3, 3),
                             padding=(1, 1),
@@ -54,6 +56,10 @@ def before(dshape):
         return relay.Function(relay.ir_pass.free_vars(z), z)
 
     def expected(dshape):
+        # segment 0
+        x = relay.var("p0", shape=dshape)
+        y = relay.add(x, relay.const(1, "float32"))
+        f0 = relay.Function([x], y)
         # segment 1
         x = relay.var("p0", shape=dshape)
         w = relay.var("p1")
@@ -84,7 +90,8 @@ def expected(dshape):
         f3 = relay.Function([x, w, offset], z3)
         # compose
         x = relay.var("x", shape=dshape)
-        y = relay.Call(f1, [x, relay.var("w1")])
+        y = relay.Call(f0, [x])
+        y = relay.Call(f1, [y, relay.var("w1")])
         z2 = relay.Call(f2, [y, relay.var("w3")])
         z3 = relay.Call(f3, [y, relay.var("w2"), z2])
         z = z3

From 0f40aa2bcbae611f536efc8a7c4379ff7087bf19 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 14 Nov 2018 16:45:36 -0800
Subject: [PATCH 357/529] [Bugfix] Recover original layout when alter_layout
 function return None (#2101)

---
 nnvm/src/compiler/alter_op_layout.cc          | 25 +++++----
 .../python/compiler/test_alter_op_layout.py   | 56 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/nnvm/src/compiler/alter_op_layout.cc b/nnvm/src/compiler/alter_op_layout.cc
index b02655fc8925..f62e39efd9eb 100644
--- a/nnvm/src/compiler/alter_op_layout.cc
+++ b/nnvm/src/compiler/alter_op_layout.cc
@@ -46,7 +46,7 @@ Graph AlterOpLayout(const Graph& src) {
 
   std::vector<std::vector<Layout> > in_layouts_of_node(idx_graph.num_nodes());
   std::vector<std::vector<Layout> > out_layouts_of_node(idx_graph.num_nodes());
-  std::unordered_map<const Node*, uint32_t> new_nodes;
+  std::unordered_map<const Node*, uint32_t> unchanged_nodes;
 
   if (src.HasAttr("layout")) {
     // record layouts so that LayoutTransform pass can fix layouts correctly,
@@ -56,10 +56,8 @@ Graph AlterOpLayout(const Graph& src) {
     const auto& layouts = src.GetAttr<std::vector<Layout> >("layout");
     for (uint32_t nid = 0; nid < idx_graph.num_nodes(); ++nid) {
       const auto &inode = idx_graph[nid];
-      if (falter_op_layout.count(inode.source->op())) {
-        // do not record input layouts of nodes that will be replaced.
-        continue;
-      }
+      // record input layouts for all nodes,
+      // while replaced nodes will ignore the records here and have undefined input layouts.
       std::vector<Layout> in_layout;
       for (const auto& e : inode.inputs) {
         in_layout.emplace_back(layouts[idx_graph.entry_id(e)]);
@@ -80,7 +78,8 @@ Graph AlterOpLayout(const Graph& src) {
     nnvm::compiler::FTVMAlterOpLayout fn_alter_op_layout =
       falter_op_layout.get(n->op(), nullptr);
     if (fn_alter_op_layout == nullptr) {
-      new_nodes[n.get()] = nid;
+      // will restore the original input layouts later.
+      unchanged_nodes[n.get()] = nid;
       return false;
     }
 
@@ -106,7 +105,13 @@ Graph AlterOpLayout(const Graph& src) {
     Symbol op;
     bool do_alter =
       fn_alter_op_layout(n->attrs, Symbol::CreateGroup(op_inputs), tensor_infos, &op);
-    if (do_alter) *ret = op.outputs;
+
+    if (do_alter) {
+      *ret = op.outputs;
+    } else {
+      // will restore the original input layouts later.
+      unchanged_nodes[n.get()] = nid;
+    }
     return do_alter;
   };
 
@@ -118,15 +123,15 @@ Graph AlterOpLayout(const Graph& src) {
     std::vector<Layout> ret_layouts(ret_idx.num_node_entries(), Layout::Undef());
     for (uint32_t nid = 0; nid < ret_idx.num_nodes(); ++nid) {
       const auto& inode = ret_idx[nid];
-      if (new_nodes.count(inode.source)) {
+      if (unchanged_nodes.count(inode.source)) {
         const std::vector<Layout>& in_layouts =
-          in_layouts_of_node[new_nodes[inode.source]];
+          in_layouts_of_node[unchanged_nodes[inode.source]];
         for (uint32_t i = 0; i < inode.inputs.size(); ++i) {
           const auto& e = inode.inputs[i];
           ret_layouts[ret_idx.entry_id(e)] = in_layouts[i];
         }
         const std::vector<Layout>& out_layouts =
-          out_layouts_of_node[new_nodes[inode.source]];
+          out_layouts_of_node[unchanged_nodes[inode.source]];
         for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
           ret_layouts[ret_idx.entry_id(nid, i)] = out_layouts[i];
         }
diff --git a/nnvm/tests/python/compiler/test_alter_op_layout.py b/nnvm/tests/python/compiler/test_alter_op_layout.py
index 0fbf5ad3b479..cc3df61a28c7 100644
--- a/nnvm/tests/python/compiler/test_alter_op_layout.py
+++ b/nnvm/tests/python/compiler/test_alter_op_layout.py
@@ -45,9 +45,61 @@ def alter_conv2d_layout(attrs, inputs, tinfos):
 
     # check copy layouts
     for node in ["data", "relu", "flatten", "softmax", "conv_weight"]:
-        assert(layouts[node] == layouts_origin[node])
-    assert(layouts["conv_alter"] == layouts_origin["conv"])
+        assert layouts[node] == layouts_origin[node]
+    assert layouts["conv_alter"] == layouts_origin["conv"]
+
+
+def test_consecutive_alter_layout():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    pool1 = sym.global_avg_pool2d(data, name="global_avg_pool2d_1", layout="NCHW")
+    pool2 = sym.global_avg_pool2d(pool1, name="global_avg_pool2d_2", layout="NCHW")
+    relu = sym.relu(pool2, name="relu")
+
+    g = graph.create(relu)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
+
+    @reg.register_alter_op_layout("global_avg_pool2d", level=100)
+    def alter_global_avg_pool2d_layout(attrs, inputs, tinfos):
+        new_attrs = {k : attrs[k] for k in attrs.keys()}
+        new_attrs["layout"] = "NCHW16c"
+        return sym.global_avg_pool2d(inputs[0], **new_attrs)
+
+    g = g.apply("AlterOpLayout")
+
+    # pool1 get replaced - output layout of pool1 is not recorded
+    # pool2 get replaced - input layout of pool2 is not recorded
+    # thus the second entry must be undefined - it can neither recover from pool1's output,
+    # nor from pool2's input.
+    assert g.json_attr("layout") == ['NCHW', '__undef__', 'NCHW', 'NCHW']
+
+
+def test_alter_func_return_none():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    pool1 = sym.global_max_pool2d(data, name="pool1", layout="NCHW")
+    pool2 = sym.global_max_pool2d(pool1, name="pool2", layout="NCHW")
+    relu = sym.relu(pool2, name="relu")
+
+    g = graph.create(relu)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
+
+    @reg.register_alter_op_layout("global_max_pool2d", level=100)
+    def alter_global_max_pool2d_layout(attrs, inputs, tinfos):
+        return None
+
+    g = g.apply("AlterOpLayout")
+
+    # alter func return none, nothing get replaced,
+    # the layouts should remain the same
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
 
 
 if __name__ == "__main__":
     test_alter_conv2d_layout()
+    test_consecutive_alter_layout()
+    test_alter_func_return_none()

From 1387e78fddfbebcbaaacdfd20bb3724b2aac460e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 14 Nov 2018 18:59:39 -0800
Subject: [PATCH 358/529] [RELAY] bugfix type functor caching (#2113)

---
 src/relay/ir/type_functor.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc
index fc0daa3cb9c6..0ef1743cbbc4 100644
--- a/src/relay/ir/type_functor.cc
+++ b/src/relay/ir/type_functor.cc
@@ -98,10 +98,10 @@ Type TypeMutator::VisitType_(const FuncTypeNode* op) {
   }
 
   Array<Type> new_args = MutateArray(op->arg_types);
-  changed = changed || new_args.same_as(op->arg_types);
+  changed = changed || !new_args.same_as(op->arg_types);
 
   Type new_ret_type = VisitType(op->ret_type);
-  changed = changed || new_ret_type.same_as(op->ret_type);
+  changed = changed || !new_ret_type.same_as(op->ret_type);
 
   if (!changed) return GetRef<Type>(op);
   return FuncTypeNode::make(new_args,

From 6e4e3b29dab7b91559f15f0be71278b8e114cd12 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 14 Nov 2018 22:01:15 -0800
Subject: [PATCH 359/529] [RELAY][PASS] Make FoldConst context and target
 invariant (#2114)

---
 src/relay/backend/compile_engine.cc           | 5 +++--
 src/relay/pass/fold_constant.cc               | 4 ++++
 tests/python/relay/test_pass_fold_constant.py | 9 ++++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index dc094e00e05b..b10e9f2e2ea3 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -82,8 +82,6 @@ class ScheduleGetter :
       }
     }
     readable_name_stream_ << "fused";
-    // enter the target context
-    TargetContext target_ctx(target_);
     cache_node->outputs = this->VisitExpr(prim_func->body);
     cache_node->func_name = readable_name_stream_.str();
     CachedFunc cfunc(cache_node);
@@ -284,6 +282,9 @@ class CompileEngineImpl : public CompileEngineNode {
       value->use_count = 0;
       cache_[key] = value;
     }
+    // Enforce use the target.
+    TargetContext target_ctx(key->target);
+
     CHECK(!value->cached_func.defined());
     auto spair = CreateSchedule(key->source_func, key->target);
     auto cache_node = make_node<CachedFuncNode>(
diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc
index a5d514b76556..6237bcdce7a8 100644
--- a/src/relay/pass/fold_constant.cc
+++ b/src/relay/pass/fold_constant.cc
@@ -107,6 +107,10 @@ Expr FoldConstant(const Expr& expr) {
   ctx.device_type = kDLCPU;
   ctx.device_id = 0;
   Target target = Target::create("llvm");
+  // use a fresh build context
+  // in case we are already in a build context.
+  BuildConfigContext fresh_build_ctx(build_config());
+
   return ConstantFolder(CreateInterpreter(
       Module(nullptr), ctx, target)).Mutate(expr);
 }
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 4d9e397be975..250cfc70cc28 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -1,4 +1,5 @@
 import numpy as np
+import tvm
 from tvm import relay
 
 
@@ -19,7 +20,13 @@ def expected():
         y = relay.add(x, relay.const(c_folded))
         z = relay.add(y, relay.const(c_data))
         return relay.Function([x], z)
-    zz = relay.ir_pass.fold_constant(before())
+
+    def fail(x):
+        raise RuntimeError()
+    # the fold constant should work on any context.
+    with tvm.build_config(add_lower_pass=[(0, fail)]):
+        with tvm.target.create("cuda"):
+            zz = relay.ir_pass.fold_constant(before())
     zexpected = expected()
     assert relay.ir_pass.alpha_equal(zz, zexpected)
 

From d5dade451e6682f05cd10b852e5940045df938a9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 14 Nov 2018 22:21:03 -0800
Subject: [PATCH 360/529] [NNPACK] temporary disable nnpack test (#2115)

---
 tests/python/contrib/test_nnpack.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index 0b275fb812bf..151869729d42 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -21,6 +21,7 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_output", True):
             print("skip because extern function is not available")
             return
+        return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
@@ -51,6 +52,7 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
+        return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
         a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), ctx)
@@ -128,6 +130,7 @@ def verify(target="llvm",
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
+        return
         ctx = tvm.cpu(0)
         output = nnpack.convolution_inference(
             data, kernel, bias if with_bias else None,
@@ -189,7 +192,7 @@ def verify(target="llvm",
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
-
+        return
         ctx = tvm.cpu(0)
         transformed_kernel = nnpack.convolution_inference_weight_transform(
             kernel, algorithm=algorithm)
@@ -246,6 +249,7 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
+        return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [data, kernel, bias, output], target)
 

From f666b42f3c060663d46ea34be07d9ae4b075b86b Mon Sep 17 00:00:00 2001
From: Ruslan Baratov <ruslan_baratov@yahoo.com>
Date: Thu, 15 Nov 2018 21:01:14 +0000
Subject: [PATCH 361/529] Docs: Fix links (#2118)

---
 docs/install/from_source.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 84bfa3c63bf0..5f73e4b1d562 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -67,13 +67,13 @@ The configuration of tvm can be modified by `config.cmake`.
 
   - LLVM 4.0 or higher is needed for build with LLVM. Note that verison of LLVM from default apt may lower than 4.0.
   - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from
-    [LLVM Download Page](http://releases.llvm.org/download.html).
+    `LLVM Download Page <http://releases.llvm.org/download.html>`_.
 
 
     - Unzip to a certain location, modify ``build/config.cmake`` to add ``set(USE_LLVM /path/to/your/llvm/bin/llvm-config)``
     - You can also directly set ``set(USE_LLVM ON)`` and let cmake search for a usable version of LLVM.
 
-  - You can also use [LLVM Nightly Ubuntu Build](https://apt.llvm.org/)
+  - You can also use `LLVM Nightly Ubuntu Build <https://apt.llvm.org/>`_
 
     - Note that apt-package append ``llvm-config`` with version number.
       For example, set ``set(LLVM_CONFIG llvm-config-4.0)`` if you installed 4.0 package

From e5443cdefff768fd30ec22b0b02c8ab9920ffe34 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Fri, 16 Nov 2018 05:12:14 +0800
Subject: [PATCH 362/529] Fix doc of strided_slice (#2103)

---
 include/tvm/relay/attrs/transform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 4d2008628d3a..fc539f3ce742 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -133,7 +133,7 @@ struct StridedSliceAttrs : public tvm::AttrsNode<StridedSliceAttrs> {
     TVM_ATTR_FIELD(begin)
         .describe("Indices for begin of slice, begin index is also inclusive");
     TVM_ATTR_FIELD(end)
-        .describe("Indices for end of slice, end index is also inclusive");
+        .describe("Indices for end of slice, end index is exclusive");
     TVM_ATTR_FIELD(strides).set_default(Array<Integer>({}))
         .describe("Stride values of the slice");
   }

From 67920406362a746b3d5046dc361281f614297ea4 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Thu, 15 Nov 2018 14:01:36 -0800
Subject: [PATCH 363/529] [NNPACK] Add check for NNPACK being available
 (`nnp_initialize()` succeeding) (#2119)

This fixes issues with failing tests on PowerPC.
---
 python/tvm/contrib/nnpack.py        | 13 ++++---------
 src/contrib/nnpack/nnpack_utils.cc  |  5 +++--
 tests/python/contrib/test_nnpack.py | 20 +++++++++++++++-----
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py
index 36f8a76a87db..3fb00a3f85e5 100644
--- a/python/tvm/contrib/nnpack.py
+++ b/python/tvm/contrib/nnpack.py
@@ -5,16 +5,11 @@
 from .. import intrin as _intrin
 from .._ffi.function import _init_api
 
-def config(nthreads):
-    """Configure the nnpack library.
-
-    Parameters
-    ----------
-    nthreads : int
-        The threads number of nnpack thread pool, must be a nonnegative.
-
+def is_available():
+    """Check whether NNPACK is available, that is, `nnp_initialize()`
+    returns `nnp_status_success`.
     """
-    _Config(nthreads)
+    return _initialize() == 0
 
 def fully_connected_inference(lhs, rhs, nthreads=1):
     """Create an extern op that compute fully connected of 1D tensor lhs and
diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc
index d8ef1d0b8327..12eb828cc7e6 100644
--- a/src/contrib/nnpack/nnpack_utils.cc
+++ b/src/contrib/nnpack/nnpack_utils.cc
@@ -38,9 +38,10 @@ bool NNPackConfig(uint64_t nthreads) {
 }
 
 
-TVM_REGISTER_GLOBAL("contrib.nnpack._Config")
+TVM_REGISTER_GLOBAL("contrib.nnpack._initialize")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    CHECK(NNPackConfig(args[0]));
+    *ret = nnp_initialize();
   });
+
 }  // namespace contrib
 }  // namespace tvm
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index 151869729d42..a4b77a39af63 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -21,7 +21,9 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_output", True):
             print("skip because extern function is not available")
             return
-        return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
@@ -52,7 +54,9 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
-        return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
         a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), ctx)
@@ -130,7 +134,9 @@ def verify(target="llvm",
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
-        return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         output = nnpack.convolution_inference(
             data, kernel, bias if with_bias else None,
@@ -192,7 +198,9 @@ def verify(target="llvm",
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
-        return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         transformed_kernel = nnpack.convolution_inference_weight_transform(
             kernel, algorithm=algorithm)
@@ -249,7 +257,9 @@ def verify(target="llvm"):
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
-        return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         f = tvm.build(s, [data, kernel, bias, output], target)
 

From e46ac1a48a908904438308711b2dad00f9765b23 Mon Sep 17 00:00:00 2001
From: David Hirvonen <dhirvonen@elucideye.com>
Date: Thu, 15 Nov 2018 23:58:12 -0500
Subject: [PATCH 364/529] =?UTF-8?q?clarify=20NNVM=E2=80=99s=20LLVM=20requi?=
 =?UTF-8?q?rement=20(#2117)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/install/from_source.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 5f73e4b1d562..10d86ebe6243 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -42,7 +42,8 @@ The minimal building requirements are
 - A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
 - CMake 3.5 or higher
 - We highly recommend to build with LLVM to enable all the features.
-- It is possible to build without llvm dependency if we only want to use CUDA/OpenCL
+- It is possible to build TVM without the LLVM dependency if we only want to use CUDA/OpenCL
+- If we want to use the NNVM compiler, then LLVM is required
 
 We use cmake to build the library.
 The configuration of tvm can be modified by `config.cmake`.

From 194a3739b151351722f6b92bf41da166494311c3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 15 Nov 2018 22:11:41 -0800
Subject: [PATCH 365/529] [RELAY][[PASS] Consolidate ForwardRewrite pass.
 (#2124)

---
 include/tvm/relay/expr.h          |  26 +++
 include/tvm/relay/op.h            |  40 +++++
 include/tvm/relay/op_attr_types.h |  19 +++
 include/tvm/relay/pass.h          |  11 ++
 include/tvm/runtime/packed_func.h |   2 +
 src/relay/pass/fold_scale_axis.cc | 267 ++++++++++--------------------
 src/relay/pass/forward_rewrite.cc | 132 +++++++++++++++
 7 files changed, 319 insertions(+), 178 deletions(-)
 create mode 100644 src/relay/pass/forward_rewrite.cc

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 2319f8baec00..c72612791b52 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -415,6 +415,32 @@ class TupleGetItemNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(TupleGetItem, TupleGetItemNode, Expr);
 
+/*!
+ * \brief Base class of the temporary expression.
+ *
+ * TempExprs are pass specific expression that can be
+ * useful to define intermediate result in the
+ * rewriting pass such as layout or type transformation.
+ *
+ * Subclass TempExprNode allows us to pattern match on
+ * specific kind TempExpr and use them for expression rewriting.
+ *
+ * TempExpr should only be used within a pass,
+ */
+class TempExprNode : public ExprNode {
+ public:
+  /*!
+   * \brief Convert the expression to a normal(non-temp) Expr.
+   * \return The corresponding normal(non-temp) expression.
+   */
+  virtual Expr Realize() const = 0;
+
+  static constexpr const char* _type_key = "relay.TempExpr";
+  TVM_DECLARE_BASE_NODE_INFO(TempExprNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr);
+
 // implementataions
 template<typename TTypeNode>
 inline const TTypeNode* ExprNode::type_as() const {
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
index ad447ad13cee..d3c5edd31461 100644
--- a/include/tvm/relay/op.h
+++ b/include/tvm/relay/op.h
@@ -276,6 +276,16 @@ class GenericOpMap {
    */
   template <typename ValueType>
   inline ValueType get(const Op& op, ValueType def_value) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param expr The key to the map
+   * \param def_value The default value when the key does not exist
+   *         or if expr is not an Op.
+   * \return the const reference to the content value.
+   * \tparam ValueType The content value type.
+   */
+  template <typename ValueType>
+  inline ValueType get(const Expr& expr, ValueType def_value) const;
 
  private:
   friend class OpRegistry;
@@ -313,6 +323,14 @@ class OpMap {
    * \return the const reference to the content value.
    */
   inline ValueType get(const Op& op, ValueType def_value) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param expr The key to the map
+   * \param def_value The default value when the key does not exist
+   *         or if expr is not an Op.
+   * \return the const reference to the content value.
+   */
+  inline ValueType get(const Expr& expr, ValueType def_value) const;
 
  private:
   friend class Op;
@@ -496,6 +514,21 @@ inline ValueType GenericOpMap::get(const Op& op, ValueType value) const {
   }
 }
 
+template <typename ValueType>
+inline ValueType GenericOpMap::get(const Expr& expr, ValueType value) const {
+  CHECK(expr.defined());
+  if (const OpNode* op = expr.as<OpNode>()) {
+    const uint32_t idx = op->index_;
+    if (idx < data_.size() && data_[idx].second != 0) {
+      return data_[idx].first;
+    } else {
+      return value;
+    }
+  } else {
+    return value;
+  }
+}
+
 template <typename ValueType>
 inline int OpMap<ValueType>::count(const Op& op) const {
   return map_.count(op);
@@ -505,12 +538,19 @@ template <typename ValueType>
 inline ValueType OpMap<ValueType>::operator[](const Op& op) const {
   return map_[op];
 }
+
 template <typename ValueType>
 inline ValueType OpMap<ValueType>::get(const Op& op,
                                        ValueType def_value) const {
   return map_.get<ValueType>(op, def_value);
 }
 
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::get(const Expr& expr,
+                                       ValueType def_value) const {
+  return map_.get<ValueType>(expr, def_value);
+}
+
 /*!
  * \brief Check that an expression is a "primtive operator".
  *
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index f80d51772ae2..3d9fa56855c3 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -85,6 +85,25 @@ using FTVMSchedule = runtime::TypedPackedFunc<
   Schedule(const Attrs& attrs,
            const Array<Tensor>& outs,
            const Target& target)>;
+
+/*!
+ * \brief Forward rewriting rule for a specific op.
+ *
+ * \param ref_call The reference old call type to be rewritten.
+ *                 We can make use of the op and type information.
+ * \param new_args The new arguments (some of them could be TempExpr).
+ * \param ctx  Optional context information about ref_call.
+ * \return The rewriten result call, can also return nullptr,
+ *         which indicate the rewriter should use the default fallback
+ *         rule that realizes all its input and compose the call.
+ *
+ * \note When we register the function, we can register
+ *       a different signature with ctx to be a specific node type.
+ */
+using FForwardRewrite = runtime::TypedPackedFunc<
+  Expr(const Call& ref_call,
+       const Array<Expr>& new_args,
+       const NodeRef& ctx)>;
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_ATTR_TYPES_H_
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 3ca81ebd027d..4410ed0d0de1 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -158,6 +158,17 @@ Expr FoldConstant(const Expr& expr);
  */
 Expr FuseOps(const Expr& expr, int fuse_opt_level);
 
+/*!
+ * \brief Apply rewrite rules to rewrite the expr in post DFS order.
+ * \param expr The expression.
+ * \param rewrite_map_attr_name The Op's attr name which corresponds to the rewrite
+ *                              rule function.
+ * \param fcontext Additional callback to provide context argument for each call node.
+ * \return The rewritten expression.
+ */
+Expr ForwardRewrite(const Expr& expr,
+                    const std::string& rewrite_map_attr_name,
+                    std::function<NodeRef(const Call&)> fcontext = nullptr);
 
 /*! \brief A hashing structure in the style of std::hash. */
 struct StructuralHash {
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index f25785d39eeb..0aeb7f2b1513 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -73,6 +73,8 @@ class PackedFunc {
   using FType = std::function<void (TVMArgs args, TVMRetValue* rv)>;
   /*! \brief default constructor */
   PackedFunc() {}
+  /*! \brief constructor from null */
+  PackedFunc(std::nullptr_t null) {}  // NOLINT(*)
   /*!
    * \brief constructing a packed function from a std::function.
    * \param body the internal container of packed function.
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index 038f34df5760..d3f7043088eb 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -87,23 +87,6 @@ AxesSet Intersect(const AxesSet& lhs, const AxesSet& rhs) {
   return ret;
 }
 
-/*!
- * \param Get function from op_map.
- * \param op_map The OpMap.
- * \param op The operator being called.
- * \tparam ValueType the content value type.
- * \return The result value map.
- */
-template<typename ValueType>
-ValueType GetFunc(const OpMap<ValueType>& op_map,
-                  const Expr& op) {
-  if (const OpNode* opnode = op.as<OpNode>()) {
-    return op_map.get(GetRef<Op>(opnode), ValueType());
-  } else {
-    return ValueType();
-  }
-}
-
 /*!
  * \brief Preparation function for pass scale forward.
  * \param call The call node.
@@ -114,7 +97,7 @@ using FForwardPrep = runtime::TypedPackedFunc<
   Array<AxesSet> (const Call& call, const AxesSet& out_scale_axes)>;
 
 /*! \brief Axis scale tuple.  */
-class STupleNode : public Node {
+class ScaledExprNode : public TempExprNode {
  public:
   /*! \brief The value */
   Expr value;
@@ -123,29 +106,26 @@ class STupleNode : public Node {
   /*! \brief The scaling factor */
   Expr scale = NullValue<Expr>();
 
+  Expr Realize() const final {
+    CHECK(!axes.defined())
+        << "outstanding scale";
+    return value;
+  }
+
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("value", &value);
     v->Visit("axes", &axes);
     v->Visit("scale", &scale);
   }
 
-  static constexpr const char* _type_key = "relay.fold_scale_axis.STupleNode";
-  TVM_DECLARE_NODE_TYPE_INFO(STupleNode, Node);
+  static constexpr const char* _type_key = "relay.fold_scale_axis.ScaledExpr";
+  TVM_DECLARE_NODE_TYPE_INFO(ScaledExprNode, TempExprNode);
 };
 
-RELAY_DEFINE_NODE_REF(STuple, STupleNode, NodeRef);
-
-/*!
- * \brief The transform function, transform an old call to
- *  a new one given the new args.
- * \param ref_call Reference call node that represent the op and the types.
- * \param expected_out_axes The scale axes allowed in the output.
- * \param sargs The input arguments.
- */
-using FForwardTransform = TypedPackedFunc<
-  STuple(const Call& ref_call,
-         const AxesSet& expected_out_axes,
-         const Array<STuple>& sargs)>;
+using FForwardRewrite = TypedPackedFunc<
+  Expr(const Call& ref_call,
+       const Array<Expr>& new_args,
+       const AxesSet& expeced_out_axes)>;
 
 //----------------------------------------------
 // Generic Visitors for FScaleAxisForward
@@ -219,7 +199,7 @@ class ForwardPrep : private ExprVisitor {
         out_axes = NullValue<AxesSet>();
       }
       // pass the message back to all the children it references.
-      auto f = GetFunc(fprep, call->op);
+      auto f = fprep.get(call->op, nullptr);
       if (f != nullptr) {
         Array<AxesSet> in_axes = f(GetRef<Call>(call), out_axes);
         CHECK_EQ(in_axes.size(), call->args.size());
@@ -261,87 +241,6 @@ class ForwardPrep : private ExprVisitor {
   }
 };
 
-class ForwardTransformer : private ExprMutator {
- public:
-  // Transform expression.
-  Expr Fold(Expr expr) {
-    expected_scale_axes_ =
-        ForwardPrep().Prepare(expr);
-    return this->Mutate(expr);
-  }
-
- private:
-  // Valid axes on each node.
-  std::unordered_map<const Node*, AxesSet> expected_scale_axes_;
-  std::unordered_map<const Node*, STuple> scale_memo_;
-  // If user simply call mutate,
-  // then only Expr is returned and we cannot
-  // accept outstanding scales.
-  Expr VisitExpr(const Expr& expr) final {
-    Expr res = ExprMutator::VisitExpr(expr);
-    CHECK(!scale_memo_.count(expr.get()))
-        << "Outstanding scale";
-    return res;
-  }
-
-  STuple GetSTuple(const Expr& expr) {
-    Expr res = ExprMutator::VisitExpr(expr);
-    auto it = scale_memo_.find(expr.get());
-    if (it != scale_memo_.end()) {
-      CHECK(it->second->value.same_as(res));
-      return it->second;
-    } else {
-      auto node = make_node<STupleNode>();
-      node->value = res;
-      return STuple(node);
-    }
-  }
-
-  Expr VisitExpr_(const CallNode* call_node) final {
-    static const auto& ftransform =
-        Op::GetAttr<FForwardTransform>("FScaleAxisForwardTransform");
-    auto new_op = this->Mutate(call_node->op);
-    bool has_scale = false;
-    bool unchanged = call_node->op.same_as(new_op);
-
-    Array<STuple> call_sargs;
-    Array<Expr> call_args;
-    for (auto arg : call_node->args) {
-      STuple new_sarg = this->GetSTuple(arg);
-      unchanged &= new_sarg->value.same_as(arg);
-      if (new_sarg->axes.defined()) has_scale = true;
-      call_sargs.push_back(new_sarg);
-      call_args.push_back(new_sarg->value);
-    }
-
-    // get expected scale axes.
-    AxesSet expected_out_axes;
-    auto axis_it = expected_scale_axes_.find(call_node);
-    if (axis_it != expected_scale_axes_.end()) {
-      expected_out_axes = axis_it->second;
-    }
-    // propagation function
-    auto f = GetFunc(ftransform, call_node->op);
-    if (f != nullptr) {
-      STuple sret = f(GetRef<Call>(call_node), expected_out_axes, call_sargs);
-      if (sret.defined()) {
-        if (sret->axes.defined()) {
-          scale_memo_[call_node] = sret;
-        }
-        return sret->value;
-      }
-    }
-    // normal path
-    CHECK(!has_scale) << "Outstanding scale, on op=" << call_node->op;
-    if (unchanged) {
-      return GetRef<Expr>(call_node);
-    } else {
-      return CallNode::make(
-          new_op, call_args, call_node->attrs, call_node->type_args);
-    }
-  }
-};
-
 //----------------------------------------------
 // Per operator defs for FScaleAxisForward
 //----------------------------------------------
@@ -351,30 +250,31 @@ Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
   return {out};
 }
 
-STuple ReluForwardTransform(const Call& ref_call,
-                            const AxesSet& expected_axes,
-                            const Array<STuple>& sargs) {
-  if (!sargs[0]->axes.defined()) return STuple();
+Expr ReluForwardRewrite(const Call& ref_call,
+                        const Array<Expr>& new_args,
+                        const AxesSet& expected_axes) {
+  const auto* input = new_args[0].as<ScaledExprNode>();
+  if (input == nullptr) return Expr(nullptr);
   // return transformed conv2d
-  auto rnode = make_node<STupleNode>();
+  auto rnode = make_node<ScaledExprNode>();
   rnode->value = CallNode::make(
-      ref_call->op, {sargs[0]->value}, ref_call->attrs, ref_call->type_args);
-  rnode->scale = sargs[0]->scale;
-  rnode->axes = sargs[0]->axes;
-  return STuple(rnode);
+      ref_call->op, {input->value}, ref_call->attrs, ref_call->type_args);
+  rnode->scale = input->scale;
+  rnode->axes = input->axes;
+  return Expr(rnode);
 }
 
 RELAY_REGISTER_OP("nn.relu")
 .set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
 
 RELAY_REGISTER_OP("nn.relu")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", ReluForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", ReluForwardRewrite);
 
 RELAY_REGISTER_OP("nn.leaky_relu")
 .set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
 
 RELAY_REGISTER_OP("nn.leaky_relu")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", ReluForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", ReluForwardRewrite);
 
 // AddSub
 Array<AxesSet> AddSubForwardPrep(const Call& call, AxesSet out_axes) {
@@ -391,69 +291,69 @@ Array<AxesSet> AddSubForwardPrep(const Call& call, AxesSet out_axes) {
   }
 }
 
-STuple AddSubForwardTransform(const Call& ref_call,
-                              const AxesSet& expected_out_axes,
-                              const Array<STuple>& sargs) {
-  if (!sargs[0]->axes.defined() && !sargs[1]->axes.defined()) {
-    return STuple();
-  }
+Expr AddSubForwardRewrite(const Call& ref_call,
+                          const Array<Expr>& new_args,
+                          const AxesSet& expected_out_axes) {
+  const auto* slhs = new_args[0].as<ScaledExprNode>();
+  const auto* srhs = new_args[1].as<ScaledExprNode>();
+  if (!slhs && !srhs) return Expr();
   const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
   const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
+  auto rnode = make_node<ScaledExprNode>();
 
-  auto rnode = make_node<STupleNode>();
-  if (sargs[0]->axes.defined()) {
-    CHECK(!sargs[1]->axes.defined());
-    CHECK(MatchBroadcastToLeftAxes(tlhs, trhs, sargs[0]->axes));
+  if (slhs != nullptr) {
+    CHECK(srhs == nullptr);
+    CHECK(MatchBroadcastToLeftAxes(tlhs, trhs, slhs->axes));
     Expr scale = ExpandBiasToMatchAxis(
-        sargs[0]->scale, tlhs->shape.size(), sargs[0]->axes);
-    Expr rhs = Divide(sargs[1]->value, scale);
-    rnode->value = CallNode::make(ref_call->op, {sargs[0]->value, rhs},
+        slhs->scale, tlhs->shape.size(), slhs->axes);
+    Expr rhs = Divide(new_args[1], scale);
+    rnode->value = CallNode::make(ref_call->op, {slhs->value, rhs},
                                   ref_call->attrs, ref_call->type_args);
-    rnode->scale = sargs[0]->scale;
-    rnode->axes = sargs[0]->axes;
+    rnode->scale = slhs->scale;
+    rnode->axes = slhs->axes;
   } else {
-    CHECK(sargs[1]->axes.defined());
-    CHECK(sargs[0]->axes.defined());
-    CHECK(MatchBroadcastToLeftAxes(trhs, tlhs, sargs[1]->axes));
+    CHECK(slhs != nullptr);
+    CHECK(MatchBroadcastToLeftAxes(trhs, tlhs, srhs->axes));
     Expr scale = ExpandBiasToMatchAxis(
-        sargs[1]->scale, trhs->shape.size(), sargs[1]->axes);
-    Expr lhs = Divide(sargs[0]->value, scale);
-    rnode->value = CallNode::make(ref_call->op, {lhs, sargs[1]->value},
+        srhs->scale, trhs->shape.size(), srhs->axes);
+    Expr lhs = Divide(new_args[0], scale);
+    rnode->value = CallNode::make(ref_call->op, {lhs, srhs->value},
                                   ref_call->attrs, ref_call->type_args);
-    rnode->scale = sargs[1]->scale;
-    rnode->axes = sargs[1]->axes;
+    rnode->scale = srhs->scale;
+    rnode->axes = srhs->axes;
   }
-  return STuple(rnode);
+  return Expr(rnode);
 }
 
 RELAY_REGISTER_OP("add")
 .set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
 
 RELAY_REGISTER_OP("add")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", AddSubForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", AddSubForwardRewrite);
 
 RELAY_REGISTER_OP("subtract")
 .set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
 
 RELAY_REGISTER_OP("subtract")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", AddSubForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", AddSubForwardRewrite);
 
 // Producer operators
 // Multiply produces the scale-axis pair.
-STuple MultiplyForwardTransform(const Call& ref_call,
-                                const AxesSet& expected_out_axes,
-                                const Array<STuple>& sargs) {
-  if (!expected_out_axes.defined()) return STuple();
+Expr MultiplyForwardRewrite(const Call& ref_call,
+                            const Array<Expr>& new_args,
+                            const AxesSet& expected_out_axes) {
+  if (!expected_out_axes.defined()) return Expr();
   // TODO(tvm-team) allow same axes accumulation
   // not as important because it is less common in nn.
-  CHECK(!sargs[0]->axes.defined());
-  CHECK(!sargs[1]->axes.defined());
+  const auto* slhs = new_args[0].as<ScaledExprNode>();
+  const auto* srhs = new_args[1].as<ScaledExprNode>();
+  CHECK(!slhs && !srhs);
+
   const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
   const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
-
-  Expr lhs = sargs[0]->value;
-  Expr rhs = sargs[1]->value;
-  auto rnode = make_node<STupleNode>();
+  Expr lhs = new_args[0];
+  Expr rhs = new_args[1];
+  auto rnode = make_node<ScaledExprNode>();
   if (MatchBroadcastToLeftAxes(tlhs, trhs, expected_out_axes, &rhs)) {
     rnode->value = lhs;
     rnode->scale = rhs;
@@ -463,11 +363,11 @@ STuple MultiplyForwardTransform(const Call& ref_call,
     rnode->scale = lhs;
     rnode->axes = expected_out_axes;
   }
-  return STuple(rnode);
+  return Expr(rnode);
 }
 
 RELAY_REGISTER_OP("multiply")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", MultiplyForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", MultiplyForwardRewrite);
 
 // Consumer operators
 // Conv2D send out requirement of axis folding.
@@ -500,13 +400,14 @@ Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
 }
 
 // Conv2D consumes the scale axis during transformation.
-STuple Conv2DForwardTransform(const Call& ref_call,
-                              const AxesSet& expected_axes,
-                              const Array<STuple>& sargs) {
+Expr Conv2DForwardRewrite(const Call& ref_call,
+                          const Array<Expr>& new_args,
+                          const AxesSet& expected_axes) {
   // if data do not have scale, normal transform path.
-  STuple sdata = sargs[0];
-  if (!sdata->scale.defined()) return STuple();
-  CHECK(sdata->axes.defined());
+  const auto* sdata = new_args[0].as<ScaledExprNode>();
+  const auto* sweight = new_args[1].as<ScaledExprNode>();
+  if (sdata == nullptr) return Expr();
+  if (sweight != nullptr) return Expr();
   const auto* param = ref_call->attrs.as<Conv2DAttrs>();
   CHECK(param != nullptr);
   Layout data_layout(param->data_layout);
@@ -524,7 +425,8 @@ STuple Conv2DForwardTransform(const Call& ref_call,
   // Check it must be depthwise or full conv2d.
   bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, weight_layout);
   CHECK(param->groups == 1 || is_depthwise_conv2d);
-  Expr weight = sargs[1]->value;
+
+  Expr weight = new_args[1];
 
   // match the ic_axis
   if (is_depthwise_conv2d) {
@@ -537,21 +439,30 @@ STuple Conv2DForwardTransform(const Call& ref_call,
     weight = Multiply(weight, scale);
   }
   // return transformed conv2d
-  auto rnode = make_node<STupleNode>();
-  rnode->value = CallNode::make(
+  return CallNode::make(
       ref_call->op, {sdata->value, weight}, ref_call->attrs, ref_call->type_args);
-  return STuple(rnode);
 }
 
 RELAY_REGISTER_OP("nn.conv2d")
 .set_attr<FForwardPrep>("FScaleAxisForwardPrep", Conv2DForwardPrep);
 
 RELAY_REGISTER_OP("nn.conv2d")
-.set_attr<FForwardTransform>("FScaleAxisForwardTransform", Conv2DForwardTransform);
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", Conv2DForwardRewrite);
 
 
 Expr ForwardFoldScaleAxis(Expr data) {
-  return ForwardTransformer().Fold(data);
+  auto expected_scale_axes =
+      ForwardPrep().Prepare(data);
+  auto fcontext = [&](const Call& call) -> NodeRef{
+    auto it = expected_scale_axes.find(call.get());
+    if (it != expected_scale_axes.end()) {
+      return it->second;
+    } else {
+      return NodeRef(nullptr);
+    }
+  };
+  return ForwardRewrite(
+      data, "FScaleAxisForwardRewrite", fcontext);
 }
 
 // Expose the FoldScaleAxisFoward
@@ -602,7 +513,7 @@ class BackwardPrep : private ExprVisitor {
     ExprVisitor::VisitExpr_(call);
     static const auto& fprep =
         Op::GetAttr<FBackwardPrep>("FScaleAxisBackwardPrep");
-    auto f = GetFunc(fprep, call->op);
+    auto f = fprep.get(call->op, nullptr);
     if (f == nullptr) return;
     auto rit = ref_counter_.find(call);
     CHECK(rit != ref_counter_.end());
@@ -705,7 +616,7 @@ Expr BackwardTransformerNode::Transform(
     const CallNode* call_node, AxesSet axes, Expr scale) {
   static const auto& ftransform =
       Op::GetAttr<FBackwardTransform>("FScaleAxisBackwardTransform");
-  auto f = GetFunc(ftransform, call_node->op);
+  auto f = ftransform.get(call_node->op, nullptr);
   if (f != nullptr) {
     return f(GetRef<Call>(call_node),
              axes,
diff --git a/src/relay/pass/forward_rewrite.cc b/src/relay/pass/forward_rewrite.cc
new file mode 100644
index 000000000000..9c1e35782e92
--- /dev/null
+++ b/src/relay/pass/forward_rewrite.cc
@@ -0,0 +1,132 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file forward_rewrite.cc
+ * \brief Apply rewriting rules in a forward fashion.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+
+namespace tvm {
+namespace relay {
+
+// Realizer class that realizes the expression
+// Note that we can take benefit of its internal memo
+// so that calling realize repeatively won't hurt perf.
+class TempRealizer : private ExprMutator {
+ public:
+  Expr Realize(Expr expr) {
+    return VisitExpr(expr);
+  }
+
+ private:
+  Expr VisitExpr(const Expr& expr) final {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) {
+      return it->second;
+    } else {
+      Expr res;
+      if (const auto* temp = expr.as_derived<TempExprNode>()) {
+        res = temp->Realize();
+
+      } else {
+        res = ExprFunctor::VisitExpr(expr);
+      }
+      memo_[res] = res;
+      return res;
+    }
+  }
+};
+
+class ForwardRewriter : private ExprMutator {
+ public:
+  ForwardRewriter(const OpMap<FForwardRewrite>& rewrite_map,
+                  std::function<NodeRef(const Call&)> fcontext)
+      : rewrite_map_(rewrite_map),
+        fcontext_(fcontext) {
+  }
+
+  // Transform expression.
+  Expr Rewrite(Expr expr) {
+    return this->VisitExpr(expr);
+  }
+
+ private:
+  // The rewrite rule.
+  const OpMap<FForwardRewrite>& rewrite_map_;
+  // The context.
+  std::function<NodeRef(const Call&)> fcontext_{nullptr};
+  // internal realizer
+  TempRealizer realizer_;
+
+  Expr VisitExpr(const Expr& expr) final {
+    // by default always realize.
+    return realizer_.Realize(ExprMutator::VisitExpr(expr));
+  }
+
+  // Visit and allow non-realized version.
+  Expr GetTempExpr(const Expr& expr)  {
+    return ExprMutator::VisitExpr(expr);
+  }
+
+  // Automatic fold TupleGetItem.
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    Expr tuple = this->GetTempExpr(op->tuple);
+    if (const auto* ptuple = tuple.as<TupleNode>()) {
+      return ptuple->fields[op->index];
+    } else {
+      if (tuple.same_as(op->tuple)) {
+        return GetRef<Expr>(op);
+      } else {
+        return TupleGetItemNode::make(tuple, op->index);
+      }
+    }
+  }
+
+  Expr VisitExpr_(const CallNode* call_node) final {
+    const Call& ref_call = GetRef<Call>(call_node);
+    PackedFunc frewrite = rewrite_map_.get(call_node->op, nullptr);
+
+    auto new_op = this->Mutate(call_node->op);
+    bool unchanged = call_node->op.same_as(new_op);
+
+    Array<Expr> call_args;
+    for (auto arg : call_node->args) {
+      Expr new_arg = this->GetTempExpr(arg);
+      if (frewrite == nullptr) {
+        new_arg = realizer_.Realize(new_arg);
+      }
+      unchanged &= new_arg.same_as(arg);
+      call_args.push_back(new_arg);
+    }
+    // try to rewrite.
+    if (frewrite != nullptr) {
+      Expr res = frewrite(
+          ref_call, call_args,
+          fcontext_ != nullptr ? fcontext_(ref_call) : NodeRef(nullptr));
+      if (res.defined()) return res;
+      // abort, use old rule
+      for (size_t i = 0; i < call_args.size(); ++i) {
+        Expr arg = call_args[i];
+        Expr new_arg = realizer_.Realize(arg);
+        if (!arg.same_as(new_arg)) {
+          call_args.Set(i, new_arg);
+          unchanged = false;
+        }
+      }
+    }
+    if (unchanged) return ref_call;
+    return CallNode::make(
+        new_op, call_args, call_node->attrs, call_node->type_args);
+  }
+};
+
+Expr ForwardRewrite(const Expr& expr,
+                    const std::string& rewrite_map_name,
+                    std::function<NodeRef(const Call&)> fcontext) {
+  auto rewrite_map = Op::GetAttr<FForwardRewrite>(rewrite_map_name);
+  return ForwardRewriter(rewrite_map, fcontext).Rewrite(expr);
+}
+}  // namespace relay
+}  // namespace tvm

From f6119e4c37ca8c08ca2e7f6321853e917c4de74e Mon Sep 17 00:00:00 2001
From: Rasterer <jzhebin@gmail.com>
Date: Sat, 17 Nov 2018 11:17:14 +0800
Subject: [PATCH 366/529] [TOPI] Improve performance for dilated convolution
 (#2107)

---
 topi/python/topi/arm_cpu/conv2d.py      | 50 ++++++++++++++++---------
 topi/python/topi/mali/conv2d.py         |  5 ++-
 topi/python/topi/nn/conv2d.py           | 49 ++++++++++++------------
 topi/python/topi/nn/depthwise_conv2d.py | 29 +++++++-------
 topi/python/topi/x86/conv2d.py          | 14 +++----
 5 files changed, 80 insertions(+), 67 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index cfd423b584cf..22c9d2368de3 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -113,11 +113,6 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, ou
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        dilation_args = (1, 1, dilation_h, dilation_w) if len(kernel.shape) == 4\
-                else (1, 1, dilation_h, dilation_w, 1)
-        kernel = dilate(kernel, dilation_args)
-
     if len(kernel.shape) == 4:
         pre_packed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
@@ -126,11 +121,13 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, ou
         CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
         CO = CO * VC
 
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-
-    OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
-    OW = (IW + pad_left + pad_right - KW) // WSTR + 1
+    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
     data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
 
     # ==================== define configuration space ====================
@@ -171,14 +168,22 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, ou
     VH = cfg["tile_oh"].size[-1]
     VW = cfg["tile_ow"].size[-1]
 
-    dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
     kvshape = (CO // VC, CI, KH, KW, VC)
     ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
     oshape = (N, CO, OH, OW)
 
-    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
-                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
-                           name='data_vec')
+    if dilation_h != 1 or dilation_w != 1:
+        # undilate input data
+        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw:
+                               data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h]
+                               [(w*VW+vw)*WSTR+kw*dilation_w],
+                               name='data_vec_undilated')
+    else:
+        dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
+                               data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
+                               name='data_vec')
 
     if pre_packed:
         kernel_vec = kernel
@@ -191,10 +196,16 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, ou
     kh = tvm.reduce_axis((0, KH), name='kh')
     kw = tvm.reduce_axis((0, KW), name='kw')
 
-    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
-        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
-                kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                axis=[ci, kh, kw]), name='conv')
+    if dilation_h != 1 or dilation_w != 1:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+            tvm.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
+                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
+                    axis=[ci, kh, kw]), name='conv')
+    else:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+            tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
+                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
+                    axis=[ci, kh, kw]), name='conv')
 
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
@@ -240,7 +251,10 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     # mark parallel
     s[last].parallel(co)
 
-    _, h, _, _, _, _ = s[data_vec].op.axis
+    if data_vec.op.name == 'data_vec_undilated':
+        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
+    else:
+        _, h, _, _, _, _ = s[data_vec].op.axis
     s[data_vec].parallel(h)
 
     if kernel_vec.op.name == 'kernel_vec':
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 1ad58038abb1..d7b1f939ef45 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -118,7 +118,10 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
         s[data_pad].compute_inline()
 
     # schedule data packing
-    _, h, w, ci, vh, vw = s[data_vec].op.axis
+    if isinstance(data_vec.op, tvm.tensor.ComputeOp) and data_vec.op.name == 'data_vec_undilated':
+        _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis
+    else:
+        _, h, w, ci, vh, vw = s[data_vec].op.axis
     tile_and_bind3d(s, data_vec, h, w, ci, 1)
     if vh.dom.extent.value < max_unroll:
         s[data_vec].unroll(vh)
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index d4b9393c19dd..a85d1268dbf8 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -6,7 +6,6 @@
 import numpy as np
 import tvm
 
-from .dilate import dilate
 from .pad import pad
 from .util import get_pad_tuple
 from ..util import simplify, const_matrix, get_const_tuple
@@ -128,17 +127,16 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        Filter = dilate(Filter, (1, 1, dilation_h, dilation_w))
-
     batch, in_channel, in_height, in_width = Input.shape
     num_filter, channel, kernel_h, kernel_w = Filter.shape
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     # compute graph
     pad_before = [0, 0, pad_top, pad_left]
     pad_after = [0, 0, pad_down, pad_right]
@@ -150,7 +148,8 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     return tvm.compute(
         (batch, out_channel, out_height, out_width),
         lambda nn, ff, yy, xx: tvm.sum(
-            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
+            temp[nn, rc, yy * stride_h + ry * dilation_h,
+                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
             Filter[ff, rc, ry, rx].astype(out_dtype),
             axis=[rc, ry, rx]), tag="conv2d_nchw")
 
@@ -195,17 +194,16 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
-
     in_height, in_width, in_channel, batch = Input.shape
     kernel_h, kernel_w, channel, num_filter = Filter.shape
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     pad_before = [pad_top, pad_left, 0, 0]
     pad_after = [pad_down, pad_right, 0, 0]
     PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
@@ -215,7 +213,8 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     Output = tvm.compute(
         (out_height, out_width, out_channel, batch),
         lambda yy, xx, ff, nn: tvm.sum(
-            PaddedInput[yy * stride_h + ry, xx * stride_w + rx, rc, nn].astype(out_dtype) *
+            PaddedInput[yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w,
+                        rc, nn].astype(out_dtype) *
             Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
         name="Conv2dOutput", tag="conv2d_hwcn")
     return Output
@@ -259,17 +258,16 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
-
     batch, in_height, in_width, in_channel = Input.shape
     kernel_h, kernel_w, channel, num_filter = Filter.shape
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     pad_before = [0, pad_top, pad_left, 0]
     pad_after = [0, pad_down, pad_right, 0]
     PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
@@ -279,7 +277,8 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
     Output = tvm.compute(
         (batch, out_height, out_width, out_channel),
         lambda nn, yy, xx, ff: tvm.sum(
-            PaddedInput[nn, yy * stride_h + ry, xx * stride_w + rx, rc].astype(out_dtype) *
+            PaddedInput[nn, yy * stride_h + ry * dilation_h,
+                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
             Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
         name="Conv2dOutput", tag="conv2d_nhwc")
     return Output
diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py
index b5f46b840c9c..ca24b08dd0bb 100644
--- a/topi/python/topi/nn/depthwise_conv2d.py
+++ b/topi/python/topi/nn/depthwise_conv2d.py
@@ -72,18 +72,17 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        Filter = dilate(Filter, (1, 1, dilation_h, dilation_w))
-
     batch, in_channel, in_height, in_width = Input.shape
     # shape of dilated kernel
     filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
 
+    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
+    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (filter_height, filter_width))
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = simplify(in_channel * channel_multiplier)
-    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
 
     # padding stage
     pad_before = [0, 0, pad_top, pad_left]
@@ -95,7 +94,8 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No
     Output = tvm.compute(
         (batch, out_channel, out_height, out_width),
         lambda b, c, i, j: tvm.sum(
-            (PaddedInput[b, c/channel_multiplier, i*stride_h+di, j*stride_w+dj].astype(out_dtype) *
+            (PaddedInput[b, c/channel_multiplier, i*stride_h+di*dilation_h,
+                         j*stride_w+dj*dilation_w].astype(out_dtype) *
              Filter[c/channel_multiplier, c%channel_multiplier, di, dj].astype(out_dtype)),
             axis=[di, dj]),
         name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
@@ -143,18 +143,17 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))
-
     batch, in_height, in_width, in_channel = Input.shape
     # shape of dilated kernel
     filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
 
+    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
+    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (filter_height, filter_width))
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = simplify(in_channel * channel_multiplier)
-    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
 
     # padding stage
     pad_before = [0, pad_top, pad_left, 0]
@@ -166,8 +165,8 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No
     Output = tvm.compute(
         (batch, out_height, out_width, out_channel),
         lambda b, i, j, c: tvm.sum(
-            (PaddedInput[b, i*stride_h + di, j*stride_w + dj, c/channel_multiplier].astype(
-                out_dtype) *
+            (PaddedInput[b, i*stride_h + di*dilation_h, j*stride_w + dj*dilation_w,
+                         c/channel_multiplier].astype(out_dtype) *
              Filter[di, dj, c/channel_multiplier, c%channel_multiplier].astype(out_dtype)),
             axis=[di, dj]),
         name='DepthwiseConv2d', tag="depthwise_conv2d_nhwc")
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 7e0b90f1db9b..22d842cd49c7 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -9,7 +9,6 @@
 from ..util import get_const_tuple
 from ..nn.conv2d import conv2d, conv2d_NCHWc, \
     conv2d_alter_layout, _get_workload as _get_conv2d_workload
-from ..nn.dilate import dilate
 from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
 from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
 from ..nn.pad import pad
@@ -89,9 +88,6 @@ def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout
     else:
         dilation_h, dilation_w = dilation
 
-    if dilation_h != 1 or dilation_w != 1:
-        kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
-
     HPAD, WPAD = padding
     HSTR, WSTR = strides
 
@@ -101,8 +97,10 @@ def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout
     pad_height = in_height + 2 * HPAD
     pad_width = in_width + 2 * WPAD
 
-    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
-    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
+    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
+    out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
     # pack data
     DOPAD = (HPAD != 0 or WPAD != 0)
@@ -136,8 +134,8 @@ def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout
     kw = tvm.reduce_axis((0, kernel_width), name='kw')
 
     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh, ic%ic_bn,
-                                        ow*WSTR+kw].astype(out_dtype) *
+                       tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh*dilation_h, ic%ic_bn,
+                                        ow*WSTR+kw*dilation_w].astype(out_dtype) *
                                kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn,
                                           oc_block].astype(out_dtype),
                                axis=[ic, kh, kw]), name='conv')

From 55ee7c60112a91949014270e03775dd4a30ee328 Mon Sep 17 00:00:00 2001
From: hlu1 <14827759+hlu1@users.noreply.github.com>
Date: Sun, 18 Nov 2018 08:53:49 -0800
Subject: [PATCH 367/529] [nnvm] Add caffe2 frontend (#1981)

---
 nnvm/python/nnvm/frontend/__init__.py         |   1 +
 nnvm/python/nnvm/frontend/caffe2.py           | 458 ++++++++++++++++++
 nnvm/python/nnvm/frontend/onnx.py             |  75 +--
 .../python/nnvm/frontend/onnx_caffe2_utils.py |  46 ++
 .../frontend/caffe2/model_zoo/__init__.py     |  18 +
 .../frontend/caffe2/model_zoo/squeezenet.py   | 118 +++++
 .../python/frontend/caffe2/test_forward.py    |  93 ++++
 .../python/frontend/caffe2/test_graph.py      |  24 +
 8 files changed, 774 insertions(+), 59 deletions(-)
 create mode 100755 nnvm/python/nnvm/frontend/caffe2.py
 create mode 100644 nnvm/python/nnvm/frontend/onnx_caffe2_utils.py
 create mode 100644 nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py
 create mode 100644 nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py
 create mode 100644 nnvm/tests/python/frontend/caffe2/test_forward.py
 create mode 100755 nnvm/tests/python/frontend/caffe2/test_graph.py

diff --git a/nnvm/python/nnvm/frontend/__init__.py b/nnvm/python/nnvm/frontend/__init__.py
index 80f66c0d35e3..49f53df1174f 100644
--- a/nnvm/python/nnvm/frontend/__init__.py
+++ b/nnvm/python/nnvm/frontend/__init__.py
@@ -6,3 +6,4 @@
 from .keras import from_keras
 from .darknet import from_darknet
 from .tensorflow import from_tensorflow
+from .caffe2 import from_caffe2
diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py
new file mode 100755
index 000000000000..2450af628a90
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/caffe2.py
@@ -0,0 +1,458 @@
+# pylint: disable=import-self, invalid-name, line-too-long, unused-argument
+"""Caffe2 frontend"""
+from __future__ import absolute_import as _abs
+import tvm
+from nnvm import symbol as _sym
+from nnvm.frontend.common import get_nnvm_op, Renamer, AttrConverter as AttrCvt
+from .onnx_caffe2_utils import dimension_picker, dimension_constraint, infer_channels, revert_caffe2_pad
+from . import onnx
+
+__all__ = ['from_caffe2']
+
+
+def _clean_up_pool_args(args):
+    """ A helper function to clean up common arguments in conv and pooling ops.
+    """
+    assert isinstance(args, dict)
+
+    if 'stride_h' in args and 'stride_w' in args:
+        assert 'stride' not in args and 'strides' not in args
+        args['strides'] = [args['stride_h'], args['stride_w']]
+        args.pop('stride_h')
+        args.pop('stride_w')
+    elif 'stride' in args:
+        args['strides'] = [args['stride'], args['stride']]
+        args.pop('stride')
+
+    # rename 'kernel', 'kernels', to 'kernel_shape'
+    if 'kernel_h' in args and 'kernel_w' in args:
+        assert 'kernel' not in args and 'kernels' not in args
+        args['kernel_shape'] = [args['kernel_h'], args['kernel_w']]
+        args.pop('kernel_h')
+        args.pop('kernel_w')
+    elif 'kernel' in args:
+        args['kernel_shape'] = [args['kernel'], args['kernel']]
+        args.pop('kernel')
+    elif 'kernels' in args:
+        args['kernel_shape'] = args['kernels']
+        args.pop('kernels')
+
+    if 'pad_t' in args and 'pad_l' in args and 'pad_b' in args and 'pad_r' in args:
+        assert 'pad' not in args and 'pads' not in args
+        args['pads'] = [
+            args['pad_t'], args['pad_l'], args['pad_b'], args['pad_r']
+        ]
+        for pad in ['pad_t', 'pad_l', 'pad_b', 'pad_r']:
+            args.pop(pad)
+    elif 'pad' in args:
+        args['pads'] = [args['pad'], args['pad']]
+        args.pop('pad')
+
+    if 'dilation_h' in args and 'dilation_w' in args:
+        assert 'dilation' not in args and 'dilations' not in args
+        args['dilations'] = [args['dilation_h'], args['dilation_w']]
+        args.pop('dilation_h')
+        args.pop('dilation_w')
+    elif 'dilation' in args:
+        args['dilations'] = [args['dilation'], args['dilation']]
+        args.pop('dilation')
+
+    return args
+
+
+class Caffe2OpConverter(object):
+    """ A helper class for holding Caffe2 op converters.
+    """
+
+    @classmethod
+    def get_converter(cls):
+        """ Get converter.
+
+        :return: converter, which should be `_impl`.
+        """
+
+        if hasattr(cls, '_impl'):
+            return getattr(cls, '_impl')
+        else:
+            raise NotImplementedError('{} not implemented'.format(
+                cls.__name__))
+
+
+_caffe2_internal_args = {
+    # nnpack args
+    'algo',
+    'convolution_transform_strategy',
+    'float16_compute',
+    'shared_buffer',
+
+    # training args
+    'init_params',
+    'cudnn_exhaustive_search',
+    'exhaustive_search',
+
+    # training args
+    'adj',
+    'hwgq',
+
+    # args that we don't care
+    'legacy_pad',
+}
+
+
+class Pool(Caffe2OpConverter):
+    """ A helper class for pool op converters.
+    """
+
+    name = ''
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        _clean_up_pool_args(args)
+        if 'global_pooling' in args and args['global_pooling'] == 1:
+            op_name = dimension_picker('global_' + cls.name)
+            return get_nnvm_op(op_name(args))(*inputs)
+
+        return AttrCvt(
+            op_name=dimension_picker(cls.name),
+            transforms={
+                'kernel_shape': 'pool_size',
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
+                'strides': 'strides',
+            },
+            excludes={
+                # TVM poolop does not support dilation
+                'dilations',
+            },
+            ignores=_caffe2_internal_args | {'global_pooling', 'order'},
+            custom_check=dimension_constraint())(inputs, args, params)
+
+
+class AveragePool(Pool):
+    name = 'avg_pool'
+
+
+class MaxPool(Pool):
+    name = 'max_pool'
+
+
+class Conv(Caffe2OpConverter):
+    """ Operator converter for Conv.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        # get number of channels
+        channels = infer_channels(inputs[1], params)
+        args['channels'] = channels
+        _clean_up_pool_args(args)
+        return AttrCvt(
+            op_name=dimension_picker('conv'),
+            transforms={
+                'group': ('groups', 1),
+                'kernel_shape':
+                'kernel_size',
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
+                'strides':
+                'strides',
+                'dilations': ('dilation', (1, 1)),
+                'order':
+                ('layout', ("NCHW"),
+                 lambda x: x if isinstance(x, str) else x.decode('UTF-8')),
+            },
+            excludes={},
+            ignores=_caffe2_internal_args,
+            extras={'use_bias': len(inputs) == 3},
+            custom_check=dimension_constraint())(inputs, args, params)
+
+
+class Concat(Caffe2OpConverter):
+    """ Operator converter for Concat.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        def _get_axis_from_order_str(order):
+            order = order if isinstance(order, str) else order.decode('UTF-8')
+            if order == 'NCHW':
+                return 1
+            elif order == 'NHWC':
+                return 3
+            else:
+                raise RuntimeError(
+                    "Unsupported storage order: {} in caffe2".format(order))
+
+        return AttrCvt(
+            op_name='concatenate',
+            transforms={
+                'order': ('axis', (1), _get_axis_from_order_str),
+            },
+            excludes={
+                'add_axis',
+            })(inputs, args, params)
+
+
+class NormalizePlanarYUV(Caffe2OpConverter):
+    """ Operator converter for NormalizePlanarYUV.
+    caffe2 definition: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/norm_planar_yuv_op.cc
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        assert len(inputs) == 3
+        mean = _sym.expand_dims(inputs[1], axis=2, num_newaxis=2)
+        std = _sym.expand_dims(inputs[2], axis=2, num_newaxis=2)
+
+        return _sym.broadcast_div(_sym.broadcast_sub(inputs[0], mean), std)
+
+
+class ResizeNearest(Caffe2OpConverter):
+    """ Operator converter for Upsample (nearest mode).
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        width_scale = args['width_scale'] if 'width_scale' in args else 1
+        height_scale = args['height_scale'] if 'height_scale' in args else 1
+        assert width_scale == height_scale
+
+        return _sym.upsampling(
+            inputs[0], scale=int(width_scale), method="NEAREST_NEIGHBOR")
+
+
+class FC(Caffe2OpConverter):
+    """ Operator converter for FC.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        inputs[0] = _sym.flatten(inputs[0])
+        args['units'] = infer_channels(inputs[1], params)
+        return AttrCvt(
+            'dense',
+            ignores=['axis', 'axis_w'],
+            extras={'use_bias': len(inputs) == 3},
+        )(inputs, args, params)
+
+
+class SpatialBN(Caffe2OpConverter):
+    """ Operator converter for SpatialBN.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        return AttrCvt(
+            op_name='batch_norm',
+            disables=['momentum'],
+            ignores=[
+                'order', 'spatial', 'is_test', 'consumed_inputs', 'num_batches'
+            ])(inputs, args, params)
+
+
+# compatible operators that do NOT require any conversion.
+_identity_list = []
+
+# _convert_map defines maps of name to converter functor(callable)
+# for 1 to 1 mapping, use Renamer if nothing but name is different
+# use AttrCvt if attributes need to be converted
+# for 1 to N mapping(composed), use custom callable functions
+# for N to 1 mapping, currently not supported(?)
+
+# Minimal set of ops for squeezenet and resnet50
+def _get_convert_map():
+    return {
+        # caffe2/onnx common operators
+        'Add': onnx.Add.get_converter(opset=1),
+        'Sum': onnx.Sum.get_converter(opset=1),
+        'Softmax': onnx.Softmax.get_converter(opset=1),
+
+        # nn
+        'AveragePool': AveragePool.get_converter(),
+        'MaxPool': MaxPool.get_converter(),
+        'Conv': Conv.get_converter(),
+        'Concat': Concat.get_converter(),
+        'FC': FC.get_converter(),
+        'SpatialBN': SpatialBN.get_converter(),
+        'ResizeNearest': ResizeNearest.get_converter(),
+        'Relu': AttrCvt('relu', {}, ignores=['order']),
+        'Sigmoid': Renamer('sigmoid'),
+        'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']),
+
+        # c2 image preprocessing ops
+        'NormalizePlanarYUV': NormalizePlanarYUV.get_converter(),
+    }
+
+
+class Caffe2NetDef(object):
+    """A helper class for handling nnvm graph copying from pb2.GraphProto.
+    Definition: https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto
+    """
+
+    def __init__(self):
+        self._nodes = {}
+        self._params = {}
+        self._visited_nodes = set()
+        self._ops = {}
+
+    def from_caffe2(self, init_net, predict_net):
+        """Construct nnvm nodes from caffe2 graph.
+
+        Parameters
+        ----------
+        workspace : Caffe2 workspace
+        predict_net : protobuf object
+
+        Returns
+        -------
+        sym : nnvm.sym.Symbol
+            The returned nnvm symbol
+        params : dict
+            A dict of name: tvm.nd.array pairs, used as pretrained weights
+        """
+        from caffe2.python import workspace
+        workspace.RunNetOnce(init_net)
+
+        # Input
+        input_name = predict_net.op[0].input[0]
+
+        # Params
+        self._params = {}
+        used_blobs = set()
+        for c2_op in predict_net.op:
+            for i in c2_op.input:
+                used_blobs.add(i)
+        for blob in workspace.Blobs():
+            if blob in used_blobs and blob != input_name:
+                self._params[blob] = tvm.nd.array(workspace.FetchBlob(blob))
+
+        # Variables
+        self._nodes = {}
+        for blob in predict_net.external_input:
+            self._nodes[blob] = _sym.Variable(name=blob)
+
+        # Ops
+        for c2_op in predict_net.op:
+            for blob in c2_op.output:
+                self._ops[blob] = c2_op
+        for c2_op in predict_net.op:
+            self._process_op(c2_op)
+
+        # Outputs
+        out = []
+        for blob in predict_net.external_output:
+            out.append(self._nodes[blob])
+
+        if len(out) > 1:
+            sym = _sym.Group(out)
+        else:
+            sym = out[0]
+
+        return sym, self._params
+
+    def _get_node(self, blob):
+        """Get the nnvm Symbol of blob and detect cyclic dependency in the graph."""
+        if blob in self._nodes:
+            return self._nodes[blob]
+
+        assert blob not in self._visited_nodes, 'Cyclic dependency in the graph (in {})'.format(
+            blob)
+        self._visited_nodes.add(blob)
+
+        self._process_op(self._ops[blob])
+        return self._nodes[blob]
+
+    def _process_op(self, c2_op):
+        op_type = c2_op.type
+        args = self._parse_arg(c2_op.arg)
+        inputs = [self._get_node(i) for i in c2_op.input]
+        tvm_op = self._convert_operator(op_type, inputs, args)
+        # Ignore all outputs except the first one
+        self._nodes[c2_op.output[0]] = tvm_op[0]
+
+    def _parse_arg(self, arg):
+        """Convert a list of Argument to a dict, with names as keys."""
+        args = {}
+        for a in arg:
+            for f in ['f', 'i', 's']:
+                if a.HasField(f):
+                    args[a.name] = getattr(a, f)
+            for f in ['floats', 'ints', 'strings']:
+                if list(getattr(a, f)):
+                    assert a.name not in args, "Only one type of attr is allowed"
+                    args[a.name] = tuple(getattr(a, f))
+            for f in ['n']:
+                if a.HasField(f):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            for f in ['nets']:
+                if list(getattr(a, f)):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            if a.name not in args:
+                raise ValueError("Cannot parse attribute: \n{}\n.".format(a))
+        return args
+
+    def _convert_operator(self,
+                          op_type,
+                          inputs,
+                          args,
+                          identity_list=None,
+                          convert_map=None):
+        """Convert from Caffe2 operator to nnvm operator.
+        The converter must specify conversions explicity for incompatible name, and
+        apply handlers to operator attributes.
+
+        Parameters
+        ----------
+        op_type : str
+            Operator name, such as Convolution, FullyConnected
+        inputs : list of nnvm.Symbol
+            List of input symbols.
+        args : dict
+            Dict of operator attributes
+        identity_list : list
+            List of operators that don't require conversion
+        convert_map : dict
+            Dict of name : callable, where name is the op's name that
+            require conversion to nnvm, callable are functions which
+            take args and return (new_op_type, new_args)
+
+        Returns
+        -------
+        sym : nnvm.Symbol
+            Converted nnvm Symbol
+        """
+        identity_list = identity_list if identity_list else _identity_list
+        convert_map = convert_map if convert_map else _get_convert_map()
+        if op_type in identity_list:
+            sym = get_nnvm_op(op_type)(*inputs, **args)
+        elif op_type in convert_map:
+            # Add a sanitizing step to convert all byte strings in args to strings
+            sym = convert_map[op_type](inputs, args, self._params)
+        else:
+            raise NotImplementedError(
+                "Operator {} not implemented.".format(op_type))
+        return sym
+
+
+def from_caffe2(init_net, predict_net):
+    """Load caffe2 graph which contains init_net and predict_net into nnvm graph.
+
+    Parameters
+    ----------
+    init_net : protobuf object
+        Caffe2 NetDef containing the weights
+
+    predict_net : protobuf object
+        Caffe2 NetDef containing the graph
+
+    Returns
+    -------
+    sym : nnvm.Symbol
+        Compatible nnvm symbol
+
+    params : dict of str to tvm.ndarray
+        Dict of converted parameters stored in tvm.ndarray format
+    """
+
+    caffe2 = Caffe2NetDef()
+    return caffe2.from_caffe2(init_net, predict_net)
diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 097909de1a8d..92033a31da60 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -4,9 +4,9 @@
 import numpy as np
 import tvm
 from .. import symbol as _sym
-from .. import graph as _graph
-from ..compiler import graph_util
 from .common import get_nnvm_op, Renamer, SymbolTable, AttrConverter as AttrCvt
+from .onnx_caffe2_utils import dimension_picker, dimension_constraint, \
+    infer_channels, revert_caffe2_pad
 
 __all__ = ['from_onnx']
 
@@ -74,16 +74,16 @@ class Pool(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         return AttrCvt(
-            op_name=_dimension_picker(cls.name),
+            op_name=dimension_picker(cls.name),
             transforms={
                 'kernel_shape': 'pool_size',
-                'pads': ('padding', (0, 0), _revert_caffe2_pad)
+                'pads': ('padding', (0, 0), revert_caffe2_pad)
             },
             # very weird attributes here in onnx, force check
             ignores=['dilations'],
             # TODO(zhreshold): make sure ceil_mode in onnx, and layout?
             extras={'ceil_mode': False},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class Absolute(OnnxOpConverter):
@@ -118,18 +118,18 @@ class Conv(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params)
+        channels = infer_channels(inputs[1], params)
         attr['channels'] = channels
         return AttrCvt(
-            op_name=_dimension_picker('conv'),
+            op_name=dimension_picker('conv'),
             transforms={
                 'kernel_shape': 'kernel_size',
                 'dilations': ('dilation', (0, 0)),
-                'pads': ('padding', (0, 0), _revert_caffe2_pad),
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
                 'group': ('groups', 1)
             },
             extras={'use_bias': len(inputs) == 3},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class ConvTranspose(OnnxOpConverter):
@@ -137,20 +137,20 @@ class ConvTranspose(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params, True)
+        channels = infer_channels(inputs[1], params, True)
         attr['channels'] = channels
         groups = attr.pop('group')
         attr['groups'] = groups
         return AttrCvt(
-            op_name=_dimension_picker('conv', '_transpose'),
+            op_name=dimension_picker('conv', '_transpose'),
             transforms={
                 'kernel_shape': 'kernel_size',
                 'dilations': ('dilation', (0, 0)),
-                'pads': ('padding', (0, 0), _revert_caffe2_pad)
+                'pads': ('padding', (0, 0), revert_caffe2_pad)
             },
             disables=['output_shape'],
             extras={'use_bias': len(inputs) == 3},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class Div(Elemwise):
@@ -180,7 +180,7 @@ def _impl_v1(cls, inputs, attr, params):
         transA = int(attr.get('transA', 0))
         transB = int(attr.get('transB', 0))
         # get number of channels
-        channels = _infer_channels(inputs[1], params, not transB)
+        channels = infer_channels(inputs[1], params, not transB)
         if transA:
             inputs[0] = _sym.transpose(inputs[0], axes=(1, 0))
         if not transB:
@@ -254,7 +254,7 @@ class Prelu(OnnxOpConverter):
     def _impl_v1(cls, inputs, attr, params):
         assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(
             len(inputs))
-        channels = _infer_channels(inputs[1], params, False)
+        channels = infer_channels(inputs[1], params, False)
         if channels == 1:
             return inputs[0] * inputs[1]
         return _sym.broadcast_mul(inputs[0], inputs[1])
@@ -362,17 +362,6 @@ def _impl_v1(cls, inputs, attr, params):
         return ret
 
 
-def _revert_caffe2_pad(attr):
-    """Caffe2 require two times the normal padding."""
-    if len(attr) == 4:
-        attr = attr[:2]
-    elif len(attr) == 2:
-        pass
-    else:
-        raise ValueError("Invalid caffe2 type padding: {}".format(attr))
-    return attr
-
-
 def _broadcast_constraint():
 
     def _broadcast_check(attrs):
@@ -383,43 +372,11 @@ def _broadcast_check(attrs):
     return _broadcast_check, "Specifying broadcast axis not allowed."
 
 
-def _dimension_picker(prefix, surfix=''):
-
-    def _impl(attr):
-        kernel = attr['kernel_shape']
-        if len(kernel) == 2:
-            return prefix + '2d' + surfix
-        raise NotImplementedError("Only 2d kernel supported.")
-
-    return _impl
-
-
-def _dimension_constraint():
-
-    def _dim_check(attrs):
-        if len(attrs['kernel_shape']) == 2:
-            return True
-        return False
-
-    return _dim_check, "Only 2d kernel supported."
-
-
-def _infer_channels(inputs, params, transpose=False):
-    """A hack for getting 'channles' or 'units' since onnx don't provide
-    these attributes. We check the shape of weights provided to get the number.
-    """
-    g = _graph.create(inputs)
-    shape_dict = {k: v.shape for k, v in params.items()}
-    _, out_shapes = graph_util.infer_shape(g, **shape_dict)
-    channels = out_shapes[0][0] if not transpose else out_shapes[0][1]
-    return channels
-
-
 def _fully_connected(opset):
 
     def _impl(inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params)
+        channels = infer_channels(inputs[1], params)
         attr['units'] = channels
         return AttrCvt('dense', ignores=['axis', 'axis_w'])(inputs, attr)
 
diff --git a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py
new file mode 100644
index 000000000000..4dfc366d0b6f
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py
@@ -0,0 +1,46 @@
+"""Util functions shared by the ONNX and Caffe2 frontends."""
+from __future__ import absolute_import as _abs
+from nnvm import graph as _graph
+from nnvm.compiler import graph_util
+
+
+def dimension_picker(prefix, surfix=''):
+    def _impl(attr):
+        kernel = attr['kernel_shape']
+        if len(kernel) == 2:
+            return prefix + '2d' + surfix
+        else:
+            raise NotImplementedError("Only 2d kernel supported.")
+
+    return _impl
+
+
+def dimension_constraint():
+    def _dim_check(attrs):
+        if len(attrs['kernel_shape']) == 2:
+            return True
+        return False
+
+    return _dim_check, "Only 2d kernel supported."
+
+
+def infer_channels(inputs, params, transpose=False):
+    """A hack for getting 'channels' or 'units' since caffe2 don't provide
+    these attributes. We check the shape of weights provided to get the number.
+    """
+    g = _graph.create(inputs)
+    shape_dict = {k: v.shape for k, v in params.items()}
+    _, out_shapes = graph_util.infer_shape(g, **shape_dict)
+    channels = out_shapes[0][0] if not transpose else out_shapes[0][1]
+    return channels
+
+
+def revert_caffe2_pad(pads):
+    """Caffe2 require two times the normal padding."""
+    if len(pads) == 4:
+        pads = pads[:2]
+    elif len(pads) == 2:
+        pass
+    else:
+        raise ValueError("Invalid caffe2 type padding: {}".format(pads))
+    return pads
diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py
new file mode 100644
index 000000000000..302177e75288
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py
@@ -0,0 +1,18 @@
+"""Store for caffe2 examples and common models."""
+from __future__ import absolute_import as _abs
+import os
+import importlib
+
+models = [
+    'squeezenet',
+    'resnet50',
+    'vgg19',
+]
+
+# skip download if model exist
+for model in models:
+    try:
+        locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model)
+    except ImportError:
+        os.system("python -m caffe2.python.models.download -i -f " + model)
+        locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model)
diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..2de2d1075494
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from nnvm import symbol as sym
+from nnvm.testing.utils import create_workload
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = sym.concatenate(left, right, axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                     padding=(padding, padding))
+    net = sym.relu(net)
+    return net
+
+# Net
+def get_symbol(num_classes, version, **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version == '1.1', ("Unsupported SqueezeNet version {version}:"
+                              "1.1 expected".format(version=version))
+    net = sym.Variable("data")
+
+    net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2))
+    net = sym.relu(net)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 16, 64, 64)
+    net = _make_fire(net, 16, 64, 64)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 32, 128, 128)
+    net = _make_fire(net, 32, 128, 128)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 64, 256, 256)
+    net = _make_fire(net, 64, 256, 256)
+
+    net = sym.dropout(net, rate=0.5)
+    net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1))
+    net = sym.relu(net)
+    net = sym.global_avg_pool2d(net)
+    return sym.softmax(net, axis=1)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.0',
+                 image_shape=(3, 224, 224), dtype="float32", **kwargs):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, version=version, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/tests/python/frontend/caffe2/test_forward.py b/nnvm/tests/python/frontend/caffe2/test_forward.py
new file mode 100644
index 000000000000..68a1ab7eda2b
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/test_forward.py
@@ -0,0 +1,93 @@
+import numpy as np
+import nnvm
+import tvm
+from tvm.contrib import graph_runtime
+from nnvm.testing.config import ctx_list
+from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19
+
+from caffe2.python import workspace
+
+
+def get_tvm_output(model,
+                   input_data,
+                   target,
+                   ctx,
+                   output_shape,
+                   output_dtype='float32'):
+    """ Generic function to execute and get tvm output"""
+    sym, params = nnvm.frontend.from_caffe2(model.init_net, model.predict_net)
+
+    # supporting multiple inputs in caffe2 in a bit tricky,
+    # because the input names can appear at the beginning or end of model.predict_net.external_input
+    assert isinstance(input_data, np.ndarray)
+
+    # here we use the first input blob to the first op to get the input name
+    input_names = model.predict_net.op[0].input[0]
+    shape_dict = {input_names: input_data.shape}
+    dtype_dict = {input_names: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(
+        sym, target, shape=shape_dict, dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)
+    m = graph_runtime.create(graph, lib, ctx)
+
+    # set inputs
+    m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype)))
+    m.set_input(**params)
+
+    # execute
+    m.run()
+
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape),
+                                                  output_dtype))
+        return tvm_output.asnumpy()
+
+
+def get_caffe2_output(model, x, dtype='float32'):
+    workspace.RunNetOnce(model.init_net)
+
+    input_blob = model.predict_net.op[0].input[0]
+    workspace.FeedBlob(input_blob, x.astype(dtype))
+    workspace.RunNetOnce(model.predict_net)
+
+    output_blob = model.predict_net.external_output[0]
+    c2_output = workspace.FetchBlob(output_blob)
+    return c2_output
+
+
+def verify_caffe2_forward_impl(model, data_shape, out_shape):
+    dtype = 'float32'
+    data = np.random.uniform(size=data_shape).astype(dtype)
+    c2_out = get_caffe2_output(model, data, dtype)
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype)
+        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+
+def verify_squeezenet1_1():
+    verify_caffe2_forward_impl(c2_squeezenet, (1, 3, 224, 224),
+                               (1, 1000, 1, 1))
+
+
+def verify_resnet50():
+    verify_caffe2_forward_impl(c2_resnet50, (1, 3, 224, 224),
+                               (1, 1000))
+
+
+def verify_vgg19():
+    verify_caffe2_forward_impl(c2_vgg19, (1, 3, 224, 224), (1, 1000))
+
+
+if __name__ == '__main__':
+    verify_squeezenet1_1()
+    verify_resnet50()
+    verify_vgg19()
diff --git a/nnvm/tests/python/frontend/caffe2/test_graph.py b/nnvm/tests/python/frontend/caffe2/test_graph.py
new file mode 100755
index 000000000000..425fc9a6201d
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/test_graph.py
@@ -0,0 +1,24 @@
+"""Test graph equality of caffe2 models."""
+import nnvm
+from nnvm.compiler import graph_util, graph_attr
+from model_zoo import c2_squeezenet, squeezenet
+
+def compare_graph(init, predict, nnvm_sym, ishape):
+    caffe2_sym, params = nnvm.frontend.from_caffe2(init, predict)
+    g1 = nnvm.graph.create(caffe2_sym)
+    g2 = nnvm.graph.create(nnvm_sym)
+    input_name = predict.external_input[0]
+    ishapes = {input_name: ishape}
+    graph_attr.set_shape_inputs(g1, ishapes)
+    graph_attr.set_shape_inputs(g2, ishapes)
+    g1 = g1.apply("InferShape").apply("SimplifyInference")
+    g2 = g2.apply("InferShape").apply("SimplifyInference")
+    graph_util.check_graph_equal(g1, g2)
+
+def test_squeeze_net():
+    symbol, params = squeezenet.get_workload(version='1.1')
+    compare_graph(c2_squeezenet.init_net, c2_squeezenet.predict_net, symbol, ishape=(1, 3, 224, 224))
+
+
+if __name__ == '__main__':
+    test_squeeze_net()

From 03c78fa9b369f47329d535d45233e701f1d416ff Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Sun, 18 Nov 2018 08:55:15 -0800
Subject: [PATCH 368/529] [Relay] compute & schedule for relu, softmax (#2127)

---
 python/tvm/relay/op/nn/_nn.py        | 14 ++++++++++++++
 src/relay/op/nn/nn.cc                | 20 ++++++++++++++++++--
 tests/python/relay/test_op_level1.py | 16 ++++++++++++----
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 7bc26cdec9f9..8d53e27892bc 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -5,6 +5,20 @@
 from .. import op as reg
 from ..op import OpPattern, schedule_injective
 
+# relu
+reg.register_schedule("nn.relu", schedule_injective)
+reg.register_pattern("nn.relu", OpPattern.ELEMWISE)
+
+
+@reg.register_schedule("nn.softmax")
+def schedule_softmax(_, outputs, target):
+    """Schedule definition of softmax"""
+    with target:
+        return topi.generic.schedule_softmax(outputs)
+
+reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
+
+
 # dense
 @reg.register_compute("nn.dense")
 def compute_dense(attrs, inputs, out_type, target):
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index fb4c7304a5eb..16b65aeeab7f 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -7,6 +7,8 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/attrs/image.h>
+#include <topi/nn.h>
+#include <topi/nn/softmax.h>
 #include <vector>
 #include "../type_relations.h"
 #include "../op_common.h"
@@ -252,7 +254,15 @@ RELAY_REGISTER_OP("nn.softmax")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{ topi::nn::softmax(inputs[0], param->axis) };
+});
 
 
 TVM_REGISTER_API("relay.op.nn._make.log_softmax")
@@ -364,7 +374,13 @@ RELAY_REGISTER_OP("nn.relu")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  return Array<Tensor>{ topi::relu(inputs[0], 0.0f) };
+});
 
 
 // Positional relay function to create LRN operator used by frontend FFI.
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 88a7aba59389..53de7aa26279 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -3,6 +3,7 @@
 import numpy as np
 from tvm import relay
 from tvm.relay.testing import ctx_list
+import topi.testing
 
 def sigmoid(x):
     one = np.ones_like(x)
@@ -42,7 +43,7 @@ def check_single_op(opfunc, ref):
                    (tvm.relay.sqrt, np.sqrt),
                    (tvm.relay.sigmoid, sigmoid),
                    (tvm.relay.tanh, np.tanh),
-                   (relay.nn.relu, None)]: # Just add RELU here after registering.
+                   (relay.nn.relu, relu)]:
         check_single_op(opfunc, ref)
 
 
@@ -120,12 +121,19 @@ def test_expand_dims_infer_type():
 
 
 def test_softmax():
-    n, d = tvm.var("n"), tvm.var("d")
-    x = relay.var("x", shape=(n, d))
+    shape = (10, 4)
+    x = relay.var("x", shape=shape)
     y = relay.nn.softmax(x, axis=1)
     assert "nn.softmax" in y.astext()
     yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, d))
+    assert yy.checked_type == relay.TensorType(shape)
+    func = relay.Function([x], y)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_res = topi.testing.softmax_python(x_data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def test_log_softmax():

From 81ff1ef720eebf8b424179032d39836a70114942 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sun, 18 Nov 2018 14:16:37 -0500
Subject: [PATCH 369/529] [SCHEDULE] Fix boundary check (#2126)

* Fix boundary check

* Add unittest
---
 src/schedule/message_passing.cc                     |  5 +++--
 tests/python/unittest/test_schedule_schedule_ops.py | 10 ++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc
index 622e0b698902..6c185d6f8637 100644
--- a/src/schedule/message_passing.cc
+++ b/src/schedule/message_passing.cc
@@ -491,11 +491,12 @@ std::vector<Expr> MakeBoundCheck(
       IntSet s = EvalSet(value, iset_dmap);
       Expr vmin = s.min();
       Expr vmax = s.max();
-      if (vmin.type() != value.type() || !can_prove(vmin >= iv->dom->min)) {
+      // The range of `value` resides in [vmin, vmax]
+      if (vmin.type() != value.type() || !can_prove(vmin >= 0)) {
         preds.emplace_back(value >= 0);
       }
       if (vmax.type() != value.type() || !can_prove(vmax < iv->dom->extent)) {
-        preds.emplace_back(value < (iv->dom->extent - iv->dom->min));
+        preds.emplace_back(value < iv->dom->extent);
       }
     }
   }
diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py
index 8774514cfa17..e60073fe9f5c 100644
--- a/tests/python/unittest/test_schedule_schedule_ops.py
+++ b/tests/python/unittest/test_schedule_schedule_ops.py
@@ -12,6 +12,7 @@ def test_schedule0():
     assert isinstance(bounds, tvm.container.Map)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule1():
     m = tvm.var('m')
     l = tvm.var('l')
@@ -53,10 +54,13 @@ def test_schedule_scan():
     assert tuple(res.shape) == (m, n)
     s = tvm.create_schedule(res.op)
     s = s.normalize()
+    ir = tvm.lower(s, [s_state], simple_mode=True)
+    assert not hasattr(ir.body.body.body.body.rest.body.body.rest.body, "condition")
     bounds = tvm.schedule.InferBound(s)
     assert(bounds[res.op.scan_axis].min.value == 1)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_inline_multi_reduce():
     def argmax_comp(x, y):
         idx = tvm.select((x[1] >= y[1]), x[0], y[0])
@@ -80,7 +84,6 @@ def argmax_init(idx_typ, val_typ):
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
 
-
 def test_auto_inline():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -96,6 +99,7 @@ def test_auto_inline():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_const_bound():
     n = 128
     A = tvm.placeholder((n,), name='A')
@@ -146,6 +150,7 @@ def test_scan_inline1():
     s[s_x1].compute_inline()
     stmt = tvm.lower(s, [x, res1, res2])
 
+
 def test_scan_inline2():
     m = tvm.var("m")
     n = tvm.var("n")
@@ -183,6 +188,7 @@ def test_schedule_cache():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_middle_cache():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -202,7 +208,6 @@ def test_schedule_middle_cache():
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
 
-
 def test_schedule_cache_relayout1():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -249,6 +254,7 @@ def test_schedule_cache_relayout3():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_cache_relayout4():
     def _compute(*indice):
         return A(*indice) + 1, B(*indice) / 2

From 6fef53b0dffff552debd33ce0beb5944f13a6d29 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Sun, 18 Nov 2018 17:37:04 -0800
Subject: [PATCH 370/529] [TOPHUB] fix x86 backend after introducing dilation
 (#2129)

---
 python/tvm/autotvm/tophub.py   | 2 +-
 topi/python/topi/x86/conv2d.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 3e52ecb52b73..d90fd76b2532 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -21,7 +21,7 @@
 # the version of each package
 PACKAGE_VERSION = {
     'arm_cpu': "v0.04",
-    'llvm':    "v0.02",
+    'llvm':    "v0.03",
 
     'cuda':    "v0.04",
     'rocm':    "v0.02",
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 22d842cd49c7..e48a95780e7f 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -292,6 +292,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     out_channel = attrs.get_int("channels")
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
+    dilation = attrs.get_int_tuple("dilation")
     layout = attrs['layout']
     kh, kw = attrs.get_int_tuple("kernel_size")
 
@@ -309,10 +310,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     target = tvm.target.current_target()
     # query schedule and fallback if necessary
     workload = autotvm.task.args_to_workload(
-        [data, kernel, strides, padding, out_dtype], depthwise_conv2d_nchw) \
+        [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
         if is_depthwise else \
         autotvm.task.args_to_workload(
-            [data, kernel, strides, padding, layout, out_dtype], conv2d)
+            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
     cfg = dispatch_ctx.query(target, workload)
     if cfg.is_fallback:
         _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)
@@ -334,7 +335,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
         # Store altered operator's config
         new_kernel = tvm.placeholder((out_channel//oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
              new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
     else:
         out_channel, _, kh, kw = get_const_tuple(kernel.shape)
@@ -345,7 +346,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
         new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
                                      dtype=kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
              new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
 
     dispatch_ctx.update(target, new_workload, cfg)

From 49bd3b0dbdac47d3d7d7f1d2d2916e68027f9db2 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Sun, 18 Nov 2018 22:52:17 -0800
Subject: [PATCH 371/529] [HYBRID FRONTEND] Modify hybrid script to new
 interface; hybrid op supported; enable compilation_database in CMakeList.txt
 (#1757)

---
 CMakeLists.txt                              |   1 +
 docs/langref/hybrid_script.rst              |  22 +-
 include/tvm/operation.h                     |  63 +++++
 python/tvm/build_module.py                  |   5 -
 python/tvm/hybrid/__init__.py               |   3 +-
 python/tvm/hybrid/api.py                    |  49 ++--
 python/tvm/hybrid/intrin.py                 |  26 +-
 python/tvm/hybrid/parser.py                 | 238 ++++++++++-------
 python/tvm/hybrid/util.py                   |  26 +-
 python/tvm/hybrid/var_decl.py               |  24 +-
 python/tvm/tensor.py                        |   5 +
 src/api/api_lang.cc                         |  10 +
 src/op/hybrid_op.cc                         | 189 ++++++++++++++
 src/op/op_util.cc                           |  31 +++
 src/op/op_util.h                            |  12 +-
 tests/python/unittest/test_hybrid_script.py | 268 +++++++++++++-------
 16 files changed, 706 insertions(+), 266 deletions(-)
 create mode 100644 src/op/hybrid_op.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bfca8020c3c..98bbc5b650d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,6 +57,7 @@ include_directories("3rdparty/compiler-rt")
 # initial variables
 set(TVM_LINKER_LIBS "")
 set(TVM_RUNTIME_LINKER_LIBS "")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Generic compilation options
 if(MSVC)
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
index fdaed2b5be40..f8da87d8cfd2 100644
--- a/docs/langref/hybrid_script.rst
+++ b/docs/langref/hybrid_script.rst
@@ -22,13 +22,15 @@ you need to use ``tvm.hybrid.script`` decorator to indicate this is a hybrid fun
 
     @tvm.hybrid.script
     def outer_product(a, b, c):
+        c = output_tensor((100, 99), 'float32')
         for i in range(a.shape[0]):
             for j in range(b.shape[0]):
                 c[i, j] = a[i] * b[j]
-    a = numpy.random.rand(100)
-    b = numpy.random.rand(99)
-    c = numpy.zeros((100, 99))
-    outer_product(a, b, c)
+          return c
+    a = numpy.random.randn(100)
+    b = numpy.random.randn(99)
+    c = outer_product(a, b)
+
 
 This decorator will import `Keywords`_ required spontaneously when software emulation.
 After software emulation is done, the imported keywords will be cleaned up. Users do not need
@@ -40,25 +42,25 @@ or ``numpy`` numeric type.
 Backend Compilation
 ~~~~~~~~~~~~~~~~~~~
 
+This function is not encouraged to use, users are encouraged to use the second interface.
 The current parse interface looks like:
 
 .. code-block:: python
 
    a = tvm.placeholder((100, ), name='a')
    b = tvm.placeholder((99, ), name='b')
-   c = tvm.placeholder((100, 99), name='c')
-   tvm.hybrid.parse(outer_product, [a, b, c]) # return an ir root of this function
+   parser = tvm.hybrid.parse(outer_product, [a, b]) # return the parser of this function
 
-If we pass these tvm tensors to this function, it returns a op node:
 
-**Under construction, we are still deciding what kind of node should be returned.**
+If we pass these tvm tensors to this function, it returns a op node:
 
 .. code-block:: python
 
    a = tvm.placeholder((100, ), name='a')
    b = tvm.placeholder((99, ), name='b')
-   c = tvm.placeholder((100, 99), name='c')
-   op = outer_product(a, b, c) # return the corresponding op node
+   c = outer_product(a, b, c) # return the output tensor(s) of the operator
+
+**Under construction, we are still deciding what kind of node should be returned.**
 
 Tuning
 ~~~~~~
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index 1a1d28ab71bb..02cd0d016f39 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -450,6 +450,69 @@ class ExternOpNode : public OperationNode {
   TVM_DECLARE_NODE_TYPE_INFO(ExternOpNode, OperationNode);
 };
 
+/*!
+ * \brief A computation operator that generated by hybrid script.
+ */
+class HybridOpNode : public OperationNode {
+ public:
+  /*! \brief The input tensors */
+  Array<Tensor> inputs;
+  /*! \brief Symbolic placeholder representation of outputs */
+  Array<Tensor> outputs;
+  /*! \brief the statement that generates the computation. This is
+   * slightly different from the body in ExternOpNode. All the output
+   * tensors keep its own name specified by users in the script.
+   * However, when compilation, these tensors will be placed by those
+   * actual output tensors. */
+  Stmt body;
+
+  /*! \brief constructor */
+  HybridOpNode() {}
+  // override functions
+  int num_outputs() const final;
+  Array<IterVar> root_iter_vars() const final;
+  Type output_dtype(size_t i) const final;
+  Array<Expr> output_shape(size_t i) const final;
+  Array<Tensor> InputTensors() const final;
+  Operation ReplaceInputs(
+      const Operation& self,
+      const std::unordered_map<Tensor, Tensor>& rmap) const final;
+  void PropBoundToInputs(
+      const Operation& self,
+      const std::unordered_map<const Variable*, IntSet>& dom_map,
+      std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
+  void GatherBound(
+      const Operation& self,
+      const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+      std::unordered_map<IterVar, Range>* out_dom_map) const final;
+  Stmt BuildRealize(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& realize_map,
+      const Stmt& body) const final;
+  Stmt BuildProvide(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& dom_map,
+      bool debug_keep_trivial_loop) const final;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("tag", &tag);
+    v->Visit("attrs", &attrs);
+    v->Visit("inputs", &inputs);
+    v->Visit("outputs", &outputs);
+    v->Visit("body", &body);
+  }
+  EXPORT static Operation make(std::string name,
+                               std::string tag,
+                               Map<std::string, NodeRef> attrs,
+                               Array<Tensor> inputs,
+                               Array<Tensor> outputs,
+                               Stmt body);
+
+  static constexpr const char* _type_key = "HybridOp";
+  TVM_DECLARE_NODE_TYPE_INFO(HybridOpNode, OperationNode);
+};
+
 /*! \brief The compute function to specify the input source of a Tensor */
 using FCompute = std::function<Expr (const Array<Var>& i)>;
 
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 2bb7442bab76..d65642340bad 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -340,11 +340,6 @@ def lower(sch,
         bounds = schedule.InferBound(sch)
         stmt = schedule.ScheduleOps(sch, bounds)
         stmt = ir_pass.InjectPrefetch(stmt)
-    else:
-        #So far there is no op for hybrid script, so a plain ir body is given
-        if not isinstance(sch, _stmt.Stmt):
-            raise ValueError("sch should be either a Schedule or a Stmt")
-        stmt = sch
 
     for f in lower_phase0:
         stmt = f(stmt)
diff --git a/python/tvm/hybrid/__init__.py b/python/tvm/hybrid/__init__.py
index e0a39c562f0f..6c137490c38e 100644
--- a/python/tvm/hybrid/__init__.py
+++ b/python/tvm/hybrid/__init__.py
@@ -7,4 +7,5 @@
 2. Developers can build HalideIR by writing Python code.
 """
 
-from .api import script, parse
+from .api import script
+from .parser import parse_python
diff --git a/python/tvm/hybrid/api.py b/python/tvm/hybrid/api.py
index 48e192d4ba39..5267731f4f52 100644
--- a/python/tvm/hybrid/api.py
+++ b/python/tvm/hybrid/api.py
@@ -1,9 +1,12 @@
 """APIs of lowering the Python subset to HalideIR"""
 from __future__ import absolute_import as _abs
 
-import types
 from .._ffi.base import decorate
+from .. import _api_internal as _tvm_internal
+from ..tensor import Tensor
+
 from .parser import parse_python
+from .util import _pruned_source
 
 
 def script(pyfunc):
@@ -17,40 +20,26 @@ def script(pyfunc):
     hybrid_func : function
         A decorated hybrid script function.
     """
-    def wrapped_func(func, *args, **kwargs):
+    def wrapped_func(func, *args, **kwargs): #pylint: disable=missing-docstring
         from .util import _enter_hybrid_runtime, _restore_runtime, _is_tvm_arg_types
         if _is_tvm_arg_types(args):
-            return parse(func, args)
+            src = _pruned_source(func)
+            parser = parse_python(src, args)
+
+            input_tensors = []
+            for i in args:
+                if isinstance(i, Tensor):
+                    input_tensors.append(i)
+
+            op = _tvm_internal._HybridOp(parser.func_name, "HybridOp", None, input_tensors,
+                                         parser.outputs, parser.parsed_body)
+            res = [op.output(i) for i in range(len(parser.outputs))]
+
+            return res[0] if len(res) == 1 else res
 
         intersect = _enter_hybrid_runtime(func)
         value = func(*args, **kwargs)
         _restore_runtime(func, intersect)
         return value
-    return decorate(pyfunc, wrapped_func)
-
-
-def parse(func, args):
-    """Parse a subset of Python to HalideIR
 
-    Parameters
-    ----------
-    func : str or types.FunctionType
-        If it is a string, parse the source code
-        If it is a function, parse the function
-
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-        Leave it None if no buffer is related to the function to be parsed
-
-    Returns
-    -------
-    root : Stmt
-        The result Halide IR and the parser class instance.
-    """
-    from .util import _pruned_source
-    if isinstance(func, str):
-        src = func
-    else:
-        assert isinstance(func, types.FunctionType)
-        src = _pruned_source(func)
-    return parse_python(src, args)
+    return decorate(pyfunc, wrapped_func)
diff --git a/python/tvm/hybrid/intrin.py b/python/tvm/hybrid/intrin.py
index b3fb64579b60..92e259585b7a 100644
--- a/python/tvm/hybrid/intrin.py
+++ b/python/tvm/hybrid/intrin.py
@@ -48,6 +48,7 @@ def allocate(shape, dtype='float32', scope='global'): #pylint: disable=unused-ar
     """
     return numpy.zeros(shape).astype(dtype)
 
+output_tensor = allocate #pylint: disable=invalid-name
 
 def popcount(x):
     """
@@ -87,18 +88,19 @@ def sigmoid(x):
 
 
 HYBRID_GLOBALS = {
-    'unroll'    : unroll,
-    'vectorize' : vectorize,
-    'parallel'  : parallel,
-    'allocate'  : allocate,
-    'bind'      : bind,
-    'sqrt'      : numpy.sqrt,
-    'log'       : numpy.log,
-    'tanh'      : numpy.tanh,
-    'power'     : numpy.power,
-    'exp'       : numpy.exp,
-    'sigmoid'   : sigmoid,
-    'popcount'  : popcount
+    'unroll'       : unroll,
+    'vectorize'    : vectorize,
+    'parallel'     : parallel,
+    'allocate'     : allocate,
+    'output_tensor': output_tensor,
+    'bind'         : bind,
+    'sqrt'         : numpy.sqrt,
+    'log'          : numpy.log,
+    'tanh'         : numpy.tanh,
+    'power'        : numpy.power,
+    'exp'          : numpy.exp,
+    'sigmoid'      : sigmoid,
+    'popcount'     : popcount
 }
 
 
diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index cf21ea950549..a16f5abd4349 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -2,8 +2,9 @@
 
 import ast
 import operator
+import logging
 import sys
-from .util import make_nop, halide_imm_types, is_docstring
+from .util import make_nop, halide_imm_types, is_docstring, _internal_assert
 from .intrin import LOOP_INTRIN, MATH_INTRIN
 from .var_decl import determine_variable_usage
 from ..api import thread_axis
@@ -72,15 +73,17 @@ def __init__(self, args, usage, func_name=None):
             The name of the function to be lowered; if not provided,
             the compiler will use the name in the AST
         """
-        self.args = args[:]
+        self.args = list(args)
         self.usage = usage.copy()
         self._args = {} # Dict maps arg name to actual arg instance (either a var or a buffer)
-        self.var_buffers = {} # Buffers formed by mutatble variables
         self.alloc_buffers = {} # Buffers formed by allocate instructions
         self.loops_above = {} # State variable that indicates loop levels above the current node
         self.var_consts = {} # Variables that are determined as readonly in previous stage
         self.func_name = func_name # The name of the function to be lowered
-        self.iter_axis = []
+        self.outputs = [] # Output tensors' name
+        self.side_effect = set() # Tensors with side effects
+        self.parsed_body = None # The parsed HalideIR body
+        self.returned = False
 
 
     def wrap_up_realize(self, node, body):
@@ -90,9 +93,8 @@ def wrap_up_realize(self, node, body):
                 continue
             _, level, _ = val
             if level == node:
-                if key in self.var_buffers.keys():
-                    _buf = self.var_buffers[key]
-                    _scope = 'global'
+                if key in self._args.keys():
+                    continue
                 else:
                     _buf, _scope = self.alloc_buffers[key]
                 _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape]
@@ -103,12 +105,13 @@ def wrap_up_realize(self, node, body):
         return body
 
 
-    def _get_buffer_from_id(self, s):
-        if s not in self._args.keys() and s not in self.alloc_buffers.keys():
-            raise ValueError("This %s is expected to be in argument list or allocated buffer!" % s)
-        if s in self._args.keys() and s in self.alloc_buffers.keys():
-            raise ValueError("%s, a buffer cannot be both argument and allocated!" % s)
+    def _get_buffer_from_id(self, s, for_provide=False):
+        _internal_assert((s in self._args.keys()) + (s in self.alloc_buffers.keys()) == 1,
+                         "This %s is expected to be in either \
+                          argument list or allocated buffer!" % s)
         if s in self._args.keys():
+            if for_provide:
+                self.side_effect.add(self._args[s])
             return self._args[s]
         return self.alloc_buffers[s][0]
 
@@ -116,15 +119,15 @@ def _get_buffer_from_id(self, s):
 
     #pylint: disable=invalid-name, missing-docstring
     def visit_Module(self, node):
-        if len(node.body) != 1:
-            raise ValueError("Only one-function source code can be fed to this parser!")
+        _internal_assert(len(node.body) == 1, \
+                         "Only one-function source code can be fed to this parser!")
         return self.visit(node.body[0])
 
 
     def visit_FunctionDef(self, node):
-        if len(node.args.args) != len(self.args):
-            raise ValueError("The number of arguments passed to the function\
-                should be the same as it is defined!")
+        _internal_assert(len(node.args.args) == len(self.args), \
+                         "The number of arguments passed to the \
+                         function should be the same as it is defined!")
         for idx, arg in enumerate(node.args.args):
             _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible
             self._args[getattr(arg, _attr)] = self.args[idx]
@@ -145,17 +148,17 @@ def visit_Name(self, node):
             return self._args[_id]
         elif _id in self.loops_above.keys():
             return self.loops_above[_id]
-        if _id in self._args.keys():
-            raise ValueError("This id %s should be handled in visit_Subscript!" % _id)
-        if _id  not in self.usage.keys():
-            raise ValueError("This id %s is expected to be a defined variable!" % _id)
+        _internal_assert(_id not in self._args.keys(), \
+                "This id %s should be handled in visit_Subscript!" % _id)
+        _internal_assert(_id in self.usage.keys(), \
+                "This id %s is expected to be a defined variable!" % _id)
         # Buffer
-        if _id in self.var_buffers.keys():
-            _buf = self.var_buffers[_id]
+        if _id in self.alloc_buffers.keys():
+            _buf, _ = self.alloc_buffers[_id]
             return _make.Call(_buf.dtype, _id, [_api.const(0)], _expr.Call.Halide, _buf.op, 0)
         # Compilation time constant
-        if _id not in self.var_consts.keys():
-            raise ValueError("This id %s is expected to a compilation time constant!" % _id)
+        _internal_assert(_id in self.var_consts.keys(),
+                         "This id %s is expected to a compilation time constant!" % _id)
         return self.var_consts[_id]
 
 
@@ -164,8 +167,7 @@ def visit_Num(self, node):
 
 
     def visit_Assign(self, node):
-        if len(node.targets) != 1:
-            raise ValueError("So far only one-valued assignment is supported!")
+        _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
         lhs = node.targets[0]
         rhs = self.visit(node.value)
         if isinstance(rhs, _expr.Expr):
@@ -174,36 +176,40 @@ def visit_Assign(self, node):
             #TODO: support defined intermediate buffer later
             lhs_ = lhs
             lhs = lhs.id
-            if lhs in self.loops_above.keys():
-                raise ValueError("You CAN NEVER overwrite a loop variable!")
+            _internal_assert(lhs not in self.loops_above.keys(), \
+                    "Loop variable cannot be overwritten!")
             decl, _, rw = self.usage[lhs]
             if decl == lhs_:
-                if lhs in self.var_consts.keys():
-                    raise ValueError("BUG: A constant cannot be overwritten!")
-                if lhs in self.var_buffers.keys() or lhs in self.alloc_buffers.keys():
-                    raise ValueError("BUG: This value should not be defined before this point!")
+                _internal_assert(lhs not in self.var_consts.keys(), \
+                                 "A constant cannot be overwritten!")
+                _internal_assert(lhs not in self.alloc_buffers.keys(), \
+                                 "This value should not be defined before this point!")
                 if isinstance(rhs, tuple):
                     shape, dtype, scope = rhs
                     ph = _api.placeholder(shape, dtype=dtype, name=lhs)
-                    self.alloc_buffers[lhs] = (ph, scope)
+                    if scope != 'output':
+                        self.alloc_buffers[lhs] = (ph, scope)
+                    else:
+                        self._args[lhs] = ph
+                        self.outputs.append(lhs)
                     return make_nop()
                 if isinstance(rhs, halide_imm_types) and ast.Store not in rw:
                     self.var_consts[lhs] = rhs
                 else:
-                    self.var_buffers[lhs] = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs)
+                    ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs)
+                    self.alloc_buffers[lhs] = (ph, 'global')
             if lhs in self.var_consts.keys():
                 return make_nop()
-            else:
-                if lhs not in self.var_buffers.keys():
-                    raise ValueError("BUG: This variable should be defined before!")
-                tgt = self.var_buffers[lhs]
-                return _make.Provide(tgt.op, 0, rhs, [_api.const(0, dtype=rhs.dtype)])
+            _internal_assert(lhs in self.alloc_buffers.keys(), \
+                             "This variable should be defined before!")
+            tgt, _ = self.alloc_buffers[lhs]
+            return _make.Provide(tgt.op, 0, rhs, [_api.const(0, dtype=rhs.dtype)])
         else:
             lhs = self.visit(lhs)
-            if not isinstance(lhs, _expr.Call):
-                raise ValueError("An array access's LHS is expected to be a expr.Call!")
+            _internal_assert(isinstance(lhs, _expr.Call), \
+                             "An array access's LHS is expected to be a expr.Call!")
             #TODO: support slice later
-            buf = self._get_buffer_from_id(lhs.name)
+            buf = self._get_buffer_from_id(lhs.name, for_provide=True)
             return _make.Provide(buf.op, 0, rhs, lhs.args)
 
 
@@ -219,21 +225,20 @@ def visit_Subscript(self, node):
             array = node.value.id
             _buf = self._get_buffer_from_id(array)
             return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, 0)
-        elif isinstance(node.value, ast.Attribute):
-            if not isinstance(node.value.value, ast.Name):
-                raise ValueError("The root of array access is expect to be a id!")
-            if node.value.attr != "shape":
-                raise ValueError("Attribute access so far only 'shape' is supported!")
-            if len(args) != 1:
-                raise ValueError("For 'shape' access the argument should be only one!")
-            args = args[0]
-            #TODO: maybe support non-constant value later?
-            if not isinstance(args, (_expr.IntImm, _expr.UIntImm)):
-                raise ValueError("So far only constant shape access supported!")
-            buf = self._get_buffer_from_id(node.value.value.id)
-            return buf.shape[args.value]
-        else:
-            raise ValueError("Not supported yet!")
+
+        _internal_assert(isinstance(node.value, ast.Attribute), \
+                "Only variable and attribute's subscript supported so far")
+        _internal_assert(isinstance(node.value.value, ast.Name), \
+            "The root of array access is expect to be a id!")
+        _internal_assert(node.value.attr == "shape", \
+            "Attribute access so far only 'shape' is supported!")
+        _internal_assert(len(args) == 1, "For 'shape' access the argument should be only one!")
+        args = args[0]
+        #TODO: maybe support non-constant value later?
+        _internal_assert(isinstance(args, (_expr.IntImm, _expr.UIntImm)), \
+            "So far only constant shape access supported!")
+        buf = self._get_buffer_from_id(node.value.value.id)
+        return buf.shape[args.value]
 
 
     def visit_With(self, node):
@@ -241,14 +246,11 @@ def visit_With(self, node):
             context = node.context_expr
             option = node.optional_vars
         else:
-            if len(node.items) != 1:
-                raise ValueError("Only one with element is supported so far!")
+            _internal_assert(len(node.items) == 1, "Only one with element is supported so far!")
             context = node.items[0].context_expr
             option = node.items[0].optional_vars
-        if not isinstance(context, ast.Call):
-            raise ValueError("The object must be a Python function call!")
-        if not isinstance(option, ast.Name):
-            raise ValueError("The object after 'as' must be an id!")
+        _internal_assert(isinstance(context, ast.Call), "The object must be a Python func call!")
+        _internal_assert(isinstance(option, ast.Name), "The object after 'as' must be an id!")
         self.annotation[option.id] = context.func.id
         return list_to_block(self.visit, node.body)
 
@@ -272,10 +274,8 @@ def visit_IfExp(self, node):
 
     def visit_Compare(self, node):
         lhs = self.visit(node.left)
-        if len(node.ops) != 1:
-            raise ValueError("Only one compare op is supported!")
-        if len(node.comparators) != 1:
-            raise ValueError("Only one comparator is supported!")
+        _internal_assert(len(node.ops) == 1, "Only one compare op is supported!")
+        _internal_assert(len(node.comparators) == 1, "Only one comparator is supported!")
         rhs = self.visit(node.comparators[0])
         return HybridParser._binop_maker[type(node.ops[0])](lhs, rhs)
 
@@ -293,16 +293,15 @@ def visit_BinOp(self, node):
 
     def visit_Call(self, node):
         # Yet, no function pointer supported
-        if not isinstance(node.func, ast.Name):
-            raise ValueError("Only id-function function call is supported so far!")
+        _internal_assert(isinstance(node.func, ast.Name), \
+            "Only id-function function call is supported so far!")
         func_id = node.func.id
         n = len(node.args)
         if func_id in LOOP_INTRIN.keys() and func_id != 'bind':
             if n == 1:
                 low, ext = _api.const(0, dtype='int32'), self.visit(node.args[0])
             else:
-                if n != 2:
-                    raise ValueError("A loop intrinsic should only have 1 or 2 arguments!")
+                _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
                 low, ext = self.visit(node.args[0]), self.visit(node.args[1])
             if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
                 ext = ext - low
@@ -310,10 +309,9 @@ def visit_Call(self, node):
             iter_var = None
             return iter_var, low, ext, for_type
         elif func_id == 'bind':
-            if n != 2:
-                raise ValueError("A loop bind should only have 2 arguments!")
-            if not isinstance(node.args[0], ast.Str):
-                raise ValueError("A loop bind's first argument should be a string!")
+            _internal_assert(n == 2, "A loop bind should only have 2 arguments!")
+            _internal_assert(isinstance(node.args[0], ast.Str), \
+                "A loop bind's first argument should be a string!")
             _vn = node.args[0].s
             iter_var = thread_axis(node.args[0].s)
             low, ext = _api.const(0, dtype='int32'), self.visit(node.args[1])
@@ -321,29 +319,39 @@ def visit_Call(self, node):
             return iter_var, low, ext, for_type
         elif func_id in MATH_INTRIN:
             return getattr(intrin, func_id)(*[self.visit(arg) for arg in node.args])
-        elif func_id == 'allocate':
-            if not isinstance(node.args[0], ast.Tuple):
-                raise ValueError("allocate's first argument should be a tuple of shape!")
+        elif func_id in ['allocate', 'output_tensor']:
+            _internal_assert(isinstance(node.args[0], ast.Tuple), \
+                "allocate's first argument should be a tuple of shape!")
             shape = tuple(self.visit(i) for i in node.args[0].elts)
+            if func_id == 'output_tensor':
+                _internal_assert(not self.loops_above, \
+                        "Are you sure to allocate a output buffer multiple times?")
             for i in shape:
-                if not isinstance(i, _expr.Expr):
-                    raise ValueError("The shape should be an expression")
+                _internal_assert(isinstance(i, _expr.Expr), "The shape should be an expression")
             if n > 1:
-                if not isinstance(node.args[1], ast.Str):
-                    raise ValueError("The data type should be an string")
-                dtype = node.args[1].s
+                if isinstance(node.args[1], ast.Str):
+                    dtype = node.args[1].s
+                else:
+                    _internal_assert(isinstance(node.args[1], ast.Attribute), \
+                            "Unable to evaluate to get data type")
+                    to_eval = node.args[1]
+                    _internal_assert(isinstance(to_eval.value, ast.Name), \
+                            "Unable to evaluate the attribute to get data type")
+                    _internal_assert(to_eval.attr == 'dtype', \
+                            "Only dtype attribute is supported so far")
+                    dtype = self._get_buffer_from_id(to_eval.value.id).dtype
             else:
                 dtype = 'float32'
             if n > 2:
-                if not isinstance(node.args[2], ast.Str):
-                    raise ValueError("The data type should be an string")
+                _internal_assert(isinstance(node.args[2], ast.Str), \
+                        "The data scope should be an string")
+                _internal_assert(func_id != 'output_tensor', "Output tensor cannot specify scope")
                 scope = node.args[2].s
             else:
-                scope = 'global'
+                scope = 'global' if func_id != 'output_tensor' else 'output'
             return (shape, dtype, scope)
         elif func_id == 'max' or func_id == 'min':
-            if n != 2:
-                raise ValueError("Max/Min function should have 2 elements")
+            _internal_assert(n == 2, "Max/Min function should have 2 elements")
             a, b = self.visit(node.args[0]), self.visit(node.args[1])
             return getattr(_make, func_id.title())(a, b)
         else:
@@ -352,19 +360,17 @@ def visit_Call(self, node):
 
     def visit_For(self, node):
         iter_var, low, ext, for_type = self.visit(node.iter)
-        if not isinstance(node.target, ast.Name):
-            raise ValueError("The loop iterator should be a variable!")
+        _internal_assert(isinstance(node.target, ast.Name), \
+                "The loop iterator should be a variable!")
         _name = node.target.id
         if iter_var is None:
-            if for_type is None:
-                raise ValueError("The loop bind function parse error!")
+            _internal_assert(for_type is not None, "The loop bind function parse error!")
             offset = iter_var = _api.var(_name)
             if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
                 offset = iter_var + low
             self.loops_above[_name] = offset
         else:
-            if for_type is not None:
-                raise ValueError("The loop iterating function parse error!")
+            _internal_assert(for_type is None, "The loop iterating function parse error!")
             self.loops_above[_name] = iter_var.var
         _body = list_to_block(self.visit, node.body)
         _body = self.wrap_up_realize(node, _body)
@@ -376,10 +382,46 @@ def visit_For(self, node):
         return res
 
 
+    def visit_Return(self, node):
+        _internal_assert(not self.loops_above, "Return should not be in a loop body!")
+        ids = []
+        if isinstance(node.value, ast.Name):
+            ids.append(node.value.id)
+        else:
+            _internal_assert(isinstance(node.value, ast.Tuple), \
+                    "You should return either a single tensor or a tuple")
+            for i in node.value.elts:
+                _internal_assert(isinstance(i, ast.Name), "What do you return?")
+                ids.append(i.id)
+        _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples")
+        if len(ids) != len(self.outputs):
+            logging.log(logging.CRITICAL, '[Warning] Not all the output buffers returned!')
+        self.outputs = [self._args[i] for i in ids]
+        self.returned = True
+        return make_nop()
+
+
 def parse_python(src, args):
-    """The helper function of calling the AST visitor"""
+    """The helper function of calling the AST visitor
+
+    Parameters
+    ----------
+    src : str
+        The source code of the function to be parsed.
+
+    args : list of Tensors or Vars
+        The argument lists to the function.
+        It is NOT encouraged to write a function without arguments.
+        It is NOT encouraged to write a function with side effect.
+
+    Returns
+    -------
+    root : Stmt
+        The result Halide IR and the parser class instance.
+    """
     root = ast.parse(src)
     var_usage = determine_variable_usage(root, args)
     parser = HybridParser(args, var_usage)
-    halide_ir = parser.visit(root)
-    return halide_ir
+    parser.parsed_body = parser.visit(root)
+    _internal_assert(parser.returned, 'No valid return found in the function body!')
+    return parser
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index 2a43957e9706..e38f466381ff 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -2,6 +2,8 @@
 
 import ast
 import inspect
+import logging
+import sys
 import numpy
 from .intrin import HYBRID_GLOBALS
 from .._ffi.base import numeric_types
@@ -30,10 +32,17 @@ def is_docstring(node):
 
 def _pruned_source(func):
     """Prune source code's extra leading spaces"""
-    lines = inspect.getsource(func).split('\n')
-    leading_space = len(lines[0]) - len(lines[0].lstrip(' '))
-    lines = [line[leading_space:] for line in lines]
-    return '\n'.join(lines)
+    try:
+        lines = inspect.getsource(func).split('\n')
+        leading_space = len(lines[0]) - len(lines[0].lstrip(' '))
+        lines = [line[leading_space:] for line in lines]
+        return '\n'.join(lines)
+    except IOError as err:
+        if sys.version_info[0] == 2 and str(err) == 'could not get source code':
+            logging.log(logging.CRITICAL, \
+                        'This module is not fully operated under Python2... ' \
+                        'Please move to Python3!')
+            raise err
 
 
 def _is_tvm_arg_types(args):
@@ -70,3 +79,12 @@ def _restore_runtime(func, intersect):
         _globals.pop(elem)
     for k, v in intersect:
         _globals[k] = v
+
+def _internal_assert(cond, err):
+    """Simplify the code segment like if not XXX then raise an error"""
+    if not cond:
+        raise ValueError(err)
+
+# Almost the same functionality as the one above, but in this case,
+# the error is caused by users inproper usage.
+_user_assert = _internal_assert
diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/var_decl.py
index df38bac1acba..586ef95461ea 100644
--- a/python/tvm/hybrid/var_decl.py
+++ b/python/tvm/hybrid/var_decl.py
@@ -3,6 +3,7 @@
 import ast
 import sys
 from .intrin import HYBRID_GLOBALS
+from .util import _internal_assert
 
 
 class PyVariableUsage(ast.NodeVisitor):
@@ -18,8 +19,8 @@ def __init__(self, args):
 
     def visit_FunctionDef(self, node):
         self.scope_level.append(node)
-        if len(node.args.args) != len(self.args):
-            raise ValueError('#arguments passed should be the same as #arguments defined')
+        _internal_assert(len(node.args.args) == len(self.args), \
+                '#arguments passed should be the same as #arguments defined')
         for idx, arg in enumerate(node.args.args):
             _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible
             self._args[getattr(arg, _attr)] = self.args[idx]
@@ -28,8 +29,8 @@ def visit_FunctionDef(self, node):
 
 
     def visit_For(self, node):
-        if not isinstance(node.target, ast.Name):
-            raise ValueError("For's iterator should be an id")
+        _internal_assert(isinstance(node.target, ast.Name), \
+                "For's iterator should be an id")
         self.visit(node.iter)
         self.scope_level.append(node)
         for i in node.body:
@@ -39,11 +40,10 @@ def visit_For(self, node):
 
     def visit_Call(self, node):
         #No function pointer supported so far
-        if not isinstance(node.func, ast.Name):
-            raise ValueError("Function call should be an id")
+        _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id")
         func_id = node.func.id
-        if func_id not in list(HYBRID_GLOBALS.keys()) + ['range', 'max', 'min']:
-            raise ValueError("Function call id not in intrinsics' list")
+        _internal_assert(func_id in list(HYBRID_GLOBALS.keys()) + ['range', 'max', 'min'], \
+                "Function call id not in intrinsics' list")
         for elem in node.args:
             self.visit(elem)
 
@@ -56,12 +56,12 @@ def visit_Name(self, node):
         if node.id in fors:
             return
         # The loop variable cannot be overwritten when iteration
-        if isinstance(node.ctx, ast.Store) and node.id in fors:
-            raise ValueError("Iter var cannot be overwritten")
+        _internal_assert(not isinstance(node.ctx, ast.Store) or node.id not in fors, \
+                         "Iter var cannot be overwritten")
 
         if node.id not in self.status.keys():
-            if not isinstance(node.ctx, ast.Store):
-                raise ValueError('In Python, "first store" indicates "declaration"')
+            _internal_assert(isinstance(node.ctx, ast.Store), \
+                    'Undeclared variable %s' % node.id)
             self.status[node.id] = (node, self.scope_level[-1], set())
         else:
             decl, loop, usage = self.status[node.id]
diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py
index f32b70eb9a12..9a98e9a6e769 100644
--- a/python/tvm/tensor.py
+++ b/python/tvm/tensor.py
@@ -180,3 +180,8 @@ def scan_axis(self):
 class ExternOp(Operation):
     """Extern operation."""
     pass
+
+@register_node
+class HybridOp(Operation):
+    """Hybrid operation."""
+    pass
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc
index 3525e23b8b20..e30111e938bd 100644
--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -313,6 +313,16 @@ TVM_REGISTER_API("_ExternOp")
                               args[6]);
   });
 
+TVM_REGISTER_API("_HybridOp")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = HybridOpNode::make(args[0],
+                              args[1],
+                              args[2],
+                              args[3],
+                              args[4],
+                              args[5]);
+  });
+
 TVM_REGISTER_API("_OpGetOutput")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = args[0].operator Operation().output(
diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc
new file mode 100644
index 000000000000..4dbb2c0b964f
--- /dev/null
+++ b/src/op/hybrid_op.cc
@@ -0,0 +1,189 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Hybrid computation rule.
+ * \file hybrid_op.cc
+ */
+#include <tvm/operation.h>
+#include <tvm/arithmetic.h>
+#include <tvm/ir.h>
+#include <tvm/ir_mutator.h>
+#include <unordered_set>
+#include "op_util.h"
+
+namespace tvm {
+using namespace ir;
+// HybridOpNode
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<HybridOpNode>([](const HybridOpNode *op, IRPrinter *p) {
+    p->stream << "hybrid(" << op->name << ", " << op << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(HybridOpNode);
+
+int HybridOpNode::num_outputs() const {
+  return static_cast<int>(outputs.size());
+}
+
+Array<IterVar> HybridOpNode::root_iter_vars() const {
+  return {};
+}
+
+Type HybridOpNode::output_dtype(size_t i) const {
+  return outputs[i]->dtype;
+}
+
+Array<Expr> HybridOpNode::output_shape(size_t i) const {
+  return outputs[i]->shape;
+}
+
+
+Operation HybridOpNode::make(std::string name,
+                             std::string tag,
+                             Map<std::string, NodeRef> attrs,
+                             Array<Tensor> inputs,
+                             Array<Tensor> outputs,
+                             Stmt body) {
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
+  auto n = make_node<HybridOpNode>();
+  n->name = std::move(name);
+  n->tag = std::move(tag);
+  n->attrs = std::move(attrs);
+  n->inputs = std::move(inputs);
+  n->outputs = std::move(outputs);
+  n->body = std::move(body);
+  Operation res = Operation(n);
+  return res;
+}
+
+Array<Tensor> HybridOpNode::InputTensors() const {
+  return inputs;
+}
+
+Operation HybridOpNode::ReplaceInputs(
+    const Operation& self,
+    const std::unordered_map<Tensor, Tensor>& rmap) const {
+  CHECK_EQ(self.operator->(), this);
+  auto n = make_node<HybridOpNode>(*this);
+  n->body = op::ReplaceTensor(this->body, rmap);
+  for (size_t i = 0; i < n->inputs.size(); ++i) {
+    Tensor t = n->inputs[i];
+    if (rmap.count(t)) {
+      n->inputs.Set(i, rmap.at(t));
+    }
+  }
+
+  if (body.same_as(n->body) &&
+      inputs.same_as(n->inputs)) {
+    return self;
+  } else {
+    return Operation(n);
+  }
+}
+
+void HybridOpNode::PropBoundToInputs(
+    const Operation& self,
+    const std::unordered_map<const Variable*, IntSet>& dom_map,
+    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
+  for (Tensor t : this->inputs) {
+    auto it = out_dom_map->find(t);
+    if (it == out_dom_map->end()) continue;
+    TensorDom& dom = it->second;
+    for (size_t i = 0; i < t->shape.size(); ++i) {
+      dom.data[i].emplace_back(IntSet::range(
+          Range::make_by_min_extent(
+              make_const(t->shape[i].type(), 0), t->shape[i])));
+    }
+  }
+}
+
+void HybridOpNode::GatherBound(
+    const Operation& self,
+    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+    std::unordered_map<IterVar, Range>* out_dom_map) const {
+}
+
+Stmt HybridOpNode::BuildRealize(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& realize_map,
+    const Stmt& body) const {
+  CHECK_EQ(stage->op.get(), this);
+  Stmt realize_body = body;
+  for (int k = 0; k < num_outputs(); ++k) {
+    Tensor t = stage->op.output(k);
+    HalideIR::Internal::Region bounds;
+    for (size_t i = 0; i < t->shape.size(); ++i) {
+      bounds.push_back(
+          Range::make_by_min_extent(
+              make_const(t->shape[i].type(), 0), t->shape[i]));
+    }
+    realize_body = ir::Realize::make(
+        t->op, t->value_index, t->dtype,
+        bounds, const_true(), realize_body);
+  }
+  return realize_body;
+}
+
+Stmt HybridOpNode::BuildProvide(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) const {
+  CHECK_EQ(stage->op.operator->(), this);
+  Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body);
+  auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) {
+    Array<NodeRef> bind_spec;
+    Array<Expr> tuple;
+    bind_spec.push_back(buffer);
+    bind_spec.push_back(tensor);
+    for (size_t k = 0; k < buffer->shape.size(); ++k) {
+      tuple.push_back(make_const(buffer->shape[k].type(), 0));
+      tuple.push_back(buffer->shape[k]);
+    }
+    ret = AttrStmt::make(
+        bind_spec, attr::buffer_bind_scope,
+        Call::make(Handle(), intrinsic::tvm_tuple, tuple, Call::Intrinsic), ret);
+  };
+  for (int i = static_cast<int>(outputs.size()) - 1; i >= 0; --i) {
+    Buffer buffer = decl_buffer(
+      outputs[i]->shape,
+      outputs[i]->dtype);
+    f_push_bind(buffer, stage->op.output(i));
+  }
+  for (int i = static_cast<int>(inputs.size()) - 1; i >= 0; --i) {
+    Buffer buffer = decl_buffer(
+      inputs[i]->shape,
+      inputs[i]->dtype);
+    f_push_bind(buffer, inputs[i]);
+  }
+
+  std::unordered_map<Tensor, Tensor> rmap;
+  for (int i = 0; i < this->num_outputs(); ++i) {
+    rmap[outputs[i]] = stage->op.output(i);
+  }
+  auto n = make_node<HybridOpNode>(*this);
+  /*
+   * These two lines of codes replace tensors' reads & writes.
+   * This is the simplest way I (@were) can come up with to glue
+   * hybrid scripts to the structure of TVM op.
+   * NAMING CONFLICT: In hybrid script all the tensors have their own 
+   * names specified by the users. However, In TVM op, all the output
+   * tensors' names are the same as the op's name. I cannot change the
+   * name to the op's name in the function body after the op node is
+   * formed, because:
+   *   1. Output tensors all point to the corresponding op node. 
+   *   2. Once OpNode is wrapped up by an Operation node, it can
+   *      no longer be changed.
+   * This is a chiken-egg paradox. It is impossible to put the output
+   * tensors into the function body without forming the op node. The
+   * function body is immutable after the node is formed.
+   *
+   * Finally, I decided to resolve this issue "lazily". During the
+   * pipeline of compilation, these tensors will be replaced when
+   * forming the function body and passing to next stage of compilation.
+   * */
+  ret = op::ReplaceTensor(ret, rmap);
+  ret = op::ReplaceProvideTensor(ret, rmap);
+  return ret;
+}
+}  // namespace tvm
diff --git a/src/op/op_util.cc b/src/op/op_util.cc
index ba83997a0a16..886f7c912303 100644
--- a/src/op/op_util.cc
+++ b/src/op/op_util.cc
@@ -164,6 +164,37 @@ std::vector<Stmt> MakeIfNest(const std::vector<Expr>& predicates) {
   return nest;
 }
 
+// replacer to replace tensors' usage in Provide
+class ProviderReplacer : public ir::IRMutator {
+ public:
+  explicit ProviderReplacer(const std::unordered_map<Tensor, Tensor>& vmap)
+      : vmap_(vmap) {}
+
+  Stmt Mutate_(const ir::Provide* op, const Stmt& s) {
+    Tensor t = Operation(op->func.node_).output(op->value_index);
+    auto it = vmap_.find(t);
+    if (it != vmap_.end()) {
+      Stmt ret = ir::Provide::make(
+        it->second->op, it->second->value_index, op->value, op->args);
+      found = true;
+      return IRMutator::Mutate_(ret.as<ir::Provide>(), ret);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // whether it is found.
+  bool found{false};
+
+ private:
+  const std::unordered_map<Tensor, Tensor>& vmap_;
+};
+
+Stmt ReplaceProvideTensor(Stmt stmt,
+                   const std::unordered_map<Tensor, Tensor>& replace) {
+  ProviderReplacer repl(replace);
+  Stmt ret = repl.Mutate(stmt);
+  return repl.found ? ret : stmt;
+}
 
 // replacer to replace tensors
 class TensorReplacer : public ir::IRMutator {
diff --git a/src/op/op_util.h b/src/op/op_util.h
index 558e8d4e7324..6971f14eef73 100644
--- a/src/op/op_util.h
+++ b/src/op/op_util.h
@@ -49,14 +49,22 @@ MakeLoopNest(const Stage& stage,
 std::vector<Stmt> MakeIfNest(const std::vector<Expr>& predicates);
 
 /*!
- * \brief Replace the tensor reference in stmt by the replace map.
+ * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map.
+ * \param stmt The statement to be processed.
+ * \param replace The replacement rule.
+ */
+Stmt ReplaceProvideTensor(Stmt stmt,
+                   const std::unordered_map<Tensor, Tensor>& replace);
+
+/*!
+ * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
  * \param stmt The statement to be processed.
  * \param replace The replacement rule.
  */
 Stmt ReplaceTensor(Stmt stmt,
                    const std::unordered_map<Tensor, Tensor>& replace);
 /*!
- * \brief Replace the tensor reference in expr by the replace map.
+ * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
  * \param expr The expression to be processed.
  * \param replace The replacement rule.
  */
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 3124586ca343..9156e40f949f 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -3,7 +3,7 @@
 from tvm.hybrid.intrin import HYBRID_GLOBALS
 
 @nose.tools.nottest
-def run_and_check(func, args, outs, var_dict={}, target='llvm'):
+def run_and_check(func, args, var_dict={}, target='llvm'):
     def tvm_val_2_py_val(val):
         val = tvm.ir_pass.Substitute(val, var_dict)
         val = tvm.ir_pass.Simplify(val)
@@ -14,39 +14,50 @@ def tvm_val_2_py_val(val):
 
     emu_args = []
     nd_args = []
-    to_check = []
     for i in args:
         if isinstance(i, tvm.tensor.Tensor):
             shape = [tvm_val_2_py_val(j) for j in i.shape]
-            if i in outs:
-                emu_args.append(numpy.zeros(shape).astype(i.dtype))
-                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
-                to_check.append((nd_args[-1], emu_args[-1]))
-            else:
-                emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
-                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
+            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
+            nd_args.append(tvm.nd.array(emu_args[-1], ctx))
         else:
             assert isinstance(i, tvm.expr.Var)
             emu_args.append(tvm_val_2_py_val(i))
             nd_args.append(emu_args[-1])
 
-    func(*emu_args)
-
-    lowerd_func = tvm.lower(func(*args), args)
-    module = tvm.build(lowerd_func, target=target)
+    outs = func(*args)
+    op = outs[0].op if isinstance(outs, list) else outs.op
+    sch = tvm.create_schedule(op)
+    module = tvm.build(sch, args + (outs if isinstance(outs, list) else [outs]), target=target)
     assert module
+    
+    out_tensors = []
+    for i in range(op.num_outputs):
+        output = op.output(i)
+        shape = [tvm_val_2_py_val(j) for j in output.shape]
+        nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx))
+        out_tensors.append(nd_args[-1])
+
+    ref_data = func(*emu_args)
+    if isinstance(ref_data, numpy.ndarray):
+        ref_data = [ref_data]
+    
     module(*nd_args)
 
-    for nd, np in to_check:
+    for nd, np in zip(out_tensors, ref_data):
         tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
 
 
 @script
-def outer_product(n, m, a, b, c):
-    """This is a simple outer product"""
+def outer_product(n, m, a, b):
+    """This is a simple outer product.
+    Actually this function is not required to be documented.
+    I write this docstring to test skipping docstring functionality.
+    """
+    c = output_tensor((n, m), a.dtype)
     for i in range(n):
         for j in range(m):
             c[i, j] = a[i] * b[j]
+    return c
 
 #Test global function
 #Test bridge between frontend and backend
@@ -55,8 +66,14 @@ def test_outer_product():
     m = tvm.var('m')
     a = tvm.placeholder((n, ), name='a')
     b = tvm.placeholder((m, ), name='b')
-    c = tvm.placeholder((n, m), name='c')
-    ir = outer_product(n, m, a, b, c)
+
+    try:
+        c = outer_product(n, m, a, b)
+        ir = c.op.body
+    except IOError as err:
+        assert sys.version_info[0] == 2 and str(err) == 'could not get source code'
+        return
+
     #Check for i in (0, n)
     assert isinstance(ir, tvm.stmt.For)
     assert ir.loop_var.name == 'i'
@@ -81,10 +98,8 @@ def test_outer_product():
     assert mul.a.name == 'a'
     assert mul.b.name == 'b'
 
-    func = tvm.lower(ir, [n, m, a, b, c])
-    func = tvm.build(func)
 
-    run_and_check(outer_product, [n, m, a, b, c], [c], {n: 999, m: 1001})
+    run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})
 
     for key, _ in HYBRID_GLOBALS.items():
         assert key not in globals().keys()
@@ -94,19 +109,25 @@ def test_outer_product():
 #Test allocation of local variable
 def test_fanout():
     @script
-    def fanout(n, a, b):
+    def fanout(n, a):
         three = 3.0
+        b = output_tensor((a.shape[0] - 3, ), a.dtype)
         for i in range(a.shape[0] - 3):
             sigma = 0.0
             for j in range(3):
                 sigma = sigma + a[i + j]
             sigma = sigma / three
             b[i] = sigma
+        return b
 
     n = tvm.var('n')
     a = tvm.placeholder((n, ), 'float32', name='a')
-    b = tvm.placeholder((n-3, ), 'float32', name='b')
-    ir = fanout(n, a, b)
+    try:
+        b = fanout(n, a)
+        ir = b.op.body
+    except IOError as err:
+        assert sys.version_info[0] == 2 and str(err) == 'could not get source code'
+        return
 
     #Check for i in (0, n-3)
     assert isinstance(ir, tvm.stmt.For)
@@ -163,38 +184,31 @@ def fanout(n, a, b):
     assert len(write.value.args) == 1
     assert write.value.args[0].value == 0
 
-    run_and_check(fanout, [n, a, b], [b], {n: 10})
-
-
-@script
-def failure():
-    for i in range(1, 100):
-        i = 0
-
-def test_failure():
-    try:
-        tvm.hybrid.parse(failure, [])
-    except IOError as err:
-        assert sys.version_info[0] == 2
-        print('[Warning] Case test_failure is skipped by Python2 because "%s"' % str(err))
-    except Exception as err:
-        assert str(err) == 'You CAN NEVER overwrite a loop variable!'
+    run_and_check(fanout, [n, a], {n: 10})
 
 
 def test_looptype():
     @script
     def looptype(a, b, c):
+        d = output_tensor((8, ), 'int32')
+        e = output_tensor((8, ), 'int32')
+        f = output_tensor((8, ), 'int32')
         for i in parallel(8):
-            a[i] = i
+            d[i] = a[i]
         for j in vectorize(8):
-            b[j] = j
+            e[j] = b[j]
         for k in unroll(8):
-            c[k] = k
+            f[k] = c[k]
+        return d, e, f
 
     a = tvm.placeholder((8, ), name='a', dtype='int32')
     b = tvm.placeholder((8, ), name='b', dtype='int32')
     c = tvm.placeholder((8, ), name='c', dtype='int32')
-    ir = looptype(a, b, c)
+    try:
+        d, e, f = looptype(a, b, c)
+        ir = d.op.body
+    except:
+        return
     iloop = ir.first
     jloop = ir.rest.first
     kloop = ir.rest.rest
@@ -202,24 +216,26 @@ def looptype(a, b, c):
     assert jloop.for_type == tvm.stmt.For.Vectorized
     assert kloop.for_type == tvm.stmt.For.Unrolled
 
-    run_and_check(looptype, [a, b, c], [a, b, c])
+    run_and_check(looptype, [a, b, c])
 
 
 def test_if():
     @script
-    def if_then_else(a, b):
+    def if_then_else(a):
+        b = output_tensor((10, ), 'int32')
+        c = output_tensor((10, ), 'int32')
         for i in range(10):
             if i % 2 == 0:
-                a[i] = -1
+                c[i] = a[i]
             else:
-                a[i] = 1
+                c[i] = b[i]
         for i in unroll(10):
             b[i] = -1 if i % 2 == 0 else 1
+        return b, c
 
     a = tvm.placeholder((10, ), dtype='int32', name='a')
-    b = tvm.placeholder((10, ), dtype='int32', name='b')
 
-    run_and_check(if_then_else, [a, b], [a, b])
+    run_and_check(if_then_else, [a])
 
 
 def test_bind():
@@ -227,55 +243,66 @@ def test_bind():
         print('[Warning] No GPU found! Skip bind test!')
         return
     @script
-    def vec_add(a, b, c):
+    def vec_add(a, b):
+        c = output_tensor((1000, ), dtype='float32')
         for tx in bind('threadIdx.x', 1000):
             c[tx] = b[tx] + c[tx]
+        return c
 
     a = tvm.placeholder((1000, ), dtype='float32', name='a')
     b = tvm.placeholder((1000, ), dtype='float32', name='b')
-    c = tvm.placeholder((1000, ), dtype='float32', name='c')
 
-    run_and_check(vec_add, [a, b, c], [c], target='cuda')
+    run_and_check(vec_add, [a, b], target='cuda')
 
 def test_math_intrin():
     @script
     def intrin_real(a):
-        a[0] = sqrt(a[0])
-        a[1] = log(a[1])
-        a[2] = exp(a[2])
-        a[3] = sigmoid(a[3])
-        a[4] = power(a[4], a[5])
-        a[5] = tanh(a[5])
-        a[6] = min(a[4], a[5])
-        a[7] = max(a[5], a[6])
+        b = output_tensor((8, ), 'float32')
+        b[0] = sqrt(a[0])
+        b[1] = log(a[1])
+        b[2] = exp(a[2])
+        b[3] = sigmoid(a[3])
+        b[4] = power(a[4], a[5])
+        b[5] = tanh(a[5])
+        b[6] = min(a[4], a[5])
+        b[7] = max(a[5], a[6])
+        return b
 
     a8 = tvm.placeholder((8, ), dtype='float32', name='a')
-    ir = intrin_real(a8)
-    func = tvm.build(tvm.lower(ir, [a8]))
+    b8 = intrin_real(a8)
+    sch = tvm.create_schedule(b8.op)
+    func = tvm.build(sch, [a8, b8])
     assert func
     a = numpy.arange(2, 10).astype('float32')
     tvm_a = tvm.ndarray.array(a)
-    func(tvm_a)
-    intrin_real(a)
-    tvm.testing.assert_allclose(a, tvm_a.asnumpy(), rtol=1e-5)
+    tvm_b = tvm.ndarray.array(numpy.zeros((8, ), dtype='float32'))
+    b = intrin_real(a)
+    func(tvm_a, tvm_b)
+    tvm.testing.assert_allclose(b, tvm_b.asnumpy(), rtol=1e-5)
 
     @script
     def intrin_int(a):
-        a[0] = popcount(a[0])
+        b = output_tensor((1, ), 'int32')
+        b[0] = popcount(a[0])
+        return b
 
     a1 = tvm.placeholder((1, ), dtype='int32')
-    ir = intrin_int(a1)
-    func = tvm.build(tvm.lower(ir, [a1]))
+    b1 = intrin_int(a1)
+    sch = tvm.create_schedule(b1.op)
+    func = tvm.build(sch, [a1, b1])
     assert func
-    a = numpy.array([1234567890]).astype('int32')
+    a = numpy.array([114514]).astype('int32')
     tvm_a = tvm.ndarray.array(a)
-    intrin_int(a)
-    func(tvm_a)
-    assert tvm_a.asnumpy()[0] == a[0]
+    tvm_b = tvm.ndarray.array(numpy.array([0]).astype('int32'))
+    b = intrin_int(a)
+    func(tvm_a, tvm_b)
+    assert tvm_b.asnumpy()[0] == b[0]
 
+# test non caconical loops
 def test_non_zero():
     @tvm.hybrid.script
-    def blur(a, b):
+    def blur(a):
+        b = output_tensor((30, 30), 'float32')
         for i in range(2, 32):
             for j in range(2, 32):
                 s = 0.0
@@ -283,29 +310,28 @@ def blur(a, b):
                     for dj in range(3):
                         s = s + a[i-di, j-dj]
                 b[i-2, j-2] = s / 9.0
-    try:
-        a = tvm.placeholder((32, 32), 'float32', 'a')
-        b = tvm.placeholder((30, 30), 'float32', 'b')
-        run_and_check(blur, [a, b], [b])
-    except IOError as err:
-        assert sys.version_info[0] == 2
-        print('[Warning] Case test_non_zero is skipped by Python2 because "%s"' % str(err))
+        return b
+
+    a = tvm.placeholder((32, 32), 'float32', 'a')
+    run_and_check(blur, [a])
 
     @tvm.hybrid.script
-    def triangle(a, b, c):
+    def triangle(a, b):
+        c = output_tensor((10, 10), dtype='float32')
         for i in range(10):
             for j in range(i, 10):
                 c[i, j] = a[i] * b[j]
+        return c
 
     a = tvm.placeholder((10, ), dtype='float32', name='a')
     b = tvm.placeholder((10, ), dtype='float32', name='b')
-    c = tvm.placeholder((10, 10), dtype='float32', name='c')
 
-    run_and_check(triangle, [a, b, c], [c])
+    run_and_check(triangle, [a, b])
 
 def test_allocate():
     @tvm.hybrid.script
-    def blur2d(a, b):
+    def blur2d(a):
+        b = output_tensor((30, 30), 'float32')
         for i in range(30):
             ha = allocate((3, 30), 'float32')
             for j in range(3):
@@ -313,15 +339,15 @@ def blur2d(a, b):
                     ha[j, k] = a[i+j, k] + a[i+j, k+1] + a[i+j, k+2]
             for j in range(30):
                 b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
+        return b
 
     a = tvm.placeholder((32, 32), 'float32', 'a')
-    b = tvm.placeholder((30, 30), 'float32', 'b')
-
-    run_and_check(blur2d, [a, b], [b])
+    run_and_check(blur2d, [a])
 
     if tvm.gpu().exist:
         @tvm.hybrid.script
-        def share_vec_add(a, b, c):
+        def share_vec_add(a, b):
+            c = output_tensor((256, ), 'float32')
             shared = allocate((256, ), 'float32', 'shared')
             for i in bind("threadIdx.x", 256):
                 shared[i] = a[i]
@@ -330,23 +356,81 @@ def share_vec_add(a, b, c):
                 local[i] = b[i]
             for i in bind("threadIdx.x", 256):
                 c[i] = shared[i] + local[i]
+            return c
 
         a = tvm.placeholder((256, ), dtype='float32', name='a')
         b = tvm.placeholder((256, ), dtype='float32', name='b')
-        c = tvm.placeholder((256, ), dtype='float32', name='c')
-        run_and_check(share_vec_add, [a, b, c], [c], target='cuda')
+        run_and_check(share_vec_add, [a, b], target='cuda')
     else:
         print('[Warning] No GPU found! Skip shared mem test!')
 
+def test_upstream():
+    @tvm.hybrid.script
+    def upstream(a):
+        b = output_tensor((20, ), 'float32')
+        for i in range(20):
+            b[i] = a[i] * i
+        return b
+
+    a = tvm.placeholder((20, ), 'float32')
+    b = tvm.placeholder((20, ), 'float32')
+    c = tvm.compute((20, ), lambda x: a[x] + b[x])
+    d = upstream(c)
+    sch = tvm.create_schedule([c.op, d.op])
+    ir = tvm.lower(sch, [a, b, d], simple_mode=True)
+    func = tvm.build(sch, [a, b, d])
+    assert(func)
+
+    a = numpy.random.randn(20).astype('float32')
+    b = numpy.random.randn(20).astype('float32')
+    ref = numpy.zeros((20, ), 'float32')
+    for i in range(20):
+        ref[i] = (a[i] + b[i]) * i
+
+    tvm_a = tvm.nd.array(a)
+    tvm_b = tvm.nd.array(b)
+    tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32'))
+
+    func(tvm_a, tvm_b, tvm_d)
+    tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
+
+def test_downstream():
+    @tvm.hybrid.script
+    def downstream(a):
+        b = output_tensor((20, ), 'float32')
+        for i in range(20):
+            b[i] = a[i] * i
+        return b
+    
+    a = tvm.placeholder((20, ), 'float32')
+    b = downstream(a)
+    c = tvm.compute((20, ), lambda x: b[x] + 1.0)
+    sch = tvm.create_schedule(c.op)
+    module = tvm.build(sch, [a, c])
+    assert module
+
+    a = numpy.random.randn(20).astype('float32')
+    ref = numpy.zeros((20, )).astype('float32')
+    for i in range(20):
+        ref[i] = (a[i] * i) + 1.0
+
+    tvm_a = tvm.nd.array(a)
+    tvm_c = tvm.nd.array(numpy.zeros((20, )).astype('float32'))
+    module(tvm_a, tvm_c)
+    tvm.testing.assert_allclose(tvm_c.asnumpy(), ref, 1e-5, 1e-5)
+
 
 if __name__ == "__main__":
     test_outer_product()
     test_fanout()
-    test_failure()
     test_looptype()
     test_if()
     test_bind()
     test_math_intrin()
     test_non_zero()
     test_allocate()
+    #test_inplace()
+    test_upstream()
+    test_downstream()
+
 

From 8ac74173db6e2fab3b9ae76dd3e34f26940d2154 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Lorenzo=20D=C3=ADaz?=
 <6094231+javierlorenzod@users.noreply.github.com>
Date: Mon, 19 Nov 2018 18:00:55 +0100
Subject: [PATCH 372/529] Update README.md typo (#2132)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e2fc7b8c45d2..828b0f7e880b 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,6 @@ Acknowledgement
 ---------------
 We learnt a lot from the following projects when building TVM.
 - [Halide](https://github.com/halide/Halide): TVM uses [HalideIR](https://github.com/dmlc/HalideIR) as data structure for
-  arithematic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide.
+  arithmetic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide.
 - [Loopy](https://github.com/inducer/loopy): use of integer set analysis and its loop transformation primitives.
 - [Theano](https://github.com/Theano/Theano): the design inspiration of symbolic scan operator for recurrence.

From 70e140f9605a2693571096c44a3ea6ec2158273e Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 19 Nov 2018 14:23:37 -0500
Subject: [PATCH 373/529] Relay Op sprint (part 2) - Level 1 - log_softmax
 (#2128)

---
 python/tvm/relay/op/nn/_nn.py        | 10 +++++++++-
 src/relay/op/nn/nn.cc                | 13 ++++++++++++-
 tests/python/relay/test_op_level1.py | 15 +++++++++++----
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 8d53e27892bc..e30cf8ba2ccf 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -9,7 +9,6 @@
 reg.register_schedule("nn.relu", schedule_injective)
 reg.register_pattern("nn.relu", OpPattern.ELEMWISE)
 
-
 @reg.register_schedule("nn.softmax")
 def schedule_softmax(_, outputs, target):
     """Schedule definition of softmax"""
@@ -19,6 +18,15 @@ def schedule_softmax(_, outputs, target):
 reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
 
 
+@reg.register_schedule("nn.log_softmax")
+def schedule_log_softmax(_, outputs, target):
+    """Schedule definition of log_softmax"""
+    with target:
+        return topi.generic.schedule_softmax(outputs)
+
+reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
+
+
 # dense
 @reg.register_compute("nn.dense")
 def compute_dense(attrs, inputs, out_type, target):
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 16b65aeeab7f..dfa68197819b 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -291,7 +291,18 @@ RELAY_REGISTER_OP("nn.log_softmax")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  CHECK(param != nullptr);
+  CHECK(param->axis == -1 || param->axis == static_cast<int32_t>(inputs[0].ndim()) - 1)
+      << "log_softmax currently only works on last dimension";
+  return Array<Tensor>{ topi::nn::log_softmax(inputs[0]) };
+});
+
 
 
 // BatchFlatten
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 53de7aa26279..35844ddd4a3f 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -137,12 +137,19 @@ def test_softmax():
 
 
 def test_log_softmax():
-    n, d = tvm.var("n"), tvm.var("d")
-    x = relay.var("x", shape=(n, d))
-    y = relay.nn.log_softmax(x, axis=0)
+    shape = (10, 4)
+    x = relay.var("x", shape=shape)
+    y = relay.nn.log_softmax(x, axis=1)
     assert "nn.log_softmax" in y.astext()
     yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, d))
+    assert yy.checked_type == relay.TensorType(shape)
+    func = relay.Function([x], y)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_res = topi.testing.log_softmax_python(x_data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def test_concatenate():

From 291dbfb9c2fab266ba5ed0bb10032eb1f4f9e15e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 19 Nov 2018 12:26:35 -0800
Subject: [PATCH 374/529] [COMMUNITY] new community guideline (#2077)

---
 docs/contribute/community.rst | 53 +++++++++++------------------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
index 1023cf0ddccc..3a3e5ec3d0fd 100644
--- a/docs/contribute/community.rst
+++ b/docs/contribute/community.rst
@@ -1,51 +1,30 @@
-TVM Community Structure
+TVM Community Guideline
 =======================
 
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. There are several roles in the community:
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors.
 
-- Project Management Committee(PMC) Small group of active committers that moderate the discussion, RFC, manage project releases.
-- Committer Individual who has made substantial contributions to the project and is granted write access to the project and oversees the general direction of the projects.
-- Code Owner Individual who is responsible for a specific area of the codebase.
-- Reviewer Individual who is qualified to review for a specific area of the codebase.
-- Contributor Anyone who contributes to the project.
 
-This document explains responsibility and criteria for each role.
-See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors and their roles.
+General Development Process
+---------------------------
+Everyone in the community is welcomed to send patches, documents, and propose new directions to the project. The key guideline here is to enable everyone in the community to get involved and participate the decision and development.  When major changes are proposed, an RFC should be sent to allow discussion by the community. We encourage public discussion, archivable channels such as issues, discuss forum and mailing-list, so that everyone in the community can participate and review the process later.
 
+Code reviews are one of the key ways to ensure the quality of the code. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request needs to be reviewed before it gets merged. A committer who has the expertise of the corresponding area would moderate the pull request and the merge the code when it is ready. The corresponding committer could request multiple reviewers who are familiar with the area of the code. We encourage contributors to request code reviews themselves and help review each other's code -- remember everyone is volunteering their time to the community, high-quality code review itself costs as much as the actual code contribution, you could get your code quickly reviewed if you do others the same favor.
 
-Project Management Committee
-----------------------------
+The community should strive to reach a consensus on technical decisions through discussion. We expect committers and PMCs to moderate technical discussions in a diplomatic way, and provide suggestions with clear technical reasoning when necessary.
 
-The PMC consists of a small group of active committers that moderate the discussion, provide mentorship to committers and code owners and manage the project release. PMC members need to actively manage the general project directions. Note that most major design choices and proposed changes should reach consensus among the committers.
 
-Committer
----------
-
-Committers are individuals who are granted the write access to the project. Committers oversee the general project directions and participate in the evaluation of the RFCs involving major design changes. Here is a list of useful things to do to help become a committer.
-
-- Deep understanding of one or a few modules in the project.
-- Good understanding of general project structure, demonstrated by discussion over RFCs, code reviews and proposals of new features
-- Active history of code reviews that demonstrate a good technical ability
-- Contribution history of high-quality documentation and tutorials to the promote project
-- History of creating clean, maintainable code and including good test cases.
 
-New committers are nominated by current committers from current code owners.
-
-Code Owner
+Committers
 ----------
+Committers are individuals who are granted the write access to the project. A committer is usually responsible for a certain area or several areas of the code where they oversee the code review process. The area of contribution can take all forms, including code contributions and code reviews, documents, education, and outreach. Committers are essential for a high quality and healthy project. The community actively look for new committers from contributors. Here is a list of useful traits that help the community to recognize potential committers:
 
-A code owner is an individual who is responsible for a specific area of the code-base. Code owners are responsible for the areas they are in charge of and oversee the code review process of the corresponding module. Changes to a specific area need to be approved by one of its owners in order to be merged. Once a pull request is approved by the designated code owner, the code can be directly merged into the repo. Code owners are essential for a high quality and healthy codebase.
-
-We welcome new code owners that help to keep good code quality, testing, and documentation in specific areas. Here is a list of useful traits that help the community to recognize potential code owners:
+- Sustained contribution to the project, demonstrated by discussion over RFCs, code reviews and proposals of new features, and other development activities. Being familiar with, and being able to take ownership on one or several areas of the project.
+- Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review.  History of creating clean, maintainable code and including good test cases. Informative code reviews to help other contributors that adhere to a good standard.
+- Community involvement: active participation in the discussion forum, promote the projects via tutorials, talks and outreach. We encourage committers to collaborate broadly, e.g. do code reviews and discuss designs with community members that they do not interact physically.
 
-- High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review
-- Good coverage of tests and documentation in the contributions
-- Informative code reviews to help other contributors that adhere to a good standard, spot problems in contributions etc.
-- Active participation in the discussion forum
+The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines TVM a better community for everyone. PMCs should strive to identify new candidates outside of their own organization.
 
-Reviewer
---------
 
-A reviewer is an individual who actively contributed to the project and is willing to participate in the code review of new contributions. We invite reviewers from active contributors. The reviewer invitation will be sent to the potential reviewer’s email, so please log in to the discussion forum so that we can know which email address we could send an invitation to.
-We actively seek reviews from reviewers. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project.
-A pull request to the project has to be reviewed by a reviewer in order to be merged.
+Reviewers
+---------
+Reviewers are individuals who actively contributed to the project and are willing to participate in the code review of new contributions. We identify reviewers from active contributors. The committers should explicitly solicit reviews from reviewers.  High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request to the project has to be reviewed by at least one reviewer in order to be merged.

From a51a1205e18e7a531560345cb14975bbc8be1b68 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 19 Nov 2018 15:32:30 -0500
Subject: [PATCH 375/529] [TOPI] Minor fix in the LSTM recipe (#2131)

---
 topi/recipe/rnn/lstm.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/topi/recipe/rnn/lstm.py b/topi/recipe/rnn/lstm.py
index 53ccbe598c3d..f627d6ce8f8e 100644
--- a/topi/recipe/rnn/lstm.py
+++ b/topi/recipe/rnn/lstm.py
@@ -1,8 +1,6 @@
 """LSTM Example, still work in progress.."""
 import tvm
-import time
 import os
-import argparse
 from tvm.contrib import nvcc
 import numpy as np
 
@@ -14,16 +12,19 @@
 SKIP_CHECK = False
 UNROLL_WLOAD = True
 
+
 @tvm.register_func
 def tvm_callback_cuda_compile(code):
     """Use nvcc compiler for better perf."""
     ptx =  nvcc.compile_cuda(code, target="ptx")
     return ptx
 
+
 def write_code(code, fname):
     with open(fname, "w") as f:
         f.write(code)
 
+
 @tvm.register_func
 def tvm_callback_cuda_postproc(code):
     if not os.path.exists("perf"):
@@ -33,16 +34,16 @@ def tvm_callback_cuda_postproc(code):
         code = open("perf/%s_manual.cu" % TASK).read()
     return code
 
+
 def lstm():
     if not PERSIST_KERNEL:
         raise ValueError("Non persist LSTM not yet supported")
-    detect_global_barrier = DETECT_GLOBAL_BARRIER
     num_thread_y = 8
-    num_thread_x = 16 * 3 / 2
+    num_thread_x = 16 * 3 // 2
     num_sm = 24
     n_num_step = 128
     num_step = tvm.var('num_step')
-    num_hidden = 1152 / 2
+    num_hidden = 1152 // 2
     batch_size = 1
     # Global transition matrix
     # Input hidden channel can be pre-caculated by a gemm
@@ -165,11 +166,9 @@ def check_device(target):
         flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
         ctx.sync()
         # measure time cost of second step.
-        tstart = time.time()
-        flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
-        ctx.sync()
-        tgap = time.time() - tstart
-        print("Time cost=%g" % tgap)
+        evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
+        eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
+        print("Time cost=%g" % eval_result.mean)
 
     # set unroll_explicit for more readable code.
     with tvm.build_config(

From 5444c672b79e4502172c0abc8eb8232629c86e7e Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Mon, 19 Nov 2018 12:35:15 -0800
Subject: [PATCH 376/529] [WIP] [RPC] clean up uploaded modules (#2121)

 [RPC] clean up uploaded modules
---
 python/tvm/autotvm/measure/measure_methods.py |  6 ++++++
 python/tvm/rpc/client.py                      | 13 +++++++++++++
 src/runtime/file_util.cc                      |  4 ++++
 src/runtime/file_util.h                       |  6 ++++++
 src/runtime/rpc/rpc_server_env.cc             |  7 +++++++
 5 files changed, 36 insertions(+)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 802abe019013..ff93704edb44 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -467,6 +467,12 @@ def run_through_rpc(measure_input, build_result,
             ctx.sync()
 
         costs = time_f(*args).results
+
+        # clean up remote files
+        remote.remove(build_result.filename)
+        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
+        remote.remove('')
+
         if len(costs) > 2:  # remove largest and smallest value to reduce variance
             costs = list(costs)
             costs.sort()
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index ae44e5a79933..c975ec64aa76 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -103,6 +103,19 @@ def download(self, path):
                 "tvm.rpc.server.download")
         return self._remote_funcs["download"](path)
 
+    def remove(self, path):
+        """Remove file from remote temp folder.
+
+        Parameters
+        ----------
+        path: str
+            The relative location to remote temp folder.
+        """
+        if "remove" not in self._remote_funcs:
+            self._remote_funcs["remove"] = self.get_function(
+                "tvm.rpc.server.remove")
+        self._remote_funcs["remove"](path)
+
     def load_module(self, path):
         """Load a remote module, the file need to be uploaded first.
 
diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc
index 4df335a54f25..ff579d12112d 100644
--- a/src/runtime/file_util.cc
+++ b/src/runtime/file_util.cc
@@ -142,5 +142,9 @@ void LoadMetaDataFromFile(
   fs.close();
 }
 
+void RemoveFile(const std::string& file_name) {
+  std::remove(file_name.c_str());
+}
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h
index de520fa3158c..2b797614281b 100644
--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -71,6 +71,12 @@ void SaveMetaDataToFile(
 void LoadMetaDataFromFile(
     const std::string& file_name,
     std::unordered_map<std::string, FunctionInfo>* fmap);
+
+/*!
+ * \brief Remove (unlink) a file.
+ * \param file_name The file name.
+ */
+void RemoveFile(const std::string& file_name);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_FILE_UTIL_H_
diff --git a/src/runtime/rpc/rpc_server_env.cc b/src/runtime/rpc/rpc_server_env.cc
index ca91b88247e5..fb8d95d60b95 100644
--- a/src/runtime/rpc/rpc_server_env.cc
+++ b/src/runtime/rpc/rpc_server_env.cc
@@ -35,5 +35,12 @@ TVM_REGISTER_GLOBAL("tvm.rpc.server.download")
     *rv = arr;
   });
 
+TVM_REGISTER_GLOBAL("tvm.rpc.server.remove")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    std::string file_name = RPCGetPath(args[0]);
+    LOG(INFO) << "Remove " << file_name;
+    RemoveFile(file_name);
+  });
+
 }  // namespace runtime
 }  // namespace tvm

From a43dd3b5ec9237210119d0767c53353187846277 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 20 Nov 2018 02:07:46 +0530
Subject: [PATCH 377/529] [RELAY]sch & comp for ops in nn.py (#2092)

---
 include/tvm/relay/attrs/nn.h         |  2 +-
 python/tvm/relay/op/nn/_nn.py        | 45 ++++++++++++++++++++
 src/relay/op/nn/nn.cc                | 32 ++++++++++++--
 tests/python/relay/test_op_level2.py | 62 ++++++++++++++++++++++++++++
 tests/python/relay/test_op_level3.py | 41 ++++++++++++++++++
 5 files changed, 177 insertions(+), 5 deletions(-)

diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 5077c82412a6..33f18a89e3e8 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -327,7 +327,7 @@ struct BatchNormAttrs : public tvm::AttrsNode<BatchNormAttrs> {
 
 /*! \brief Attributes for LRN operator */
 struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
-  IndexExpr size;
+  int size;
   int axis;
   double bias;
   double alpha;
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index e30cf8ba2ccf..cd807ad62128 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -17,6 +17,7 @@ def schedule_softmax(_, outputs, target):
 
 reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
 
+schedule_broadcast = schedule_injective
 
 @reg.register_schedule("nn.log_softmax")
 def schedule_log_softmax(_, outputs, target):
@@ -194,3 +195,47 @@ def schedule_global_avg_pool2d(_, outs, target):
         return topi.generic.schedule_global_pool(outs)
 
 reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# leaky_relu
+reg.register_schedule("nn.leaky_relu", schedule_broadcast)
+reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE)
+
+# prelu
+reg.register_schedule("nn.prelu", schedule_broadcast)
+reg.register_pattern("nn.prelu", OpPattern.BROADCAST)
+
+# flatten
+reg.register_schedule("nn.batch_flatten", schedule_broadcast)
+reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE)
+
+
+# lrn
+@reg.register_compute("nn.lrn")
+def compute_lrn(attrs, inputs, out_dtype, target):
+    """Compute definition of lrn"""
+    assert len(inputs) == 1
+    return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis,
+                        attrs.alpha, attrs.beta, attrs.bias)]
+
+@reg.register_schedule("nn.lrn")
+def schedule_lrn(attrs, outs, target):
+    """Schedule definition of lrn"""
+    with target:
+        return topi.generic.schedule_lrn(outs)
+
+reg.register_pattern("nn.lrn", OpPattern.OPAQUE)
+
+
+# l2_normalize
+@reg.register_compute("nn.l2_normalize")
+def compute_l2_normalize(attrs, inputs, out_dtype, target):
+    """Compute definition of l2 normalize"""
+    return [topi.nn.l2_normalize(inputs[0], attrs.eps, attrs.axis)]
+
+@reg.register_schedule("nn.l2_normalize")
+def schedule_l2_normalize(attrs, outs, target):
+    """Schedule definition of l2 normalize"""
+    with target:
+        return topi.generic.schedule_l2_normalize(outs)
+
+reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index dfa68197819b..d00f05cfc6fe 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -9,6 +9,7 @@
 #include <tvm/relay/attrs/image.h>
 #include <topi/nn.h>
 #include <topi/nn/softmax.h>
+#include <topi/nn/flatten.h>
 #include <vector>
 #include "../type_relations.h"
 #include "../op_common.h"
@@ -169,7 +170,15 @@ RELAY_REGISTER_OP("nn.leaky_relu")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "Input data.")
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* param = attrs.as<LeakyReluAttrs>();
+    return Array<Tensor>{ topi::leaky_relu(inputs[0], param->alpha) };
+});
 
 
 TVM_REGISTER_NODE_TYPE(PReluAttrs);
@@ -225,7 +234,15 @@ where :math:`*` is an channelwise multiplication for each sample in the batch.
 .add_argument("data", "Tensor", "Input data.")
 .add_argument("alpha", "Tensor", "Input channelwise alpha.")
 .set_support_level(3)
-.add_type_rel("PRelu", PReluRel);
+.add_type_rel("PRelu", PReluRel)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* param = attrs.as<PReluAttrs>();
+    return Array<Tensor>{ topi::prelu(inputs[0], inputs[1], param->axis)};
+});
 
 
 TVM_REGISTER_API("relay.op.nn._make.softmax")
@@ -365,7 +382,14 @@ Example::
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("BatchFlatten", BatchFlattenRel);
+.add_type_rel("BatchFlatten", BatchFlattenRel)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    return Array<Tensor>{ topi::nn::flatten(inputs[0]) };
+});
 
 
 // relu
@@ -398,7 +422,7 @@ RELAY_REGISTER_OP("nn.relu")
 TVM_REGISTER_NODE_TYPE(LRNAttrs);
 
 Expr MakeLRN(Expr data,
-             IndexExpr size,
+             int size,
              int axis,
              double alpha,
              double beta,
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 7b3a6d3fe15e..1ae37240788f 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -295,6 +295,25 @@ def test_flatten_infer_type():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32")
 
+    shape = (1, 5, 10, 10)
+    o_shape = (1, 500)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    z = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(o_shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = x_data.flatten().reshape(o_shape)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
 def test_pad_infer_type():
     # entirely concrete case
     n, c, h, w = 1, 2, 3, 4
@@ -320,6 +339,29 @@ def test_lrn():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c , h, w))
 
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    size=5
+    axis=1
+    bias=0.5
+    alpha=.00001
+    beta=0.75
+    z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
 def test_l2_normalize():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = relay.var("x", shape=(n, c , h, w))
@@ -328,6 +370,26 @@ def test_l2_normalize():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c , h, w))
 
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    eps=0.001
+    axis=1
+    z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis])
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = topi.testing.l2_normalize_python(x_data, eps, axis)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
 
 if __name__ == "__main__":
     test_pool2d()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 26eccf991d0e..22469cc7fdbe 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -4,6 +4,7 @@
 import numpy as np
 from tvm import relay
 from tvm.relay import create_executor
+from tvm.relay.testing import ctx_list
 from nose.tools import raises
 
 def test_zeros_ones():
@@ -214,6 +215,25 @@ def test_infer_type_leaky_relu():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    z = relay.nn.leaky_relu(x, alpha=0.1)
+    assert "alpha=0.1" in z.astext()
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = np.where(x_data > 0, x_data, x_data * 0.1)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
 def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
     x = relay.var("data", relay.TensorType(data, dtype))
     if alpha:
@@ -230,6 +250,27 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
         alpha_shape = (data[axis],)
         assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32")
 
+    if all(isinstance(v, tvm.expr.Var) == 1 for v in data) or not alpha:
+        return
+
+    func = relay.Function([x, y], z)
+    x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype)
+    a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype)
+
+    if axis == 1:
+        ref_res = (x_data < 0) * (x_data * a_data.reshape(3, 1, 1)) + (x_data>=0) * x_data
+    else:
+        ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data>=0) * x_data
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, a_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data, a_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+
 def test_infer_type_prelu():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w))

From 510c9d51cb85c5482926da22d7c1424055bd0de1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 19 Nov 2018 14:59:41 -0800
Subject: [PATCH 378/529] [RELAY][BACKEND] Enable PlanMemory in the graph
 runtime. (#2120)

---
 include/tvm/relay/expr.h                      |   2 +
 python/tvm/relay/backend/_backend.py          |   1 +
 .../relay/backend/graph_runtime_codegen.py    |  33 +-
 python/tvm/relay/base.py                      |  14 +-
 src/relay/backend/graph_plan_memory.cc        | 349 ++++++++++++++++++
 src/relay/ir/text_printer.cc                  |  39 +-
 src/relay/pass/fuse_ops.cc                    |   2 +-
 .../relay/test_backend_graph_runtime.py       |  35 +-
 8 files changed, 450 insertions(+), 25 deletions(-)
 create mode 100644 src/relay/backend/graph_plan_memory.cc

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index c72612791b52..887d28b0fa9f 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -458,12 +458,14 @@ inline const TTypeNode* ExprNode::type_as() const {
 /*!
  * \brief Print node as text format.
  * \param node The node to be printed.
+ * \param show_meta_data Whether to print meta data section.
  * \param annotate An optional callback function for attaching
  *        additional comment block to an expr.
  * \return The text representation.
  */
 std::string RelayPrint(
     const NodeRef& node,
+    bool show_meta_data = true,
     runtime::TypedPackedFunc<std::string(Expr)> annotate = nullptr);
 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py
index b5454031cb4a..a51cc8072aac 100644
--- a/python/tvm/relay/backend/_backend.py
+++ b/python/tvm/relay/backend/_backend.py
@@ -55,6 +55,7 @@ def build(funcs, target, target_host=None):
     funcs : List[tvm.LoweredFunc]
          The list of lowered functions.
 
+
     target : tvm.Target
          The target to run the code on.
 
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
index 4bbab957ab1d..50568b58607b 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -21,6 +21,7 @@
 from __future__ import absolute_import
 import json
 import attr
+from . import _backend
 from . import compile_engine
 from ..op import Op
 from ..expr import Function, GlobalVar, ExprFunctor
@@ -103,11 +104,12 @@ def __init__(self, mod, target):
         self.nodes = []
         self.var_map = {}
         self.params = {}
+        self.storage_map = None
         self.compile_engine = compile_engine.get()
         self.lowered_funcs = set()
         self._name_map = {}
 
-    def add_node(self, node, checked_type):
+    def add_node(self, node, expr):
         """
         Add a node to the graph.
 
@@ -116,14 +118,21 @@ def add_node(self, node, checked_type):
         node: Node
             The node to add to the graph.
 
-        checked_type: Type
-            The type of the node.
+        expr: tvm.relay.Expr
+            The corresponding expression.
 
         Returns
         -------
         node_ref: Union[NodeRef, List[NodeRef]]
             A reference to the node.
         """
+        checked_type = expr.checked_type
+        # setup storage ids
+        assert expr in self.storage_map
+        node.attrs["storage_id"] = [
+            x.value for x in self.storage_map[expr]
+        ]
+
         node_id = len(self.nodes)
         self.nodes.append(node)
         # Tuple return value, flatten as tuple
@@ -168,7 +177,7 @@ def visit_constant(self, op):
         name = "p%d" % index
         self.params[name] = op.data
         node = InputNode(name, {})
-        return self.add_node(node, op.checked_type)
+        return self.add_node(node, op)
 
     def visit_function(self, _):
         raise RuntimeError("function not supported")
@@ -244,7 +253,7 @@ def visit_call(self, call):
         op_name = cached_func.func_name
         op_node = OpNode(self._get_unique_name(op_name), {},
                          op_name, inputs, {})
-        return self.add_node(op_node, call.checked_type)
+        return self.add_node(op_node, call)
 
     def _get_json(self):
         """
@@ -281,8 +290,7 @@ def _get_json(self):
             assert node.num_outputs == len(node.attrs["shape"])
             shapes += node.attrs["shape"]
             dltypes += node.attrs["dtype"]
-            for i in range(node.num_outputs):
-                storage_ids.append(i + num_entry)
+            storage_ids += node.attrs["storage_id"]
             num_entry += node.num_outputs
             node_row_ptr.append(num_entry)
 
@@ -302,6 +310,14 @@ def _get_json(self):
 
         return json.dumps(json_dict, indent=2)
 
+    def debug_dump_memory_plan(self, func):
+        """Debug function to dump memory plan."""
+        def _annotate(expr):
+            if expr in self.storage_map:
+                return str(self.storage_map[expr])
+            return ""
+        return func.astext(show_meta_data=False, annotate=_annotate)
+
     def codegen(self, func):
         """Compile a single function into a graph.
 
@@ -321,11 +337,12 @@ def codegen(self, func):
         params : Dict[str, tvm.nd.NDArray]
             Additional constant parameters.
         """
+        self.storage_map = _backend.GraphPlanMemory(func)
         # First we convert all the parameters into input nodes.
         for param in func.params:
             node = InputNode(param.name_hint, {})
             self.var_map[param] = self.add_node(
-                node, param.type_annotation)
+                node, param)
 
         # Then we compile the body into a graph which can depend
         # on input variables.
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index 012315b40f51..0feffeb809c5 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -23,7 +23,7 @@ def register_relay_node(type_key=None):
 
 class RelayNode(NodeBase):
     """Base class of all relay node."""
-    def astext(self, annotate=None):
+    def astext(self, show_meta_data=True, annotate=None):
         """Get the text format of the expression.
 
         Returns
@@ -31,11 +31,21 @@ def astext(self, annotate=None):
         text : str
             The text format of the expression.
 
+        show_meta_data : bool
+            Whether to include meta data section in the text
+            if there is meta data.
+
         annotate: Optional[relay.Expr->str]
             Optional annotate function to provide additional
             information in the comment block.
+
+        Note
+        ----
+        meta data section is necessary to fully parse the text format.
+        However, it can contain dumps that are big(constat weights),
+        so it can be helpful to skip printing the meta data section.
         """
-        return _expr.RelayPrint(self, annotate)
+        return _expr.RelayPrint(self, show_meta_data, annotate)
 
 
 @register_relay_node
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
new file mode 100644
index 000000000000..f3c3e2935d22
--- /dev/null
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -0,0 +1,349 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/graph_mem_alloca.cc
+ * \brief Memory index assignment pass for executing
+ *   the program in the graph runtime.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include "../../common/arena.h"
+
+namespace tvm {
+namespace relay {
+
+struct StorageToken {
+  /*! \brief Reference counter */
+  int ref_counter{0};
+  /*! \brief number of bytes */
+  size_t max_bytes{0};
+  /*! \brief The corresponding tensor type node. */
+  const TensorTypeNode* ttype{nullptr};
+  /*! \brief virtual device index */
+  int device_id{0};
+  /*! \brief The storage id */
+  int64_t storage_id{-1};
+};
+
+class StorageAllocaBaseVisitor : public ExprVisitor {
+ public:
+  // run the visitor on a function.
+  void Run(const Function& func) {
+    for (Var param : func->params) {
+      CreateToken(param.operator->(), false);
+    }
+    this->VisitExpr(func->body);
+  }
+
+  void VisitExpr_(const ConstantNode* op) final {
+    this->CreateToken(op, false);
+  }
+
+  void VisitExpr_(const VarNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const FunctionNode* op) final {
+    // do not recursive into sub function.
+  }
+
+  void VisitExpr_(const GlobalVarNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const OpNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const TupleNode* op) final {
+    std::vector<StorageToken*> fields;
+    for (Expr field : op->fields) {
+      auto tok = GetToken(field);
+      CHECK_EQ(tok.size(), 1U);
+      fields.push_back(tok[0]);
+    }
+    token_map_[op] = fields;
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) final {
+    const auto& tok = GetToken(op->tuple);
+    CHECK_LT(static_cast<size_t>(op->index), tok.size());
+    token_map_[op] = {tok[op->index]};
+  }
+
+  void VisitExpr_(const IfNode* op) final {
+    LOG(FATAL) << "if is not supported.";
+  }
+
+  void VisitExpr_(const LetNode* op) final {
+    auto token = GetToken(op->value);
+    token_map_[op->var.operator->()] = token;
+    token_map_[op] = GetToken(op->body);
+  }
+
+ protected:
+  /*! \brief internal token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> > token_map_;
+
+  /*!
+   * \brief Get the necessary token.
+   * \param expr The expression.
+   * \return The corresponding token.
+   */
+  const std::vector<StorageToken*>& GetToken(const Expr& expr) {
+    this->VisitExpr(expr);
+    auto it = token_map_.find(expr.operator->());
+    CHECK(it != token_map_.end());
+    return it->second;
+  }
+  /*!
+   * \brief Populate the token map to set op's tokens
+   * \param op The node to be processed.
+   * \param can_realloc Whether we can re-allocate the memory.
+   */
+  virtual void CreateToken(const ExprNode* op, bool can_realloc) = 0;
+};
+
+
+class StorageAllocaInit : protected StorageAllocaBaseVisitor {
+ public:
+  explicit StorageAllocaInit(common::Arena* arena)
+      : arena_(arena) {}
+
+
+  /*! \return The internal token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> >
+  GetInitTokenMap(const Function& func) {
+    this->Run(func);
+    return std::move(token_map_);
+  }
+
+
+ protected:
+  using StorageAllocaBaseVisitor::VisitExpr_;
+
+  void CreateToken(const ExprNode* op, bool can_realloc)  final {
+    CHECK(!token_map_.count(op));
+    std::vector<StorageToken*> tokens;
+    if (const auto* tuple_type = op->checked_type().as<TupleTypeNode>()) {
+      for (Type t : tuple_type->fields) {
+        const auto* ttype = t.as<TensorTypeNode>();
+        CHECK(ttype);
+        StorageToken* token = arena_->make<StorageToken>();
+        token->ttype = ttype;
+        tokens.push_back(token);
+      }
+    } else {
+      const auto* ttype = op->checked_type().as<TensorTypeNode>();
+      CHECK(ttype);
+      StorageToken* token = arena_->make<StorageToken>();
+      token->ttype = ttype;
+      tokens.push_back(token);
+    }
+    token_map_[op] = tokens;
+  }
+
+  void VisitExpr_(const CallNode* op) final {
+    // create token for the call node.
+    CreateToken(op, true);
+    // for each input, visit argument token.
+    for (Expr arg : op->args) {
+      for (StorageToken* tok : GetToken(arg)) {
+        tok->ref_counter += 1;
+      }
+    }
+  }
+
+ private:
+  // allocator
+  common::Arena* arena_;
+};
+
+
+class StorageAllocator : public StorageAllocaBaseVisitor {
+ public:
+  /*!
+   * \return totoal number of bytes allocated
+   */
+  size_t TotalAllocBytes() const {
+    size_t total = 0;
+    for (const auto* p : data_) {
+      total += p->max_bytes;
+    }
+    return total;
+  }
+
+  // Run storage allocation for a function.
+  Map<Expr, Array<Integer> > Plan(const Function& func) {
+    prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
+    this->Run(func);
+
+    Map<Expr, Array<Integer> > smap;
+
+    for (const auto& kv : token_map_) {
+      Array<Integer> vec;
+      for (StorageToken* tok : kv.second) {
+        vec.push_back(tok->storage_id);
+      }
+      smap.Set(GetRef<Expr>(kv.first), vec);
+    }
+    return smap;
+  }
+
+
+ protected:
+  using StorageAllocaBaseVisitor::VisitExpr_;
+  // override create token by getting token as prototype requirements.
+  void CreateToken(const ExprNode* op, bool can_realloc)  final {
+    CHECK(!token_map_.count(op));
+    auto it = prototype_.find(op);
+    CHECK(it != prototype_.end());
+    std::vector<StorageToken*> tokens;
+    for (StorageToken* tok : it->second) {
+      if (can_realloc) {
+        tokens.push_back(Request(tok));
+      } else {
+        // Allocate a new token,
+        StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+        // ensure it never get de-allocated.
+        allocated_tok->ref_counter += 1;
+        tokens.push_back(allocated_tok);
+      }
+    }
+    token_map_[op] = tokens;
+  }
+  // The call map
+  void VisitExpr_(const CallNode* op) final {
+    std::vector<StorageToken*> args;
+    // for each input, visit argument token.
+    for (Expr arg : op->args) {
+      for (StorageToken* tok : GetToken(arg)) {
+        args.push_back(tok);
+      }
+    }
+    // create token for the call node.
+    CreateToken(op, true);
+    // check if there is orphaned output that can be released immediately.
+    for (StorageToken* tok : token_map_.at(op)) {
+      CheckForRelease(tok);
+    }
+    for (StorageToken* tok : args) {
+      tok->ref_counter -= 1;
+      CheckForRelease(tok);
+    }
+  }
+  /*!
+   * \brief ceil(size/word_size) to get number of words.
+   * \param size The original size.
+   * \param word_size The element size.
+   */
+  static size_t DivRoundUp(size_t size, size_t word_size) {
+    return (size + word_size - 1) / word_size;
+  }
+  /*!
+   * \brief Get the memory requirement.
+   * \param prototype The prototype token.
+   * \return The required memory size.
+   */
+  size_t GetMemorySize(StorageToken* prototype) {
+    const TensorTypeNode* ttype = prototype->ttype;
+    CHECK(ttype != nullptr);
+    size_t size = 1;
+    for (IndexExpr dim : ttype->shape) {
+      const int64_t* pval = as_const_int(dim);
+      CHECK(pval != nullptr)
+          << "Cannot allocate memory symbolic tensor shape "
+          << ttype->shape;
+      size *= static_cast<size_t>(pval[0]);
+    }
+    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+    return size;
+  }
+  /*!
+   * \brief Request a storage token for a given prototype.
+   * \param prototype. The prototype storage token.
+   * \return The result token.
+   */
+  StorageToken* Request(StorageToken* prototype) {
+    // calculate the size;
+    size_t size = GetMemorySize(prototype);
+    // search memory block in [size / match_range_, size * match_range_)
+    if (match_range_ == 0) {
+      return this->Alloc(prototype, size);
+    }
+    auto begin = free_.lower_bound(size / match_range_);
+    auto mid = free_.lower_bound(size);
+    auto end = free_.upper_bound(size * match_range_);
+    // search for memory blocks larger than requested
+    for (auto it = mid; it != end; ++it) {
+      StorageToken *tok = it->second;
+      if (tok->device_id != prototype->device_id) continue;
+      CHECK_EQ(tok->ref_counter, 0);
+      // Use exect matching strategy
+      tok->max_bytes = std::max(size, tok->max_bytes);
+      tok->ref_counter = prototype->ref_counter;
+      // find a exact match, erase from map and return
+      free_.erase(it);
+      return tok;
+    }
+    // then search for memory blocks smaller than requested space
+    for (auto it = mid; it != begin;) {
+      --it;
+      StorageToken *tok = it->second;
+      if (tok->device_id != prototype->device_id) continue;
+      CHECK_EQ(tok->ref_counter, 0);
+      // Use exect matching strategy
+      tok->max_bytes = std::max(size, tok->max_bytes);
+      tok->ref_counter = prototype->ref_counter;
+      // erase from map and return
+      free_.erase(it);
+      return tok;
+    }
+    // cannot find anything return a new one.
+    return this->Alloc(prototype, size);
+  }
+  /*!
+   * \brief Allocate a storage token by consuming prototype
+   * \param prototype The prototype token.
+   * \param size The size of memory being requested.
+   */
+  StorageToken* Alloc(StorageToken* prototype, size_t size) {
+    prototype->max_bytes = size;
+    prototype->storage_id = static_cast<int64_t>(data_.size());
+    data_.push_back(prototype);
+    return prototype;
+  }
+  /*!
+   * \brief Check if we can release token.
+   * \tok The token to be released.
+   */
+  void CheckForRelease(StorageToken* tok) {
+    CHECK_GE(tok->storage_id, 0);
+    CHECK_GE(tok->ref_counter, 0);
+    if (tok->ref_counter == 0) {
+      free_.insert({tok->max_bytes, tok});
+    }
+  }
+
+ private:
+  // allocator
+  common::Arena arena_;
+  // scale used for rough match
+  size_t match_range_{16};
+  // free list of storage entry
+  std::multimap<size_t, StorageToken*> free_;
+  // all the storage resources available
+  std::vector<StorageToken*> data_;
+  /*! \brief internal prototype token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> > prototype_;
+};
+
+
+Map<Expr, Array<Integer> > GraphPlanMemory(const Function& func) {
+  return StorageAllocator().Plan(func);
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.GraphPlanMemory")
+.set_body_typed<Map<Expr, Array<Integer> >(const Function&)>(GraphPlanMemory);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index bfc5f0db52b7..5e97ce1010ad 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -113,6 +113,11 @@ class TextMetaDataContext {
     return SaveJSON(Array<NodeRef>(meta_data_));
   }
 
+  /*! \return whether the meta data context is empty. */
+  bool empty() const {
+    return meta_data_.empty();
+  }
+
  private:
   /*! \brief additional metadata stored in TVM json format */
   std::vector<NodeRef> meta_data_;
@@ -125,8 +130,9 @@ class TextPrinter :
     public TypeFunctor<void (const Type&, std::ostream& os)>,  // NOLINT(*)
     public AttrFunctor<void (const NodeRef&, std::ostream& os)> { // NOLINT(*)
  public:
-  explicit TextPrinter(runtime::TypedPackedFunc<std::string(Expr)> annotate)
-      : annotate_(annotate) {}
+  explicit TextPrinter(bool show_meta_data,
+                       runtime::TypedPackedFunc<std::string(Expr)> annotate)
+      : show_meta_data_(show_meta_data), annotate_(annotate) {}
   /*!
    * \brief Print a node to string.
    * \param node.
@@ -144,13 +150,17 @@ class TextPrinter :
     } else {
       stream_ << node;
     }
-    std::string meta_json = meta_.GetMetaSection();
-    if (meta_json.length() != 0) {
-      // append meta data in the end.
-      stream_ << "# meta data\n"
-              << "r\"\"\"\n"
-              << meta_json << "\n"
-              << "\"\"\"";
+    if (!meta_.empty()) {
+      if (show_meta_data_) {
+        std::string meta_json = meta_.GetMetaSection();
+        // append meta data in the end.
+        stream_ << "# meta data\n"
+                << "r\"\"\"\n"
+                << meta_json << "\n"
+                << "\"\"\"";
+      } else {
+        stream_ << "# meta data omitted. you can use show_meta_data=True to include meta-data\n";
+      }
     }
     return stream_.str();
   }
@@ -227,7 +237,9 @@ class TextPrinter :
     TextValue id = this->AllocTempVar();
     this->PrintIndent();
     stream_ << id << " = " << meta_.GetMetaNode(GetRef<NodeRef>(op));
-    this->PrintEndInst("\n");
+    this->PrintEndInst("");
+    this->PrintOptionalInfo(GetRef<Expr>(op));
+    stream_ << '\n';
     return id;
   }
 
@@ -697,6 +709,8 @@ class TextPrinter :
  private:
   class AttrPrinter;
   friend class AttrPrinter;
+  /*! \brief Whether to print meta data. */
+  bool show_meta_data_;
   /*! \brief additional comment function */
   runtime::TypedPackedFunc<std::string(Expr)> annotate_;
   /*! \brief meta data context */
@@ -790,13 +804,14 @@ void TextPrinter::PrintCallAttrs(const Expr& op,
 }
 
 std::string RelayPrint(const NodeRef& node,
+                       bool show_meta_data,
                        runtime::TypedPackedFunc<std::string(Expr)> annotate) {
-  return TextPrinter(annotate).Print(node);
+  return TextPrinter(show_meta_data, annotate).Print(node);
 }
 
 TVM_REGISTER_API("relay._expr.RelayPrint")
 .set_body_typed<std::string(
-    const NodeRef&,
+    const NodeRef&, bool,
     runtime::TypedPackedFunc<std::string(Expr)>)>(RelayPrint);
 
 }  // namespace relay
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index cb5f86f4b525..b9e0823e88fa 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -749,7 +749,7 @@ class FuseMutator : private ExprMutator {
   }
   // Debug function, dump the group assignment in text.
   void DebugDumpGroup(const Expr& body) {
-    std::string text = RelayPrint(body, [this](const Expr& expr) -> std::string {
+    std::string text = RelayPrint(body, false, [this](const Expr& expr) -> std::string {
         auto it = gmap_.find(expr.get());
         if (it == gmap_.end()) return "";
         std::ostringstream os;
diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
index 7b610f82f6a5..7baa906abacc 100644
--- a/tests/python/relay/test_backend_graph_runtime.py
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -77,7 +77,9 @@ def test_add_op_broadcast():
 def test_with_params():
     x = relay.var('x', shape=(10, 5))
     y = relay.var('y', shape=(1, 5))
-    func = relay.Function([x, y], add(x, y))
+    z = relay.add(x, y)
+    z = relay.exp(z)
+    func = relay.Function([x, y], z)
     x_data = np.random.rand(10, 5).astype('float32')
     y_data = np.random.rand(1, 5).astype('float32')
     params = {"y": y_data}
@@ -87,11 +89,40 @@ def test_with_params():
     mod.set_input(x=x_data)
     mod.run()
     res = mod.get_output(0).asnumpy()
-    ref_res = y_data + x_data
+    ref_res = np.exp(y_data + x_data)
     tvm.testing.assert_allclose(res, ref_res)
 
 
+def test_plan_memory():
+    # it is sufficient to cycle through two memories.
+
+    x = relay.var("x", shape=(10,))
+    y = relay.var("x", shape=(1,))
+    y2 = relay.exp(y)
+    z = relay.add(x, y2)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    func = relay.Function([x, y], z)
+    func = relay.ir_pass.infer_type(func)
+    func = relay.ir_pass.fuse_ops(func, opt_level=0)
+    func = relay.ir_pass.infer_type(func)
+    smap = relay.backend._backend.GraphPlanMemory(func)
+    storage_ids = set()
+    for k, v in smap.items():
+        for x in v:
+            storage_ids.add(x.value)
+
+    # Current rule requires vars have unique storage id
+    # because we don't do inplace, we will need another
+    # two alternating temporary space.
+    assert len(storage_ids) == 4
+
+
 if __name__ == "__main__":
+    test_plan_memory()
     test_with_params()
     test_add_op_scalar()
     test_add_op_tensor()

From 7199a4a4c0a9c74507e218df284aea6ad75590ab Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 19 Nov 2018 21:35:21 -0800
Subject: [PATCH 379/529] [Relay][Op] Add test for batch_flatten (#2134)

* Add tests for batch_flatten and softmax

* Softmax is already tested elsewhere
---
 python/tvm/relay/op/nn/_nn.py        |  1 +
 tests/python/relay/test_op_level2.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index cd807ad62128..b48bfde97f33 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -9,6 +9,7 @@
 reg.register_schedule("nn.relu", schedule_injective)
 reg.register_pattern("nn.relu", OpPattern.ELEMWISE)
 
+# softmax
 @reg.register_schedule("nn.softmax")
 def schedule_softmax(_, outputs, target):
     """Schedule definition of softmax"""
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 1ae37240788f..cd9321c5a91f 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -391,6 +391,27 @@ def test_l2_normalize():
         tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
 
 
+def batch_flatten(data):
+    shape = data.shape
+    target_dim = 1
+    for i in range(len(shape) - 1):
+        target_dim = target_dim * shape[i + 1]
+    return np.reshape(data, (shape[0], target_dim))
+
+
+def test_batch_flatten():
+    t1 = relay.TensorType((5, 10, 5))
+    x = relay.Var("x", t1)
+    func = relay.Function([x], relay.nn.batch_flatten(x))
+
+    data = np.random.rand(5, 10, 5).astype(t1.dtype)
+    ref_res = batch_flatten(data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+
 if __name__ == "__main__":
     test_pool2d()
     test_avg_pool2d_no_count_pad()
@@ -403,3 +424,4 @@ def test_l2_normalize():
     test_conv2d_transpose_infer_type()
     test_conv2d_transpose_run()
     test_conv2d_run()
+    test_batch_flatten()

From 5ab9847dea2a18f906b53a69c9b8f546bc3368d4 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 20 Nov 2018 22:50:09 +0530
Subject: [PATCH 380/529] [RELAY]Slice_like support (#2014)

---
 docs/langref/relay_op.rst             |   3 +-
 include/tvm/relay/attrs/transform.h   |  13 +++
 python/tvm/relay/op/_transform.py     |   6 +-
 python/tvm/relay/op/transform.py      |  26 +++++
 src/relay/op/tensor/transform.cc      | 147 ++++++++++++++++++++++++++
 tests/python/relay/test_op_level10.py |  62 +++++++++++
 6 files changed, 255 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index e99ac3c97f73..95581a54e5a1 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -143,6 +143,7 @@ This level support backpropagation of broadcast operators. It is temporary.
 
    tvm.relay.broadcast_to_like
    tvm.relay.collapse_sum_like
+   tvm.relay.slice_like
 
 
 Level 1 Definitions
@@ -231,7 +232,6 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.strided_slice
 
 
-
 Level 5 Definitions
 -------------------
 .. autofunction:: tvm.relay.image.resize
@@ -241,3 +241,4 @@ Level 10 Definitions
 --------------------
 .. autofunction:: tvm.relay.broadcast_to_like
 .. autofunction:: tvm.relay.collapse_sum_like
+.. autofunction:: tvm.relay.slice_like
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index fc539f3ce742..7a8129180c4d 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -138,6 +138,19 @@ struct StridedSliceAttrs : public tvm::AttrsNode<StridedSliceAttrs> {
         .describe("Stride values of the slice");
   }
 };
+
+
+struct SliceLikeAttrs : public tvm::AttrsNode<SliceLikeAttrs> {
+  Array<Integer> axes;
+
+  TVM_DECLARE_ATTRS(SliceLikeAttrs, "relay.attrs.SliceLikeAttrs") {
+    TVM_ATTR_FIELD(axes)
+        .describe("List of axes on which input data will be sliced according to the "
+                  "corresponding size of the second input. By default will slice "
+                  "on all axes. Negative axes mean counting in reverse.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 7867336d033f..01814e0f73e0 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -2,7 +2,11 @@
 """Backend compiler related feature registration"""
 from __future__ import absolute_import
 from . import op as _reg
-from .op import schedule_injective
+from .op import schedule_injective, OpPattern
 
 # strided_slice
 _reg.register_schedule("strided_slice", schedule_injective)
+
+# slice_like
+_reg.register_schedule("slice_like", schedule_injective)
+_reg.register_pattern("slice_like", OpPattern.INJECTIVE)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index e43a4a573e54..c5fedab054d2 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -361,3 +361,29 @@ def strided_slice(data, begin, end, strides=None):
     """
     strides = strides or []
     return _make.strided_slice(data, list(begin), list(end), list(strides))
+
+
+def slice_like(data, shape_like, axes=None):
+    """Slice the first input with respect to the second input.
+
+    For an input array with shape ``(d1, d2, ..., dk)``, `slice_like` operation slices the
+    the input array corresponding size of second array. By default will slice on all axes.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The source array.
+
+    shape_like : tvm.relay.Expr
+        The new shape.
+
+    axes : Optional[Tuple[int]]
+        List of axes on which input data will be sliced according to the corresponding size of
+        the second input. By default will slice on all axes. Negative axes mean counting in reverse.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.slice_like(data, shape_like, axes)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 98ac1c30b66c..7a3a2151158d 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1153,5 +1153,152 @@ the entries indicate where along axis the array is split.
 .set_support_level(3)
 .add_type_rel("Split", SplitRel);
 
+
+TVM_REGISTER_NODE_TYPE(SliceLikeAttrs);
+
+/*!
+* \brief SliceLikeRel User defined type constraint function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return False if the relation has not been resolved, it might be resolved later.
+*  True if this relation has been resolved.
+*/
+bool SliceLikeRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+
+  const auto* target = types[1].as<TensorTypeNode>();
+  if (target == nullptr) {
+    return false;
+  }
+
+  const auto param = attrs.as<SliceLikeAttrs>();
+  CHECK(param != nullptr);
+
+  const Array<IndexExpr> dshape = data->shape;
+  const Array<IndexExpr> target_shape = target->shape;
+  std::vector<IndexExpr>&& oshape = AsVector(dshape);
+
+  if (!param->axes.defined()) {
+    for (size_t i = 0; i < dshape.size(); ++i) {
+      if (i < target_shape.size()) {
+        oshape[i] = target_shape[i];
+        CHECK(reporter->Assert(oshape[i] <= dshape[i]))
+          << "End index of axis " << i << " exceeds input shape: "
+          << oshape[i] << " vs " << dshape[i];
+      }
+    }
+  } else {
+    CHECK(param->axes.size() != 0) << "Axes cannot be empty.";
+    for (Integer val : param->axes) {
+      int axis = val->value;
+      if (axis < 0) {
+        axis += dshape.size();
+      }
+      CHECK(axis < static_cast<int>(target_shape.size()))
+        << "Axis " << axis << " exceeds dimension "
+        << target_shape.size() << " of target_shape.";
+      oshape[axis] = target_shape[axis];
+      CHECK(reporter->Assert(oshape[axis] <= dshape[axis]))
+        << "End index of axis " << axis << " exceeds input shape: "
+        << oshape[axis] << " vs " << dshape[axis];
+    }
+  }
+
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeSliceLike(Expr data,
+                   Expr shape_like,
+                   Array<Integer> axes) {
+  auto attrs = make_node<SliceLikeAttrs>();
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("slice_like");
+  return CallNode::make(op, {data, shape_like}, Attrs(attrs), {});
+}
+
+// Adapter function to make int array.
+Array<Integer> GetIntArray(Array<IndexExpr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+        << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
+template<typename AttrType>
+Array<Tensor> SliceLikeCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  const auto* param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+  Array<IndexExpr> src_shape = inputs[0]->shape;
+  Array<IndexExpr> target_shape = inputs[1]->shape;
+  Array<IndexExpr> begin_idx, end_idx, strides;
+  for (size_t i = 0; i < src_shape.size(); ++i) {
+    begin_idx.push_back(0);
+    strides.push_back(1);
+  }
+  end_idx = Array<IndexExpr>(src_shape);
+  if (!param->axes.defined()) {
+    for (size_t i = 0; i < src_shape.size(); ++i) {
+      if (i < target_shape.size()) {
+        end_idx.Set(i, target_shape[i]);
+        CHECK_LE(topi::GetConstInt(end_idx[i]),
+                 topi::GetConstInt(src_shape[i]))
+          << "End index of axis " << i << " exceeds input shape: "
+          << topi::GetConstInt(end_idx[i]) << " vs "
+          << topi::GetConstInt(src_shape[i]);
+      }
+    }
+  } else {
+    for (int axis : param->axes) {
+      if (axis < 0) {
+        axis = static_cast<int>(src_shape.size()) + axis;
+      }
+      end_idx.Set(axis, target_shape[axis]);
+      CHECK_LE(topi::GetConstInt(end_idx[axis]),
+               topi::GetConstInt(src_shape[axis]))
+        << "End index of axis " << axis << " exceeds input shape: "
+        << topi::GetConstInt(end_idx[axis]) << " vs "
+        << topi::GetConstInt(src_shape[axis]);
+    }
+  }
+  return Array<Tensor>{
+    topi::strided_slice(inputs[0],
+                        GetIntArray(begin_idx),
+                        GetIntArray(end_idx),
+                        GetIntArray(strides))
+  };
+}
+
+
+TVM_REGISTER_API("relay.op._make.slice_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeSliceLike, args, rv);
+});
+
+
+RELAY_REGISTER_OP("slice_like")
+.describe(R"code(Slice the first input respect to the second input.
+)code" TVM_ADD_FILELINE)
+  .set_attrs_type_key("relay.attrs.SlicelikeAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("shape_like", "Tensor", "Shape tensor.")
+.set_support_level(10)
+.add_type_rel("SliceLike", SliceLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute<SliceLikeAttrs>);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 9486d029876d..ef1c57d263fa 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -1,7 +1,9 @@
 """ Support level10 operator test cases.
 """
+import numpy as np
 import tvm
 from tvm import relay
+from tvm.relay.testing import ctx_list
 
 def test_collapse_sum_like():
     x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8"))
@@ -18,6 +20,66 @@ def test_broadcast_to_like():
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.ty.TensorType((3, 4, 5, 6), "int8")
 
+
+def np_slice_like(np_data, np_shape_like, axis=None):
+    begin_idx = [0 for _ in np_data.shape]
+    end_idx = list(np_data.shape)
+    if axis:
+        for i in axis:
+            if i < 0:
+                i = len(np_data.shape) + i
+            end_idx[i] = np_shape_like.shape[i]
+    else:
+        for i in range(len(np_data.shape)):
+            if i < len(np_shape_like.shape):
+                end_idx[i] = np_shape_like.shape[i]
+    slice_idx = []
+    for b, e in zip(begin_idx, end_idx):
+        slice_idx.append(slice(b, e))
+    np_result = np_data[tuple(slice_idx)]
+    return np_result
+
+
+def verify_slice_like(data, slice_like, axes, output, dtype="float32"):
+    x = relay.var("data", relay.TensorType(data, dtype))
+    y = relay.var("slice_like", relay.TensorType(slice_like, dtype))
+    z = relay.slice_like(x, y, axes)
+    zz = relay.ir_pass.infer_type(z)
+    if axes:
+        assert "axes" in z.astext()
+    assert zz.checked_type == relay.ty.TensorType(output, dtype)
+
+    if all(isinstance(v, int) == 0 for v in data) or \
+        all(isinstance(v, int) == 0 for v in slice_like):
+        return
+
+    func = relay.Function([x, y], z)
+    x_data = np.random.uniform(size=data).astype(dtype)
+    y_data = np.random.uniform(size=slice_like).astype(dtype)
+    ref_res = np_slice_like(x_data, y_data, axes)
+
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data, y_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+def test_slice_like():
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
+    verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3))
+    verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1,2), output=(d2, d2, d3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3))
+    verify_slice_like(data=(1, 3, 224, 224),
+                      slice_like=(1, 3, 112, 112),
+                      axes=(2, 3),
+                      output=(1, 3, 112, 112))
+
+
 if __name__ == "__main__":
     test_collapse_sum_like()
     test_broadcast_to_like()
+    test_slice_like()

From b51524b87674ab6b8d880defe879154d5289eb37 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 10:31:19 -0800
Subject: [PATCH 381/529] [COMMUNITY] Update contributor list to reflect new
 guideline. (#2138)

---
 CONTRIBUTORS.md | 68 ++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 91ecb2851985..602663ee867a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -5,38 +5,48 @@ contribute to, and influence the direction of the project. We actively invite co
 
 See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines.
 
+
 ## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
-- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
+
+We add tag along with committer name to show areas that they are familiar with.
+We do encourage everyone to work anything they are interested in.
+
+- [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
+- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta,
+- [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 
 ## Reviewers
-- [Zhi Chen](https://github.com/zhiics)
-- [Xiaoqiang Dan](https://github.com/xqdan)
-- [Liangfu Chen](https://github.com/liangfu)
-- [Wuwei Lin](https://github.com/vinx13)
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Jared Roesch](https://github.com/jroesch)
-- [Siva](https://github.com/srkreddy1238)
-- [Siju Samuel](https://github.com/siju-samuel)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Yao Wang](https://github.com/kevinthesun)
-- [Jian Weng](https://github.com/were)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
+
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri
+- [Tianqi Chen](https://github.com/tqchen): @tqchen
+- [Liangfu Chen](https://github.com/liangfu): @liangfu
+- [Zhi Chen](https://github.com/zhiics): @zhiics
+- [Nick Hynes](https://github.com/nhynes): @nhynes
+- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei
+- [Yizhi Liu](https://github.com/yzhliu) : @yzhliu
+- [Zhixun Tan](https://github.com/phisiart): @phisiart
+- [Zhi Chen](https://github.com/zhiics): @zhiics
+- [Xiaoqiang Dan](https://github.com/xqdan): @xqdan
+- [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
+- [Wuwei Lin](https://github.com/vinx13): @vinx13
+- [Masahiro Masuda](https://github.com/masahi): @masahi
+- [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89
+- [Kazutaka Morita](https://github.com/kazum): @kazum
+- [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t
+- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909): @PariksheetPinjari909
+- [Jared Roesch](https://github.com/jroesch): @jroesch
+- [Siva](https://github.com/srkreddy1238): @srkreddy1238
+- [Siju Samuel](https://github.com/siju-samuel): @siju-samuel
+- [Haichen Shen](https://github.com/icemelon9): @icemelon9
+- [Alex Weaver](https://github.com/alex-weaver): @alex-weaver
+- [Yao Wang](https://github.com/kevinthesun): @kevinthesun
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly
+- [Jian Weng](https://github.com/were): @were
+- [Eddie Yan](https://github.com/eqy): @eqy
+- [Joshua Z. Zhang](https://github.com/zhreshold): @zhreshold
+- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)

From af477197b424b02d53e2f6b5fedc562f660bffc9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 10:33:42 -0800
Subject: [PATCH 382/529] Update CONTRIBUTORS.md

make name alphabetical
---
 CONTRIBUTORS.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 602663ee867a..0e77c34c115b 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -12,10 +12,11 @@ We add tag along with committer name to show areas that they are familiar with.
 We do encourage everyone to work anything they are interested in.
 
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
-- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta,
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
-- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
+
 
 ## Reviewers
 

From d3aa793b55eda37f7a93d05f15032fb7f1133970 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 10:47:22 -0800
Subject: [PATCH 383/529] [TEAM] Huyuwei -> committer (#2139)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 0e77c34c115b..1286716b70fa 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -12,6 +12,7 @@ We add tag along with committer name to show areas that they are familiar with.
 We do encourage everyone to work anything they are interested in.
 
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
+- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
 - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta

From 3359bafb056ee19f72b5f9835e375e7cc093751a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 10:55:41 -0800
Subject: [PATCH 384/529] [TEAM] adityaatluri -> committer (#2140)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 1286716b70fa..d6a6dbb67b7a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -11,6 +11,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 We add tag along with committer name to show areas that they are familiar with.
 We do encourage everyone to work anything they are interested in.
 
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
 - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
 - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay

From 99d870673c53f8c551f0fa92ff5c53f1e946125b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 11:05:42 -0800
Subject: [PATCH 385/529] [TEAM] Laurawly -> committer (#2141)

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index d6a6dbb67b7a..fb1828ae96fb 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,7 +18,7 @@ We do encourage everyone to work anything they are interested in.
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
-
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
 
 ## Reviewers
 

From 699bc5b875aab0cbe06f0369e3602f7a7dfbaa1c Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Tue, 20 Nov 2018 13:26:49 -0800
Subject: [PATCH 386/529] [TEAM] Lianmin Zheng -> committer (#2142)

---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fb1828ae96fb..6ca1e997450c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,7 +18,8 @@ We do encourage everyone to work anything they are interested in.
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
-- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly - topi
+- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers
 

From b91c076a9b9b9a39fca215568a2741e1bcd77dcb Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Tue, 20 Nov 2018 13:27:22 -0800
Subject: [PATCH 387/529] Add nick to committer (#2143)

---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6ca1e997450c..945dfb7b3ae5 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,7 +18,8 @@ We do encourage everyone to work anything they are interested in.
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
-- [Leyuan Wang](https://github.com/Laurawly): @Laurawly - topi
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
+- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers

From 253a3569e1b14feaf38d70c145209e0712357eac Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 20 Nov 2018 13:29:53 -0800
Subject: [PATCH 388/529] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 945dfb7b3ae5..cbdcf396e9b4 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,7 +19,7 @@ We do encourage everyone to work anything they are interested in.
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
 - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
-- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx
+- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers

From c1506ead51b96b063a9c5569c381c7e2fed1be0a Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Tue, 20 Nov 2018 22:50:37 -0800
Subject: [PATCH 389/529] fix dcgan layer naming overlap (#2145)

---
 python/tvm/relay/testing/dcgan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/testing/dcgan.py b/python/tvm/relay/testing/dcgan.py
index 96cd871e4122..d6c1d55df01a 100644
--- a/python/tvm/relay/testing/dcgan.py
+++ b/python/tvm/relay/testing/dcgan.py
@@ -36,7 +36,7 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
     """a block of deconv + batch norm + relu"""
     eps = 1e-5 + 1e-12
     net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
-    net = layers.batch_norm_infer(net, epsilon=eps, name="batch_norm")
+    net = layers.batch_norm_infer(net, epsilon=eps, name="%s_batch_norm" % prefix)
     net = relay.nn.relu(net)
     return net
 

From 55c3cdff56badd23adb61f93c7e5e3bfcd572550 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 21 Nov 2018 16:31:27 +0800
Subject: [PATCH 390/529] Fix relative import in x86 conv2d (#2149)

---
 topi/python/topi/x86/conv2d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index e48a95780e7f..1a73736264bd 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -21,7 +21,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth
     """
     if is_depthwise:
         wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype)
-        from depthwise_conv2d import _fallback_schedule
+        from .depthwise_conv2d import _fallback_schedule
         _fallback_schedule(cfg, wkl)
     else:
         wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)

From 7725518003e216c5b37c0c57be003a4ebb808d77 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 21 Nov 2018 09:48:58 -0800
Subject: [PATCH 391/529] [RELAY] Move Layout to tvm Node system (#2125)

---
 include/tvm/expr.h                |   2 +-
 src/relay/op/image/resize.cc      |   4 +-
 src/relay/op/layout.cc            |  80 +++++
 src/relay/op/layout.h             | 422 +++++++++++++++++++++++
 src/relay/op/nn/convolution.cc    |  15 +-
 src/relay/op/nn/layout.h          | 536 ------------------------------
 src/relay/op/nn/nn.cc             |   2 +-
 src/relay/op/nn/pad.cc            |   2 +-
 src/relay/op/nn/pooling.cc        |  30 +-
 src/relay/op/nn/upsampling.cc     |   4 +-
 src/relay/pass/fold_scale_axis.cc |  33 +-
 src/relay/pass/pattern_util.h     |   3 +-
 12 files changed, 551 insertions(+), 582 deletions(-)
 create mode 100644 src/relay/op/layout.cc
 create mode 100644 src/relay/op/layout.h
 delete mode 100644 src/relay/op/nn/layout.h

diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 37b122ae5b03..35083cafae81 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -85,7 +85,7 @@ class Var : public HalideIR::VarExpr {
 
 
 /*!
- * \brief Container of constant ineteger (IntImm).
+ * \brief Container of constant integer (IntImm).
  *
  * This is used to store and automate type check
  * attributes that must be constant integer.
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
index b4984becdf8b..bfa2ea4cdfa5 100644
--- a/src/relay/op/image/resize.cc
+++ b/src/relay/op/image/resize.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/image.h>
-#include "../nn/layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -25,7 +25,7 @@ bool ResizeRel(const Array<Type>& types,
   const ResizeAttrs* param = attrs.as<ResizeAttrs>();
   CHECK(param != nullptr);
   const Layout in_layout(param->layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Resize only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
 
diff --git a/src/relay/op/layout.cc b/src/relay/op/layout.cc
new file mode 100644
index 000000000000..98fea55aa4c1
--- /dev/null
+++ b/src/relay/op/layout.cc
@@ -0,0 +1,80 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/relay/op/layout.cc
+ * \brief Layout expression.
+ */
+
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(LayoutNode);
+
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  CHECK_EQ(src_layout.ndim(), src.size());
+  if (src_layout == dst_layout) {
+    return src;
+  } else if (!src_layout.defined()) {
+    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
+  } else if (!dst_layout.defined()) {
+    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
+  }
+
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from "
+    << src_layout << " to " << dst_layout;
+
+  std::vector<IndexExpr> dst(dst_layout.ndim());
+  for (size_t i = 0; i < src_layout.ndim(); ++i) {
+    Layout::LayoutDim src_dim = src_layout[i];
+    if (Layout::IsSuperdim(src_dim)) {
+      int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_dim));
+      int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_minor_pos = src_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_factor = src_layout.Subsizeof(src_dim);
+      int dst_factor = dst_layout.Subsizeof(src_dim);
+      IndexExpr src_dim_size = src[i];
+
+      if (src_minor_pos >= 0) {
+        CHECK(is_const_int(src[src_minor_pos], src_factor))
+          << "src shape " << Array<IndexExpr>(src)
+          << " does not agree with layout "
+          << src_layout;
+        src_dim_size *= src_factor;
+      }
+      dst[dst_major_pos] = src_dim_size;
+      if (dst_minor_pos >= 0) {
+        CHECK_GT(dst_factor, 0);
+        if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) {
+          CHECK_LE(dst_factor, const_src_dim_size[0])
+            << "Converting " << Array<IndexExpr>(src)
+            << " from " << src_layout
+            << " to " << dst_layout
+            << ": cannot split dimension size of "
+            << src_dim_size << " by " << dst_factor;
+        }
+        dst[dst_major_pos] /= dst_factor;
+        dst[dst_minor_pos] = dst_factor;
+      }
+    }
+  }
+  return dst;
+}
+
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  std::vector<IndexExpr> ret(src.size());
+  for (size_t i = 0; i < src.size(); ++i) {
+    ret[i] = src[i];
+  }
+  return ConvertLayout(ret, src_layout, dst_layout);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h
new file mode 100644
index 000000000000..97160f3cbb9e
--- /dev/null
+++ b/src/relay/op/layout.h
@@ -0,0 +1,422 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/op/layout.h
+ * \brief Layout expression.
+ *
+ *  This file is adapted from its nnvm counterpart and will keep involving
+ *  to the new layout system
+ *
+ *  The layout is composed of upper cases, lower cases and numbers,
+ *  where upper case indicates a (super-)dimension and
+ *  the corresponding lower case with factor size indicates the split (sub-)dimension.
+ *  For example, NCHW16c can describe a 5-D tensor of
+ *  [batch_size, channel, height, width, channel_block].
+ *  Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
+ */
+#ifndef TVM_RELAY_OP_LAYOUT_H_
+#define TVM_RELAY_OP_LAYOUT_H_
+
+#include <tvm/base.h>
+#include <tvm/expr.h>
+#include <tvm/relay/base.h>
+
+#include <string>
+#include <sstream>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace tvm {
+namespace relay {
+
+class LayoutNode : public Node {
+ public:
+  std::string name;
+  Array<Integer> superdim_pos;
+  Array<Integer> subdim_pos;
+  Array<Integer> subdim_size;
+  Array<Integer> layout_simplified;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("superdim_pos", &superdim_pos);
+    v->Visit("subdim_pos", &subdim_pos);
+    v->Visit("subdim_size", &subdim_size);
+    v->Visit("layout_simplified", &layout_simplified);
+  }
+
+  static constexpr const char* _type_key = "Layout";
+  TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node);
+};
+
+class Layout : public NodeRef {
+ public:
+  using LayoutDim = char;
+  static constexpr uint32_t kUniqueDim = 26;
+
+  explicit Layout(NodePtr<Node> n) : NodeRef(n) {}
+
+  /*! \brief default constructor */
+  Layout() : Layout("__undef__") {} // NOLINT(*)
+
+  /*! \brief construct from a string */
+  Layout(const char* str) : Layout(std::string(str)) {} // NOLINT(*)
+
+  /*!
+   * \brief construct from a string.
+   * \param layout input in layout convention:
+   *        upper case indicates a dimension and
+   *        the corresponding lower case with factor size
+   *        indicates the split dimension.
+   *        return undefined layout if "__undef__" is passed.
+   */
+  Layout(const std::string& layout) { // NOLINT(*)
+    if (layout.length() != 0) {
+      Parse(layout);
+    } else {
+      Parse("__undef__");
+    }
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  const LayoutNode* operator->() const {
+    return static_cast<const LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  LayoutNode* operator->() {
+    return static_cast<LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a super-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a super-dimension.
+   */
+  static bool IsSuperdim(LayoutDim dim) {
+    return dim >= 'A' && dim <= 'Z';
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a sub-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a sub-dimension.
+   */
+  static bool IsSubdim(LayoutDim dim) {
+    return dim >= 'a' && dim <= 'z';
+  }
+
+  /*!
+   * \brief Convert a given dimension to super-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSuperdim(LayoutDim dim) {
+    if (IsSubdim(dim)) {
+      return dim - 'a' + 'A';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Convert a given dimension to sub-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSubdim(LayoutDim dim) {
+    if (IsSuperdim(dim)) {
+      return dim - 'A' + 'a';
+    }
+    return dim;
+  }
+
+  /*!
+ * \brief Return an undefined layout.
+ * \return a (global) undefined layout.
+ */
+  static const Layout& Undef() {
+    static Layout undef;
+    return undef;
+  }
+
+  /*!
+   * \brief Two layouts are convertible only if
+   *        they have same set of super-dimensions.
+   *        e.g., NCHW, NCHW16c, NHWC are convertible between each other,
+   *        but NCHW, CHW, OIHW are not.
+   * \param dst the target layout
+   * \return Whether can be converted to dst layout.
+   */
+  bool Convertible(const Layout &dst) const {
+    const LayoutNode *n = operator->();
+    if (!this->defined() || !dst.defined()) return false;
+    for (size_t i = 0; i < kUniqueDim; ++i) {
+      if ((n->superdim_pos[i]->value >= 0 && dst->superdim_pos[i]->value < 0) ||
+          (n->superdim_pos[i]->value < 0 && dst->superdim_pos[i]->value >= 0)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Returns a sublayout which is the portion of the object
+   *        that starts at dimension \p pos and spans \p len dimensions
+   *        (or until the end of the layout, whichever comes first).
+   * \param pos The start position.
+   * \param len The length of the sub-layout.
+   * \return A newly constructed Layout object.
+   */
+  Layout Sublayout(size_t pos, size_t len) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (pos > ndim()) return Layout::Undef();
+    if (pos + len > ndim()) len = ndim() - pos;
+    if (len == 0) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (size_t i = pos; i < pos + len; ++i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << layout_simplified[i]->value;
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*! \return A newly constructed reversed Layout object. */
+  Layout Reverse() const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (!this->defined()) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (int64_t i = this->ndim() - 1; i >= 0; --i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << layout_simplified[i]->value;
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*!
+   * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos.
+   * \param dim The source dimension to be split. It must be a super-dimension.
+   * \param target_pos The target position of the newly split sub-dimension.
+   * \param size size of the sub-dimension.
+   * \return A newly constructed Layout object.
+   */
+  Layout Split(LayoutDim dim, size_t target_pos, uint32_t size) const {
+    const std::string &name = operator->()->name;
+    CHECK(target_pos <= this->ndim()) << "Invalid split position "
+                                      << target_pos << " for layout " << name;
+    CHECK(IsSuperdim(dim)) << "Cannot split a sub-dimension " << dim;
+    CHECK(this->Contains(dim)) << "Axis " << dim << " does not exist in " << name;
+    CHECK(!this->Contains(ToSubdim(dim))) << "Dimension " << dim
+                                           << " has already been split in "
+                                           << name;
+    CHECK(size > 0) << "Invalid split size " << size;
+    std::ostringstream new_layout;
+    for (size_t i = 0; i <= this->ndim(); ++i) {
+      if (i == target_pos) {
+        new_layout << size << Layout::ToSubdim(dim);
+      }
+      if (i == this->ndim()) break;
+      new_layout << this->at(i);
+    }
+    Layout x(new_layout.str());
+    return x;
+  }
+
+
+  /*! \return number of dimensions */
+  size_t ndim() const {
+    return operator->()->layout_simplified.size();
+  }
+
+  /*!
+   * \brief The description of the \p i-th dimension.
+   *        If it is a sub-dimension, the size will be returned as well,
+   *        e.g., 16c. Otherwise a single character is returned, e.g., C.
+   * \param i The position
+   * \return the description of the dimension.
+   */
+  std::string at(size_t i) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    CHECK_LT(i, this->ndim()) << "position " << i
+                              << " exceeds ndim=" << this->ndim();
+    std::ostringstream repr;
+    if (IsSubdim(layout_simplified[i]->value)) {
+      auto factor = Subsizeof(layout_simplified[i]->value);
+      CHECK_GT(factor, 0);
+      repr << factor;
+    }
+    repr << static_cast<char>(layout_simplified[i]->value);
+    return repr.str();
+  }
+
+  /*!
+   * \brief return the index of the input dimension.
+   *        If it is not found in the layout or the layout is undefined,
+   *        return -1.
+   * \param dim the input dimension.
+   * \return the index or -1 if not found.
+   */
+  int32_t Indexof(LayoutDim dim) const {
+    if (!this->defined()) return -1;
+    else if (IsSuperdim(dim)) return operator->()->superdim_pos[dim - 'A']->value;
+    else if (IsSubdim(dim)) return operator->()->subdim_pos[dim - 'a']->value;
+    return -1;
+  }
+
+  /*!
+   * \param dim the input super-dimension or sub-dimension.
+   * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension),
+   *         or the size of \p dim itself (if \p dim is a sub-dimension).
+   *         Return -1 if \p dim is not in the layout or the layout is undefined.
+   */
+  int64_t Subsizeof(LayoutDim dim) const {
+    CHECK(IsSuperdim(dim) || IsSubdim(dim)) << "Invalid dim " << dim;
+    if (!this->defined() || !this->Contains(ToSubdim(dim))) {
+      return -1;
+    }
+    int idx = ToSubdim(dim) - 'a';
+    return operator->()->subdim_size[idx]->value;
+  }
+
+  /*!
+   * \brief Whether the layout contains a dimension.
+   * \param dim dimension to be checked.
+   * \return Whether the layout contains the dimension.
+   */
+  bool Contains(LayoutDim dim) const {
+    if (IsSuperdim(dim)) {
+      return operator->()->superdim_pos[dim-'A']->value >= 0;
+    } else if (IsSubdim(dim)) {
+      return operator->()->subdim_pos[dim-'a']->value >= 0;
+    }
+    return false;
+  }
+
+  LayoutDim operator[](size_t i) const {
+    return operator->()->layout_simplified[i];
+  }
+
+  /*! \return whether the layout is defined */
+  bool defined() const {
+    return operator->()->name != "__undef__";
+  }
+  /*! \return the string description of the layout */
+  const std::string& name() const {
+    return operator->()->name;
+  }
+
+  /*!
+   * \brief Whether the two layouts are equal.
+   * \param rhs Another layout.
+   * \return whether the two layouts are equal.
+   */
+  bool Equals(const Layout &rhs) const {
+    return operator->()->name == rhs->name;
+  }
+
+  using ContainerType = LayoutNode;
+
+ private:
+  void Parse(const std::string &layout) {
+    node_ = make_node<LayoutNode>();
+
+    std::vector<uint32_t> superdim_pos(kUniqueDim, -1);
+    std::vector<uint32_t> subdim_pos(kUniqueDim, -1);
+    std::vector<uint32_t> subdim_size(kUniqueDim, -1);
+    std::vector<char> layout_simplified;
+
+    if (layout != "__undef__") {  // parse layout string
+      int32_t factor = 0;
+      uint32_t curr = 0;
+      for (size_t i = 0; i < layout.size(); ++i) {
+        const LayoutDim c = layout.at(i);
+        if (IsSuperdim(c)) {
+          int pos = c - 'A';
+          CHECK_EQ(factor, 0) << "Invalid layout " << layout
+                              << ": invalid factor size " << factor
+                              << " before dimension " << c;
+          CHECK_EQ(superdim_pos[pos], -1) << "Invalid layout " << layout
+                                          << ": duplicate dimension " << c;
+          superdim_pos[pos] = curr++;
+          layout_simplified.push_back(c);
+        } else if (IsSubdim(c)) {
+          int pos = c - 'a';
+          CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size "
+                              << factor << " for dimension " << c;
+          CHECK_EQ(subdim_pos[pos], -1) << "Invalid layout " << layout
+                                        << ": duplicate dimension " << c;
+          CHECK_EQ(subdim_size[pos], -1) << "Invalid layout " << layout
+                                         << ": duplicate dimension " << c;
+          subdim_pos[pos] = curr++;
+          subdim_size[pos] = factor;
+          layout_simplified.push_back(c);
+          factor = 0;
+        } else if (c >= '0' && c <= '9') {
+          CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
+          factor = factor * 10 + c - '0';
+        } else {
+          LOG(FATAL) << "Invalid layout " << layout;
+        }
+      }
+      CHECK(!layout_simplified.empty()) << "Invalid layout " << layout;
+      for (LayoutDim dim : layout_simplified) {
+        CHECK(IsSuperdim(dim) || superdim_pos[dim-'a'] >= 0)
+          << "Invalid layout " << layout << ": missing axis "
+          << static_cast<char>(dim - 'a' + 'A');
+      }
+    }
+
+    LayoutNode *node = operator->();
+    node->name = layout;
+
+    for (uint32_t i = 0; i < kUniqueDim; ++i) {
+      node->superdim_pos.push_back(superdim_pos[i]);
+      node->subdim_pos.push_back(subdim_pos[i]);
+      node->subdim_size.push_back(subdim_size[i]);
+    }
+    for (LayoutDim dim : layout_simplified) {
+      node->layout_simplified.push_back(dim);
+    }
+  }
+};
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_LAYOUT_H_
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index 8e1d9db50e7e..cb648166f7bb 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -6,7 +6,8 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
 #include <vector>
-#include "layout.h"
+
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -28,16 +29,16 @@ bool Conv2DRel(const Array<Type>& types,
   CHECK(param != nullptr);
   const Layout in_layout(param->data_layout);
   const Layout kernel_layout(param->weight_layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Conv only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
-  CHECK(kernel_layout.convertible(kOIHW))
+  CHECK(kernel_layout.Convertible(kOIHW))
     << "Conv only support kernel layouts that are convertible from OIHW."
     << " But got "<< kernel_layout;
 
   Layout out_layout(param->out_layout);
   if (!out_layout.defined()) out_layout = in_layout;
-  CHECK(out_layout.convertible(kNCHW))
+  CHECK(out_layout.Convertible(kNCHW))
       << "Conv only support output layouts that are convertible from NCHW."
       << " But got " << out_layout;
 
@@ -55,7 +56,7 @@ bool Conv2DRel(const Array<Type>& types,
          param->kernel_size[0],
          param->kernel_size[1]});
     wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
-    wshape[kernel_layout.indexof('O')] *= param->groups;
+    wshape[kernel_layout.Indexof('O')] *= param->groups;
     channels = param->channels;
     dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
     dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
@@ -177,10 +178,10 @@ bool Conv2DTransposeRel(const Array<Type>& types,
   CHECK(param != nullptr);
   const Layout in_layout(param->data_layout);
   const Layout kernel_layout(param->weight_layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Conv only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
-  CHECK(kernel_layout.convertible(kOIHW))
+  CHECK(kernel_layout.Convertible(kOIHW))
     << "Conv only support kernel layouts that are convertible from OIHW."
     << " But got "<< kernel_layout;
 
diff --git a/src/relay/op/nn/layout.h b/src/relay/op/nn/layout.h
deleted file mode 100644
index d9eb59d6e31c..000000000000
--- a/src/relay/op/nn/layout.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file relay/op/nn/layout.h
- * \brief Layout expression.
- *
- *  This file is adapted from its nnvm counterpart and will keep involving
- *  to the new layout system
- *
- *  The layout is composed of upper cases, lower cases and numbers,
- *  where upper case indicates a (super-)dimension and
- *  the corresponding lower case with factor size indicates the split (sub-)dimension.
- *  For example, NCHW16c can describe a 5-D tensor of
- *  [batch_size, channel, height, width, channel_block].
- *  Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
- */
-#ifndef TVM_RELAY_OP_NN_LAYOUT_H_
-#define TVM_RELAY_OP_NN_LAYOUT_H_
-
-#include <string>
-#include <sstream>
-#include <vector>
-#include <utility>
-#include <algorithm>
-
-namespace tvm {
-namespace relay {
-
-/*! \brief layout auxiliary structure */
-class Layout {
- public:
-  using LayoutDim = char;
-
-  /*! \brief default constructor */
-  Layout() : name_("__undef__") {} // NOLINT(*)
-
-  /*!
-   * \brief construct from a string.
-   * \param layout input in layout convention:
-   *        upper case indicates a dimension and
-   *        the corresponding lower case with factor size
-   *        indicates the split dimension.
-   *        return undefined layout if "__undef__" is passed.
-   */
-  Layout(const std::string& layout) { // NOLINT(*)
-    if (layout.length() != 0) {
-      parse(layout);
-    } else {
-      parse("__undef__");
-    }
-  }
-  /*!
-   * \brief copy constructor from another layout
-   * \param s the source layout
-   */
-  Layout(const Layout& s) { // NOLINT(*)
-    this->parse(s.name_);
-  }
-  /*!
-   * \brief move constructor from Layout
-   * \param src the source layout
-   */
-  Layout(Layout&& src) { // NOLINT(*)
-    this->swap(src);
-  }
-  /*!
-   * \brief assignment from another layout.
-   * \param src source layout
-   * \return reference of self
-   */
-  Layout& operator=(const Layout& src) {
-    this->parse(src.name_);
-    return *this;
-  }
-  /*!
-   * \brief assignment from rvalue of another layout.
-   * \param src source layout
-   * \return reference of self
-   */
-  Layout& operator=(Layout&& src) {
-    Layout(std::move(src)).swap(*this); // NOLINT(*)
-    return *this;
-  }
-  /*!
-   * \brief assignment from string.
-   * \param src source layout
-   * \return reference of self
-   */
-  Layout& operator=(const std::string& src) {
-    this->parse(src);
-    return *this;
-  }
-  /*!
-   * \return whether two layout equals
-   * \param s the layout to compare against
-   */
-  bool operator==(const Layout& s) const {
-    return name_ == s.name_;
-  }
-  /*!
-   * \return whether two layout not equal
-   * \param s the layout to compare against
-   */
-  bool operator!=(const Layout& s) const {
-    return !(*this == s);
-  }
-
-  /*!
-   * \brief Append the current layout by another.
-   * @param other the layout to be appended
-   * @return a new layout
-   */
-  Layout operator+(const Layout& other) const {
-    if (!this->defined() && !other.defined()) {
-      return Layout::Undef();
-    } else if (!this->defined()) {
-      return other;
-    } else if (!other.defined()) {
-      return *this;
-    }
-    return Layout(this->name_ + other.name_);
-  }
-
-  /*!
-   * \brief Check whether a given dimension is a super-dimension.
-   * \param dim input dimension
-   * \return Whether a given dimension is a super-dimension.
-   */
-  static bool is_superdim(LayoutDim dim) {
-    return dim >= 'A' && dim <= 'Z';
-  }
-
-  /*!
-   * \brief Check whether a given dimension is a sub-dimension.
-   * \param dim input dimension
-   * \return Whether a given dimension is a sub-dimension.
-   */
-  static bool is_subdim(LayoutDim dim) {
-    return dim >= 'a' && dim <= 'z';
-  }
-
-  /*!
-   * \brief Convert a given dimension to super-dimension.
-   * \param dim input dimension
-   * \return The converted description.
-   */
-  static LayoutDim to_superdim(LayoutDim dim) {
-    if (is_subdim(dim)) {
-      return dim - 'a' + 'A';
-    }
-    return dim;
-  }
-
-  /*!
-   * \brief Convert a given dimension to sub-dimension.
-   * \param dim input dimension
-   * \return The converted description.
-   */
-  static LayoutDim to_subdim(LayoutDim dim) {
-    if (is_superdim(dim)) {
-      return dim - 'A' + 'a';
-    }
-    return dim;
-  }
-
-  /*!
-   * \brief Return an undefined layout.
-   * \return a (global) undefined layout.
-   */
-  static const Layout& Undef() {
-    static Layout undef;
-    return undef;
-  }
-
-  /*!
-   * \brief Swap current object with other
-   * \param other another object to be swapped.
-   */
-  void swap(Layout& other) {  // NOLINT(*)
-    std::swap(name_, other.name_);
-    std::swap(superdim_pos_, other.superdim_pos_);
-    std::swap(subdim_pos_, other.subdim_pos_);
-    std::swap(subdim_size_, other.subdim_size_);
-    std::swap(layout_simplified_, other.layout_simplified_);
-  }
-
-  /*!
-   * \brief Two layouts are convertible only if
-   *        they have same set of super-dimensions.
-   *        e.g., NCHW, NCHW16c, NHWC are convertible between each other,
-   *        but NCHW, CHW, OIHW are not.
-   * \param dst the target layout
-   * \return Whether can be converted to dst layout.
-   */
-  bool convertible(const Layout &dst) const {
-    if (!this->defined() || !dst.defined()) return false;
-    for (size_t i = 0; i < kUniqueDim; ++i) {
-      if ((superdim_pos_[i] >= 0 && dst.superdim_pos_[i] < 0) ||
-          (superdim_pos_[i] < 0 && dst.superdim_pos_[i] >= 0)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /*!
-   * \brief Returns a sublayout which is the portion of the object
-   *        that starts at dimension \p pos and spans \p len dimensions
-   *        (or until the end of the layout, whichever comes first).
-   * \param pos The start position.
-   * \param len The length of the sub-layout.
-   * \return A newly constructed Layout object.
-   */
-  Layout sublayout(size_t pos, size_t len) const {
-    if (pos > ndim()) return Layout::Undef();
-    if (pos + len > ndim()) len = ndim() - pos;
-    if (len == 0) return Layout::Undef();
-    std::ostringstream new_layout;
-    for (size_t i = pos; i < pos + len; ++i) {
-      if (is_subdim(layout_simplified_[i])) {
-        auto block_size = this->subsizeof(layout_simplified_[i]);
-        CHECK_GT(block_size, 0);
-        new_layout << block_size;
-      }
-      new_layout << layout_simplified_[i];
-    }
-    return Layout(new_layout.str());
-  }
-
-  /*! \return A newly constructed reversed Layout object. */
-  Layout reverse() const {
-    if (!this->defined()) return Layout::Undef();
-    std::ostringstream new_layout;
-    for (int64_t i = this->ndim() - 1; i >= 0; --i) {
-      if (is_subdim(layout_simplified_[i])) {
-        auto block_size = this->subsizeof(layout_simplified_[i]);
-        CHECK_GT(block_size, 0);
-        new_layout << block_size;
-      }
-      new_layout << layout_simplified_[i];
-    }
-    return Layout(new_layout.str());
-  }
-
-  /*!
-   * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos.
-   * \param dim The source dimension to be split. It must be a super-dimension.
-   * \param target_pos The target position of the newly split sub-dimension.
-   * \param size size of the sub-dimension.
-   * \return A newly constructed Layout object.
-   */
-  Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const {
-    CHECK(target_pos <= this->ndim()) << "Invalid split position "
-                                      << target_pos << " for layout " << name_;
-    CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim;
-    CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_;
-    CHECK(!this->contains(to_subdim(dim))) << "Dimension " << dim
-                                           << " has already been split in "
-                                           << name_;
-    CHECK(size > 0) << "Invalid split size " << size;
-    std::ostringstream new_layout;
-    for (size_t i = 0; i <= this->ndim(); ++i) {
-      if (i == target_pos) {
-        new_layout << size << Layout::to_subdim(dim);
-      }
-      if (i == this->ndim()) break;
-      new_layout << this->at(i);
-    }
-    Layout x(new_layout.str());
-    return x;
-  }
-
-  using iterator = std::vector<LayoutDim>::const_iterator;
-  using reverse_iterator = std::vector<LayoutDim>::const_reverse_iterator;
-
-  /*! \return begin iterator */
-  iterator begin() const {
-    return layout_simplified_.begin();
-  }
-  /*! \return end iterator */
-  iterator end() const {
-    return layout_simplified_.end();
-  }
-  /*! \return rbegin iterator */
-  reverse_iterator rbegin() const {
-    return layout_simplified_.rbegin();
-  }
-  /*! \return rend iterator */
-  reverse_iterator rend() const {
-    return layout_simplified_.rend();
-  }
-
-  /*! \return number of dimensions */
-  size_t ndim() const {
-    return layout_simplified_.size();
-  }
-
-  /*!
-   * \brief The description of the \p i-th dimension.
-   *        If it is a sub-dimension, the size will be returned as well,
-   *        e.g., 16c. Otherwise a single character is returned, e.g., C.
-   * \param i The position
-   * \return the description of the dimension.
-   */
-  std::string at(size_t i) const {
-    CHECK_LT(i, this->ndim()) << "position " << i
-                              << " exceeds ndim=" << this->ndim();
-    std::ostringstream repr;
-    if (is_subdim(layout_simplified_[i])) {
-      auto factor = subsizeof(layout_simplified_[i]);
-      CHECK_GT(factor, 0);
-      repr << factor;
-    }
-    repr << layout_simplified_[i];
-    return repr.str();
-  }
-
-  /*!
-   * \brief return the index of the input dimension.
-   *        If it is not found in the layout or the layout is undefined,
-   *        return -1.
-   * \param dim the input dimension.
-   * \return the index or -1 if not found.
-   */
-  int32_t indexof(LayoutDim dim) const {
-    if (!this->defined()) return -1;
-    else if (is_superdim(dim)) return superdim_pos_[dim - 'A'];
-    else if (is_subdim(dim)) return subdim_pos_[dim - 'a'];
-    return -1;
-  }
-
-  /*!
-   * \param dim the input super-dimension or sub-dimension.
-   * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension),
-   *         or the size of \p dim itself (if \p dim is a sub-dimension).
-   *         Return -1 if \p dim is not in the layout or the layout is undefined.
-   */
-  int64_t subsizeof(LayoutDim dim) const {
-    CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim;
-    if (!this->defined() || !this->contains(to_subdim(dim))) {
-      return -1;
-    }
-    int idx = to_subdim(dim) - 'a';
-    return subdim_size_[idx];
-  }
-
-  /*!
-   * \brief Whether the layout contains a dimension.
-   * \param dim dimension to be checked.
-   * \return Whether the layout contains the dimension.
-   */
-  bool contains(LayoutDim dim) const {
-    if (is_superdim(dim)) {
-      return superdim_pos_[dim-'A'] >= 0;
-    } else if (is_subdim(dim)) {
-      return subdim_pos_[dim-'a'] >= 0;
-    }
-    return false;
-  }
-
-  LayoutDim operator[](size_t i) const {
-    return layout_simplified_[i];
-  }
-
-  /*! \return whether the layout is defined */
-  bool defined() const {
-    return name_ != "__undef__";
-  }
-
-  /*! \return the string description of the layout */
-  const std::string& name() const {
-    return name_;
-  }
-
-  /*!
-   * \brief Write layout in JSON format.
-   * \param writer JSONWriter
-   */
-  void Save(dmlc::JSONWriter* writer) const {
-    writer->Write(name_);
-  }
-
-  /*!
-   * \brief Load layout from JSON.
-   * \param reader JSONReader
-   */
-  void Load(dmlc::JSONReader* reader) {
-    std::string tmp;
-    reader->Read(&tmp);
-    this->parse(tmp);
-  }
-
-  /*!
-   * \brief allow output string of layout to ostream
-   * \param os the output stream
-   * \param l the layout
-   * \return the ostream
-   */
-  friend std::ostream& operator<<(std::ostream& os, const Layout& l) {
-    os << l.name_;
-    return os;
-  }
-
- private:
-  static const uint32_t kUniqueDim = 26;
-
-  std::string name_;
-  int32_t superdim_pos_[kUniqueDim];
-  int32_t subdim_pos_[kUniqueDim];
-  int64_t subdim_size_[kUniqueDim];
-  std::vector<LayoutDim> layout_simplified_;
-
-  void parse(const std::string& layout) {
-    name_ = layout;
-    std::fill_n(superdim_pos_, kUniqueDim, -1);
-    std::fill_n(subdim_pos_, kUniqueDim, -1);
-    std::fill_n(subdim_size_, kUniqueDim, -1);
-    layout_simplified_.clear();
-
-    if (layout == "__undef__") return;
-
-    int32_t factor = 0;
-    uint32_t curr = 0;
-    for (size_t i = 0; i < layout.size(); ++i) {
-      const LayoutDim c = layout.at(i);
-      if (is_superdim(c)) {
-        int pos = c - 'A';
-        CHECK_EQ(factor, 0) << "Invalid layout " << layout
-                            << ": invalid factor size " << factor
-                            << " before dimension " << c;
-        CHECK_EQ(superdim_pos_[pos], -1) << "Invalid layout " << layout
-                                         << ": duplicate dimension " << c;
-        superdim_pos_[pos] = curr++;
-        layout_simplified_.push_back(c);
-      } else if (is_subdim(c)) {
-        int pos = c - 'a';
-        CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size "
-                            << factor << " for dimension " << c;
-        CHECK_EQ(subdim_pos_[pos], -1) << "Invalid layout " << layout
-                                       << ": duplicate dimension " << c;
-        CHECK_EQ(subdim_size_[pos], -1) << "Invalid layout " << layout
-                                        << ": duplicate dimension " << c;
-        subdim_pos_[pos] = curr++;
-        subdim_size_[pos] = factor;
-        layout_simplified_.push_back(c);
-        factor = 0;
-      } else if (c >= '0' && c <= '9') {
-        CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
-        factor = factor * 10 + c - '0';
-      } else {
-        LOG(FATAL) << "Invalid layout " << layout;
-      }
-    }
-    CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout;
-    for (LayoutDim dim : layout_simplified_) {
-      CHECK(is_superdim(dim) || superdim_pos_[dim-'a'] >= 0)
-        << "Invalid layout " << layout << ": missing axis "
-        << static_cast<char>(dim - 'a' + 'A');
-    }
-  }
-};
-
-/*!
- * \brief Convert shape in src_layout to shape in dst_layout
- * \param src original shape
- * \param src_layout layout of original shape
- * \param dst_layout target layout
- * \return shape in target layout
- */
-inline std::vector<IndexExpr> ConvertLayout(
-    std::vector<IndexExpr> src,
-    const Layout& src_layout,
-    const Layout& dst_layout) {
-  CHECK_EQ(src_layout.ndim(), src.size());
-  if (src_layout == dst_layout) {
-    return src;
-  } else if (!src_layout.defined()) {
-    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
-  } else if (!dst_layout.defined()) {
-    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
-  }
-
-  CHECK(src_layout.convertible(dst_layout))
-      << "cannot convert from "
-      << src_layout << " to " << dst_layout;
-
-  std::vector<IndexExpr> dst(dst_layout.ndim());
-  for (size_t i = 0; i < src_layout.ndim(); ++i) {
-    Layout::LayoutDim src_dim = src_layout[i];
-    if (Layout::is_superdim(src_dim)) {
-      int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_dim));
-      int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_dim));
-      int src_minor_pos = src_layout.indexof(Layout::to_subdim(src_dim));
-      int src_factor = src_layout.subsizeof(src_dim);
-      int dst_factor = dst_layout.subsizeof(src_dim);
-      IndexExpr src_dim_size = src[i];
-
-      if (src_minor_pos >= 0) {
-        CHECK(is_const_int(src[src_minor_pos], src_factor))
-            << "src shape " << Array<IndexExpr>(src)
-            << " does not agree with layout "
-            << src_layout;
-        src_dim_size *= src_factor;
-      }
-      dst[dst_major_pos] = src_dim_size;
-      if (dst_minor_pos >= 0) {
-        CHECK_GT(dst_factor, 0);
-        if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) {
-          CHECK_LE(dst_factor, const_src_dim_size[0])
-              << "Converting " << Array<IndexExpr>(src)
-              << " from " << src_layout
-              << " to " << dst_layout
-              << ": cannot split dimension size of "
-              << src_dim_size << " by " << dst_factor;
-        }
-        dst[dst_major_pos] /= dst_factor;
-        dst[dst_minor_pos] = dst_factor;
-      }
-    }
-  }
-  return dst;
-}
-
-inline std::vector<IndexExpr> ConvertLayout(
-    const Array<IndexExpr>& src,
-    const Layout& src_layout,
-    const Layout& dst_layout) {
-  std::vector<IndexExpr> ret(src.size());
-  for (size_t i = 0; i < src.size(); ++i) {
-    ret[i] = src[i];
-  }
-  return ConvertLayout(ret, src_layout, dst_layout);
-}
-
-}  // namespace relay
-}  // namespace tvm
-#endif  // TVM_RELAY_OP_NN_LAYOUT_H_
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index d00f05cfc6fe..d3b454f35ede 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -13,7 +13,7 @@
 #include <vector>
 #include "../type_relations.h"
 #include "../op_common.h"
-#include "layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index da7db042178e..6e02d74e6ea8 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -7,7 +7,7 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
 #include <vector>
-#include "layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 0e54564e0032..0af0bbf63633 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -8,7 +8,7 @@
 #include <tvm/relay/attrs/nn.h>
 #include <topi/nn/pooling.h>
 #include <vector>
-#include "layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -33,13 +33,13 @@ bool Pool2DRel(const Array<Type>& types,
   CHECK(param != nullptr);
 
   Layout layout(param->layout);
-  CHECK(layout.contains('H') && layout.contains('W') &&
-        !layout.contains('h') && !layout.contains('w'))
+  CHECK(layout.Contains('H') && layout.Contains('W') &&
+        !layout.Contains('h') && !layout.Contains('w'))
     << "Invalid layout " << layout
     << ". Pool2D layout must have H and W, which cannot be split";
 
-  const auto hidx = layout.indexof('H');
-  const auto widx = layout.indexof('W');
+  const auto hidx = layout.Indexof('H');
+  const auto widx = layout.Indexof('W');
 
   IndexExpr pad_h, pad_w;
   if (param->padding.size() == 1) {
@@ -102,10 +102,10 @@ Array<Tensor> Pool2DCompute(const Attrs& attrs,
   auto padding = param->padding;
   auto ceil_mode = param->ceil_mode;
   Layout layout(param->layout);
-  CHECK(layout.convertible(Layout("NCHW")))
+  CHECK(layout.Convertible(Layout("NCHW")))
       << "max_pool2d currently only supports layouts that are convertible from NCHW";
-  CHECK_EQ(layout.indexof('h'), -1) << "max_pool2d does not support input split on height";
-  CHECK_EQ(layout.indexof('w'), -1) << "max_pool2d does not support input split on width";
+  CHECK_EQ(layout.Indexof('h'), -1) << "max_pool2d does not support input split on height";
+  CHECK_EQ(layout.Indexof('w'), -1) << "max_pool2d does not support input split on width";
 
   CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
       << "Pool2D only support 4-D input (e.g., NCHW)"
@@ -240,13 +240,13 @@ bool GlobalPool2DRel(const Array<Type>& types,
   CHECK(param != nullptr);
 
   Layout layout(param->layout);
-  CHECK(layout.contains('H') && layout.contains('W') &&
-        !layout.contains('h') && !layout.contains('w'))
+  CHECK(layout.Contains('H') && layout.Contains('W') &&
+        !layout.Contains('h') && !layout.Contains('w'))
     << "Invalid layout " << layout
     << ". Pool2D layout must have H and W, which cannot be split";
 
-  const auto hidx = layout.indexof('H');
-  const auto widx = layout.indexof('W');
+  const auto hidx = layout.Indexof('H');
+  const auto widx = layout.Indexof('W');
   std::vector<IndexExpr> oshape({dshape[0], dshape[1], dshape[2], dshape[3]});
   oshape[hidx] = oshape[widx] = 1;
 
@@ -264,11 +264,11 @@ Array<Tensor> GlobalPool2DCompute(const Attrs& attrs,
   const auto* param = attrs.as<GlobalPool2DAttrs>();
   CHECK(param != nullptr);
   Layout layout(param->layout);
-  CHECK(layout.convertible(Layout("NCHW")))
+  CHECK(layout.Convertible(Layout("NCHW")))
     << "global_avg_pool2d currently only supports layouts that are convertible from NCHW";
-  CHECK_EQ(layout.indexof('h'), -1)
+  CHECK_EQ(layout.Indexof('h'), -1)
     << "global_avg_pool2d does not support input split on height";
-  CHECK_EQ(layout.indexof('w'), -1)
+  CHECK_EQ(layout.Indexof('w'), -1)
     << "global_avg_pool2d does not support input split on width";
 
   CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
index 45bedd73c4c0..ed7b8449eace 100644
--- a/src/relay/op/nn/upsampling.cc
+++ b/src/relay/op/nn/upsampling.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
-#include "layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -25,7 +25,7 @@ bool UpSamplingRel(const Array<Type>& types,
   const UpSamplingAttrs* param = attrs.as<UpSamplingAttrs>();
   CHECK(param != nullptr);
   const Layout in_layout(param->layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "UpSampling only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
 
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index d3f7043088eb..96fe030c2d03 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -11,7 +11,8 @@
 #include <tvm/relay/expr_functor.h>
 #include "pattern_util.h"
 #include "pass_util.h"
-#include "../op/nn/layout.h"
+#include "../op/layout.h"
+
 
 namespace tvm {
 namespace relay {
@@ -378,8 +379,8 @@ Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
   CHECK(param != nullptr);
   Layout data_layout(param->data_layout);
   Layout weight_layout(param->weight_layout);
-  int c_big_axis = data_layout.indexof('C');
-  int c_small_axis = data_layout.indexof('c');
+  int c_big_axis = data_layout.Indexof('C');
+  int c_small_axis = data_layout.Indexof('c');
 
   CHECK_GE(c_big_axis, 0);
   AxesSet data_axes = NullValue<AxesSet>();
@@ -391,7 +392,7 @@ Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
   // only handle depthwise or full conv2d.
   // TODO(tvm-team) handle grouped conv by reshape + bcast
   bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
-  if (weight_layout.indexof('i') < 0 &&
+  if (weight_layout.Indexof('i') < 0 &&
       c_small_axis < 0 &&
       (param->groups == 1 || is_depthwise_conv2d)) {
     data_axes = {c_big_axis};
@@ -412,15 +413,15 @@ Expr Conv2DForwardRewrite(const Call& ref_call,
   CHECK(param != nullptr);
   Layout data_layout(param->data_layout);
   Layout weight_layout(param->weight_layout);
-  int c_big_axis = data_layout.indexof('C');
+  int c_big_axis = data_layout.Indexof('C');
   CHECK_GE(c_big_axis, 0);
   // For now, we only support simple pattern (no folded weight/data)
   // TODO(tvm-team) support general data layout
-  CHECK_EQ(weight_layout.indexof('i'), -1);
+  CHECK_EQ(weight_layout.Indexof('i'), -1);
   CHECK(sdata->axes.size() == 1 &&
         c_big_axis == sdata->axes[0]->value);
-  int big_oc_axis = weight_layout.indexof('O');
-  int big_ic_axis = weight_layout.indexof('I');
+  int big_oc_axis = weight_layout.Indexof('O');
+  int big_ic_axis = weight_layout.Indexof('I');
 
   // Check it must be depthwise or full conv2d.
   bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, weight_layout);
@@ -779,8 +780,8 @@ AxesSet Conv2DBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
     out_layout = Layout(param->data_layout);
   }
   Layout weight_layout(param->weight_layout);
-  int c_big_axis = out_layout.indexof('C');
-  int c_small_axis = out_layout.indexof('c');
+  int c_big_axis = out_layout.Indexof('C');
+  int c_small_axis = out_layout.Indexof('c');
 
   CHECK_GE(c_big_axis, 0);
   // For now, we only support simple pattern (no folded weight/data)
@@ -791,8 +792,8 @@ AxesSet Conv2DBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
   // only handle depthwise or full conv2d.
   // TODO(tvm-team) handle grouped conv by reshape + bcast
   bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
-  if (weight_layout.indexof('o') < 0 &&
-      weight_layout.indexof('i') < 0 &&
+  if (weight_layout.Indexof('o') < 0 &&
+      weight_layout.Indexof('i') < 0 &&
       c_small_axis < 0 &&
       (param->groups == 1 || is_depthwise_conv2d)) {
     return {c_big_axis};
@@ -816,16 +817,16 @@ Expr Conv2DBackwardTransform(const Call& call,
     out_layout = Layout(param->data_layout);
   }
   Layout weight_layout(param->weight_layout);
-  int c_big_axis = out_layout.indexof('C');
+  int c_big_axis = out_layout.Indexof('C');
   CHECK_GE(c_big_axis, 0);
   // For now, we only support simple pattern (no folded weight/data)
   // TODO(tvm-team) support general data layout
-  CHECK_EQ(weight_layout.indexof('o'), -1);
-  CHECK_EQ(weight_layout.indexof('i'), -1);
+  CHECK_EQ(weight_layout.Indexof('o'), -1);
+  CHECK_EQ(weight_layout.Indexof('i'), -1);
   CHECK(axes.size() == 1 &&
         c_big_axis == axes[0]->value);
 
-  int big_oc_axis = weight_layout.indexof('O');
+  int big_oc_axis = weight_layout.Indexof('O');
   // Check it must be depthwise or full conv2d.
   bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
   CHECK(param->groups == 1 || is_depthwise_conv2d);
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index f8e67bac33c5..1c855d9a53cb 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -11,7 +11,8 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/attrs/transform.h>
-#include "../op/nn/layout.h"
+#include "../op/layout.h"
+
 
 namespace tvm {
 namespace relay {

From 984f4fbc5b44e6d027efdb83f0b76164f478bf15 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <ashutosh.parkhi@imgtec.com>
Date: Wed, 21 Nov 2018 23:25:15 +0530
Subject: [PATCH 392/529] tensorflow frontend supports user given outputs
 (#1913)

---
 nnvm/python/nnvm/frontend/tensorflow.py       |  17 ++-
 .../frontend/tensorflow/test_forward.py       | 120 ++++++++++++------
 2 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index e7282eb9afd6..13ed717b0450 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -1039,7 +1039,7 @@ def __init__(self):
         self._num_param = 0
         self._num_rnn_layer = False
 
-    def from_tensorflow(self, graph, layout="NHWC", shape=None):
+    def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
         """Construct nnvm nodes from tensorflow  graph definition - GraphDef.
 
         Follow the tensorflow graph definition to parse and convert it to NNVM.
@@ -1086,6 +1086,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None):
             raise NotImplementedError( \
                 "The following operators are not implemented: {}".format(missing_operators))
 
+        final_op = None
         # Parse the nodes to re-create TF graph using Symbol API of NNVM
         for node in graph.node:
             # Tensorflow doesn't have seperate list for params extraction.
@@ -1165,6 +1166,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None):
 
                 # Assuming only one output.
                 self._nodes[node.name] = op
+                final_op = op
 
             # Infer shapes if passed explicitely
             node_output = self._nodes[node.name]
@@ -1175,13 +1177,16 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None):
                 _, out_shapes = graph_util.infer_shape(g, **shape_dict)
                 self._output_shapes[node.name] = out_shapes
 
-        # Assume the final node is the output node
-        out = node_output
+        out = []
+        if outputs is None:
+            out.append(final_op)
+        else:
+            out = [self._nodes[out_name] for out_name in outputs]
 
         #Add the RNN outputs also with 'head' nodes of the nnvm graph
         if self._num_rnn_layer:
             out_rnn = _sym.concatenate(*self._out_rnn, axis=0)
-            out = [out, out_rnn]
+            out.append(out_rnn)
 
         if isinstance(out, list):
             out = _sym.Group(out)
@@ -1378,7 +1383,7 @@ def _fix_extranodes(self, op_name, attr, inputs):
 
         return inputs
 
-def from_tensorflow(graph, layout="NHWC", shape=None):
+def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None):
     """  Load tensorflow graph which is a python tensorflow graph object into nnvm graph.
     The companion parameters will be handled automatically.
 
@@ -1396,5 +1401,5 @@ def from_tensorflow(graph, layout="NHWC", shape=None):
         Dict of converted parameters stored in tvm.ndarray format
     """
     g = GraphProto()
-    sym, params = g.from_tensorflow(graph, layout, shape)
+    sym, params = g.from_tensorflow(graph, layout, shape, outputs)
     return sym, params
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 62d3577ba10a..e93f14ceb968 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -26,8 +26,15 @@
 #######################################################################
 # Generic run functions for TVM & tensorflow
 # ------------------------------------------
-def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'):
+def convert_to_list(x):
+    if not isinstance(x, list):
+        x = [x]
+    return x
+
+def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm', out_names=None):
     """ Generic function to compile on nnvm and execute on tvm """
+    input_data = convert_to_list(input_data)
+    input_node = convert_to_list(input_node)
 
     layout = None
     if target == "cuda":
@@ -43,8 +50,8 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'
     else:
         shape_dict = {input_node: input_data.shape}
         dtype_dict = {input_node: input_data.dtype}
-
-    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict)
+   
+    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict, outputs=out_names)
     graph, lib, params = nnvm.compiler.build(sym, target=target, target_host=target_host, shape=shape_dict,
                                              dtype=dtype_dict, params=params)
 
@@ -52,37 +59,34 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'
     from tvm.contrib import graph_runtime
     m = graph_runtime.create(graph, lib, ctx)
     # set inputs
-    if isinstance(input_data, list):
-        for i, e in enumerate(input_node):
-            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
-    else:
-        m.set_input(input_node, tvm.nd.array(input_data.astype(input_data.dtype)))
+    for i, e in enumerate(input_node):
+        m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
 
     m.set_input(**params)
     # execute
     m.run()
     # get outputs
-    if num_output > 1:
-        tvm_output_list = []
-        for i in range(0, num_output):
-            tvm_output = m.get_output(i)
-            tvm_output_list.append(tvm_output.asnumpy())
-        return tvm_output_list
-    else:
-        tvm_output = m.get_output(0)
-        return tvm_output.asnumpy()
+    assert out_names is None or num_output == len(out_names),"out_names: {} num_output: {}".format(
+                                                              out_names, num_output)
+    tvm_output_list = []
+    for i in range(0, num_output):
+        tvm_output = m.get_output(i)
+        tvm_output_list.append(tvm_output.asnumpy())
+    return tvm_output_list
 
 def run_tf_graph(sess, input_data, input_node, output_node):
     """ Generic function to execute tensorflow """
+    input_data = convert_to_list(input_data)
+    input_node = convert_to_list(input_node)
+    output_node = convert_to_list(output_node)
 
-    tensor = sess.graph.get_tensor_by_name(output_node)
+    tensor = [0] * len(output_node)
+    for i in range(len(output_node)):
+        tensor[i] = sess.graph.get_tensor_by_name(output_node[i])
 
-    if isinstance(input_data, list):
-        input_dict = {}
-        for i, e in enumerate(input_node):
-            input_dict[e] = input_data[i]
-    else:
-        input_dict = {input_node: input_data}
+    input_dict = {}
+    for i, e in enumerate(input_node):
+        input_dict[e] = input_data[i]
 
     output_data = sess.run(tensor, input_dict)
     return output_data
@@ -91,14 +95,16 @@ def run_tf_graph(sess, input_data, input_node, output_node):
 def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, no_gpu=False):
     """Generic function to generate and compare tensorflow and TVM output"""
 
-    out_node = out_name.split(':')[0] if ":" in out_name else out_name
+    out_name = convert_to_list(out_name)
+    out_node = [0]*len(out_name)
+    for i in range(len(out_name)):
+        out_node[i] = out_name[i].split(':')[0] if ":" in out_name[i] else out_name[i]
 
-    if isinstance(in_name, list):
-        in_node = [0]*len(in_name)
-        for i in range(len(in_name)):
-            in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i]
-    else:
-        in_node = in_name.split(':')[0] if ":" in in_name else in_name
+    in_data = convert_to_list(in_data)
+    in_name = convert_to_list(in_name)
+    in_node = [0]*len(in_name)
+    for i in range(len(in_name)):
+        in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i]
 
     with tf.Session() as sess:
         if init_global_variables:
@@ -106,9 +112,8 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False,
         final_graph_def = tf.graph_util.convert_variables_to_constants(
             sess,
             sess.graph.as_graph_def(add_shapes=True),
-            [out_node],
+            out_node,
             )
-
         tf_output = run_tf_graph(sess, in_data, in_name, out_name)
 
         for device in ["llvm", "cuda"]:
@@ -120,7 +125,10 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False,
                 continue
 
             tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target=device)
-            tvm.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+            # since the names from tensorflow and nnvm runs are not exactly same, 
+            # first len(tf_output) will be compared
+            for i in range(len(tf_output)):
+                tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
 
         sess.close()
 
@@ -259,6 +267,7 @@ def test_forward_reshape():
     _test_reshape(np.arange(6), [3, -1])
     _test_reshape(np.arange(6), [-1])
 
+#######################################################################
 #######################################################################
 # Squeeze
 # -------
@@ -508,6 +517,35 @@ def test_forward_multi_input():
         compare_tf_with_tvm([in_data, in_data, in_data, in_data],
                             ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
 
+#######################################################################
+# Multi Output to Graph
+# ---------------------
+
+def test_forward_multi_output():
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(tf.int32, shape=[3, 3], name='in1')
+        in2 = tf.placeholder(tf.int32, shape=[3, 3], name='in2')
+        in3 = tf.placeholder(tf.int32, shape=[3, 3], name='in3')
+        in4 = tf.placeholder(tf.int32, shape=[3, 3], name='in4')
+
+        out1 = tf.add(in1, in2, name='out1')
+        out2 = tf.subtract(in3, in4, name='out2')
+        in_data = np.arange(9, dtype='int32').reshape([3, 3])
+        in_data = [in_data] * 4
+        in_name = ['in1:0', 'in2:0', 'in3:0', 'in4:0']
+        out_name = ['out1:0', 'out2:0']
+        out_node = [out.strip(':0') for out in out_name]
+        in_node = [inp.strip(':0') for inp in in_name]
+        
+        with tf.Session() as sess:
+            final_graph_def = tf.graph_util.convert_variables_to_constants(
+                sess, sess.graph.as_graph_def(add_shapes=True), out_node,)
+            tf_output = run_tf_graph(sess, in_data, in_name, out_name)
+            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target='llvm',
+                                       out_names=out_node, num_output=2)
+            for i in range(len(tf_output)):
+                tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
+
 #######################################################################
 # Resize Bilinear
 # ---------------
@@ -580,7 +618,7 @@ def _get_tensorflow_output():
     out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
     out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
     tvm_out = [out, out_state_c, out_state_h]
-    tvm.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
+    tvm.testing.assert_allclose(tf_out[0], tvm_out[0], rtol=1e-3, atol=1e-3)
 
 def test_forward_lstm():
     '''test LSTM block cell'''
@@ -653,7 +691,7 @@ def test_forward_inception_v3():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', 'InceptionV3/Predictions/Reshape_1:0')
             tvm_output = run_tvm_graph(graph_def, data, 'input')
-            tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Inception V1
@@ -689,7 +727,7 @@ def test_forward_inception_v1():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0')
             tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents')
-            tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Mobilenet
@@ -712,7 +750,7 @@ def test_forward_mobilenet():
             graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, out_node)
             tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0')
             tvm_output = run_tvm_graph(graph_def, data, 'input')
-            tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # ResnetV2
@@ -731,7 +769,7 @@ def test_forward_resnetv2():
             with tf.Session() as sess:
                 tf_output = run_tf_graph(sess, data, 'input_tensor:0', out_node + ':0')
                 tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', tf_output.shape, 'float32')
-                tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+                tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # PTB
@@ -797,6 +835,7 @@ def _get_sample(data, state):
             state_output = model.get_output(1, tvm.nd.empty(out_state_shape,
                                                         "float32")).asnumpy()
             sample = nnvm.testing.tf.pick_from_weight(tvm_output[0])
+
             return sample, state_output
 
         for x in data:
@@ -942,7 +981,7 @@ def test_forward_leaky_relu():
     with tf.Graph().as_default():
         in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
         tf.nn.leaky_relu(in1, alpha=0.4)
-        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu:0')
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu/mul:0')
 
 def test_forward_elu():
     ishape = (1, 3, 10, 10)
@@ -1042,6 +1081,7 @@ def test_forward_rel_ops():
 
     # General
     test_forward_multi_input()
+    test_forward_multi_output()
     test_forward_variable()
 
     # End to End

From 3d1a4157f1eae75716f9904000f8084e143db688 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Wed, 21 Nov 2018 23:27:33 +0530
Subject: [PATCH 393/529] [FRONTEND][TENSORFLOW] Enable strided_slice with fix.
 (#2002)

---
 nnvm/python/nnvm/frontend/tensorflow.py       | 32 +++++++++++--------
 .../frontend/tensorflow/test_forward.py       |  8 +++--
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 13ed717b0450..b01d489fb042 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -569,6 +569,7 @@ def _transform_mask(stride_dim, ellipsis_mask):
             m_begin = [0] * data_dim
             m_end = [0] * data_dim
             m_stride = [0] * data_dim
+            fshape_indices = []
             #Count new axis after ellipsis_mask, consider while applying ellipsis_mask.
             ellipsis_seen = False
             new_axes_after_ellipsis = 0
@@ -593,7 +594,10 @@ def _transform_mask(stride_dim, ellipsis_mask):
                         m_begin[final_index] = 0
                         m_end[final_index] = data_shape[0][final_index]
                         m_stride[final_index] = 1
+                        fshape_indices.append(final_index)
                         final_index += 1
+                elif mask &new_axis_mask:
+                    fshape_indices.append(-1)
                 elif not mask & new_axis_mask:
                     if final_index == len(m_begin):
                         break
@@ -614,28 +618,30 @@ def _transform_mask(stride_dim, ellipsis_mask):
                                                  if begin[index] < 0 else begin[index]
                         m_end[final_index] = begin[index] + 1
                         m_stride[final_index] = 1
+                        fshape_indices.append(-2)
+                    else:
+                        fshape_indices.append(final_index)
+
                     final_index += 1
-            return m_begin, m_end, m_stride
+            return m_begin, m_end, m_stride, fshape_indices
 
+        fshape_indices = None
         if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask:
-            begin, end, stride = _transform_mask(stride_dim, ellipsis_mask)
+            begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask)
         out = _sym.strided_slice(inputs[0], begin=begin, end=end, stride=stride)
         out_shape = _infer_out_shapes(out, params)[0]
+        if not fshape_indices:
+            fshape_indices = range(len(out_shape))
 
         #Create final output shape.
         final_output = []
-        out_index = 0
-        index = 0
-        while out_index != len(out_shape):
-            #axis with shrink_axis_mask dimension=1 and it is ignored.
-            mask = 1 << index
-            if (new_axis_mask & mask) and not ellipsis_mask & mask:
+        for gather_index in fshape_indices:
+            if gather_index == -1:
                 final_output.append(1)
-            elif (not mask & shrink_axis_mask) or index >= stride_dim:
-                #Shrink is considered till stride_dim
-                final_output.append(out_shape[out_index])
-                out_index += 1
-            index += 1
+            elif gather_index == -2:
+                pass
+            else:
+                final_output.append(out_shape[gather_index])
         return _sym.reshape(out, shape=tuple(final_output))
     return _impl
 
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index e93f14ceb968..c98748c0fc03 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -435,11 +435,15 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype,
 
 def test_forward_stridedslice():
     '''test StridedSlice'''
-    return
+
     _test_stridedslice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], 'float32')
     _test_stridedslice((3, 4, 3), [1, 0], [4, 3], [2, 1], 'float32', ellipsis_mask=8)
+    _test_stridedslice((3, 4, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2)
+    _test_stridedslice((3, 4, 5, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2)
+    _test_stridedslice((3, 4, 5, 3), [1, 0, 1], [4, 2, 2], [2, 1, 1], 'float32', ellipsis_mask=2)
     _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 2], [2, 1, 1], 'float32', new_axis_mask=5)
     _test_stridedslice((3, 4, 3), [1, 1, 1], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=4)
+    _test_stridedslice((6, 4, 5), [1, 1, 1], [6, 3, 4], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=5)
     _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=4, new_axis_mask=2)
     _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3)
     _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3)
@@ -1056,7 +1060,7 @@ def test_forward_rel_ops():
     test_forward_resize_bilinear()
     test_forward_pad()
     test_forward_gather()
-    #test_forward_stridedslice()
+    test_forward_stridedslice()
 
     # Activations
     test_forward_sigmoid()

From 54d776f3430c0a3ef9e7a32a8fca1f5c474a8f25 Mon Sep 17 00:00:00 2001
From: Alexey Romanov <alexey.v.romanov@gmail.com>
Date: Wed, 21 Nov 2018 20:58:19 +0300
Subject: [PATCH 394/529] [FRONTEND][TENSORFLOW] Fix a typo in _matmul (#2152)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index b01d489fb042..b0b546a32b3d 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -342,7 +342,7 @@ def _matmul():
     def _impl(inputs, attr, params):
         channels = _infer_channels(inputs[1], params, not attr['transpose_b'])
         if attr['transpose_a']:
-            inputs[0] = _sym.transpose(inputs[0], axes(1, 0))
+            inputs[0] = _sym.transpose(inputs[0], axes=(1, 0))
         if not attr['transpose_b']:
             inputs[1] = _sym.transpose(inputs[1], axes=(1, 0))
         return AttrCvt(op_name="dense",

From b5bb67ecc0d2a67794af06e5c2a84557360d96dd Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Wed, 21 Nov 2018 17:11:33 -0500
Subject: [PATCH 395/529] [Relay] Port LSTM to Relay for testing (#2011)

---
 python/tvm/relay/testing/__init__.py       |   1 +
 python/tvm/relay/testing/layers.py         |   4 +-
 python/tvm/relay/testing/lstm.py           | 182 +++++++++++++++++++++
 src/relay/op/tensor/transform.cc           |   2 +-
 tests/python/relay/test_ir_text_printer.py |   8 +
 tests/python/relay/test_op_level3.py       |  13 ++
 6 files changed, 207 insertions(+), 3 deletions(-)
 create mode 100644 python/tvm/relay/testing/lstm.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 913f97ecd4a1..43160d64549c 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -6,4 +6,5 @@
 from . import dqn
 from . import dcgan
 from . import mobilenet
+from . import lstm
 from .config import ctx_list
diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py
index 1b279d9e72af..9d4d3b3b4e13 100644
--- a/python/tvm/relay/testing/layers.py
+++ b/python/tvm/relay/testing/layers.py
@@ -105,7 +105,7 @@ def conv2d_transpose(data, weight=None, **kwargs):
         weight = relay.var(name + "_weight")
     return relay.nn.conv2d_transpose(data, weight, **kwargs)
 
-def dense_add_bias(data, weight=None, bias=None, **kwargs):
+def dense_add_bias(data, weight=None, bias=None, units=None, **kwargs):
     """Wrapper of dense which automatically creates weights if not given.
 
     Parameters
@@ -133,6 +133,6 @@ def dense_add_bias(data, weight=None, bias=None, **kwargs):
         weight = relay.var(name + "_weight")
     if not bias:
         bias = relay.var(name + "_bias")
-    data = relay.nn.dense(data, weight, **kwargs)
+    data = relay.nn.dense(data, weight, units, **kwargs)
     data = relay.nn.bias_add(data, bias)
     return data
diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py
new file mode 100644
index 000000000000..47e68a988dab
--- /dev/null
+++ b/python/tvm/relay/testing/lstm.py
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Implementation of a Long Short-Term Memory (LSTM) cell.
+
+Adapted from:
+https://gist.github.com/merrymercy/5eb24e3b019f84200645bd001e9caae9
+"""
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
+    """Long-Short Term Memory (LSTM) network cell.
+
+    Parameters
+    ----------
+    num_hidden : int
+        Number of units in output symbol.
+
+    batch_size : int
+        Batch size (length of states).
+
+    Returns
+    -------
+    result : tvm.relay.Function
+        A Relay function that evaluates an LSTM cell.
+        The function takes in a tensor of input data, a tuple of two
+        states, and weights and biases for dense operations on the
+        inputs and on the state. It returns a tuple with two members,
+        an output tensor and a tuple of two new states.
+    """
+    builder = relay.ScopeBuilder()
+
+    input_type = relay.TensorType((batch_size, num_hidden), dtype)
+    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    bias_type = relay.TensorType((4*num_hidden,), dtype)
+
+    dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype)
+    slice_type = relay.TupleType([input_type, input_type,
+                                  input_type, input_type])
+    ret_type = relay.TupleType([input_type,
+                                relay.TupleType([input_type, input_type])])
+
+    inputs = relay.Var("inputs", input_type)
+    states = relay.Var("states",
+                       relay.TupleType([input_type, input_type]))
+
+    i2h_weight = relay.Var("i2h_weight", weight_type)
+    i2h_bias = relay.Var("i2h_bias", bias_type)
+
+    h2h_weight = relay.Var("h2h_weight", weight_type)
+    h2h_bias = relay.Var("h2h_bias", bias_type)
+
+    i2h = builder.let(("i2h", dense_type),
+                      layers.dense_add_bias(
+                          data=inputs,
+                          units=num_hidden * 4,
+                          weight=i2h_weight, bias=i2h_bias,
+                          name="%si2h" % name))
+    h2h = builder.let(("h2h", dense_type),
+                      layers.dense_add_bias(
+                          data=relay.TupleGetItem(states, 0),
+                          units=num_hidden * 4,
+                          weight=h2h_weight, bias=h2h_bias,
+                          name="%sh2h" % name))
+
+    gates = builder.let(("gates", dense_type), relay.add(i2h, h2h))
+    slice_gates = builder.let(("slice_gates", slice_type),
+                              relay.split(gates,
+                                          indices_or_sections=4,
+                                          axis=1).astuple())
+
+    in_gate = builder.let(("in_gate", input_type),
+                          relay.sigmoid(relay.TupleGetItem(slice_gates, 0)))
+    forget_gate = builder.let(("forget_gate", input_type),
+                              relay.sigmoid(relay.TupleGetItem(slice_gates, 1)))
+    in_transform = builder.let(("in_transform", input_type),
+                               relay.tanh(relay.TupleGetItem(slice_gates, 2)))
+    out_gate = builder.let(("out_gate", input_type),
+                           relay.sigmoid(relay.TupleGetItem(slice_gates, 3)))
+
+    next_c = builder.let(("next_c", input_type),
+                         relay.add(relay.multiply(forget_gate,
+                                                  relay.TupleGetItem(states, 1)),
+                                   relay.multiply(in_gate, in_transform)))
+    next_h = builder.let(("next_h", input_type),
+                         relay.multiply(out_gate, relay.tanh(next_c)))
+    ret = builder.let(("ret", ret_type),
+                      relay.Tuple([next_h, relay.Tuple([next_h, next_c])]))
+    builder.ret(ret)
+
+    body = builder.get()
+
+    return relay.Function([inputs, states, i2h_weight,
+                           i2h_bias, h2h_weight, h2h_bias],
+                          body, ret_type)
+
+
+def get_net(iterations, num_hidden, batch_size=1, dtype="float32"):
+    '''Constructs an unrolled RNN with LSTM cells'''
+    input_type = relay.TensorType((batch_size, num_hidden), dtype)
+    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    bias_type = relay.TensorType((4*num_hidden,), dtype)
+
+    state_type = relay.TupleType([input_type, input_type])
+    cell_type = relay.TupleType([input_type, state_type])
+
+    builder = relay.ScopeBuilder()
+
+    zeros = builder.let(("zeros", input_type),
+                        relay.zeros((batch_size, num_hidden), dtype))
+    init_states = builder.let(("init_states", state_type),
+                              relay.Tuple([zeros, zeros]))
+
+    states = init_states
+    out = None
+
+    for i in range(iterations):
+        inputs = relay.Var("data", input_type)
+        i2h_weight = relay.Var("i2h_%s_weight" % i, weight_type)
+        i2h_bias = relay.Var("i2h_%i_bias" % i, bias_type)
+        h2h_weight = relay.Var("h2h_%s_weight" % i, weight_type)
+        h2h_bias = relay.Var("h2h_%s_bias" % i, bias_type)
+
+        cell_fn = lstm_cell(num_hidden, batch_size, dtype, "lstm_%s" % i)
+
+        call = builder.let(("call_%s" % i, cell_type),
+                           relay.Call(cell_fn,
+                                      [inputs, states, i2h_weight,
+                                       i2h_bias, h2h_weight, h2h_bias]))
+        new_out = builder.let(("out_%s" % i, input_type),
+                              relay.TupleGetItem(call, 0))
+        new_states = builder.let(("states_%s" % i, state_type),
+                                 relay.TupleGetItem(call, 1))
+        states = new_states
+        out = new_out
+
+    builder.ret(out)
+    body = builder.get()
+    args = relay.ir_pass.free_vars(body)
+    return relay.Function(args, body, input_type)
+
+
+def get_workload(iterations, num_hidden, batch_size=1, dtype="float32"):
+    """Get benchmark workload for an LSTM RNN.
+
+    Parameters
+    ----------
+    iterations : int
+        The number of iterations in the desired LSTM RNN.
+    num_hidden : int
+        The size of the hiddxen state
+    batch_size : int, optional (default 1)
+        The batch size used in the model
+    dtype : str, optional (default "float32")
+        The data type
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(iterations, num_hidden, batch_size, dtype)
+    return create_workload(net)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 7a3a2151158d..a9e0a969fc5b 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1078,7 +1078,7 @@ bool SplitRel(const Array<Type>& types,
   }
   CHECK_LT(axis, data->shape.size())
     << "axis should be within the input dimension range.";
-  CHECK_GT(axis, 0)
+  CHECK_GE(axis, 0)
     << "axis should be within the input dimension range.";
 
   if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index d12804d512f0..30130fd7bcac 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -97,10 +97,12 @@ def test_variable_name():
     v1 = relay.var("1")
     assert "%v1" in v1.astext()
 
+
 def test_mlp():
     net, params = tvm.relay.testing.mlp.get_workload(batch_size=1)
     net.astext()
 
+
 def test_resnet():
     net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
     net.astext()
@@ -117,6 +119,12 @@ def test_dcgan():
     net, params = tvm.relay.testing.dcgan.get_workload(batch_size=1)
     net.astext()
 
+
+def test_lstm():
+    net, params = tvm.relay.testing.lstm.get_workload(4, 4)
+    net.astext()
+
+
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 22469cc7fdbe..6f8fbd551293 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -161,6 +161,14 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None):
                      relay.ty.TensorType((5, 1, 2, 2), "float32"),
                      relay.ty.TensorType((5, 1, 2, 2), "float32")])),
                   axis=1)
+    verify_split((5, 5, 2, 2), 5,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32")])),
+                  axis=0)
     verify_split((d1, d2, d3, d4), 4,
                  relay.ty.TupleType(tvm.convert([
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
@@ -168,6 +176,11 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None):
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32")])),
                   axis=2)
+    verify_split((d1, d2, d3, d4), 2,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32")])),
+                  axis=0)
     verify_split((d1, d2, d3, d4), (2, 4, 7),
                  relay.ty.TupleType(tvm.convert([
                      relay.ty.TensorType((d1, 2, d3, d4), "float32"),

From 150fb85530f051a7bb81646cf160fbbb3ac06ae1 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 22 Nov 2018 06:12:02 +0800
Subject: [PATCH 396/529] Alter op layout for group_conv2d on CUDA (#2148)

---
 nnvm/python/nnvm/top/nn.py               |  4 +--
 topi/python/topi/cuda/conv2d_winograd.py | 38 +++++++++++++++++++++---
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 2069a0a5ad50..74196c078798 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -108,7 +108,7 @@ def compute_conv2d(attrs, inputs, _):
          groups == channels:
         out = topi.nn.depthwise_conv2d_nchw(
             inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
-    elif layout == "NCHW":
+    elif layout in ["NCHW", "NCHW4c"]:
         out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
                                         out_dtype=out_dtype)
     elif layout == "NHWC" and \
@@ -146,7 +146,7 @@ def schedule_conv2d(attrs, outs, target):
             return topi.generic.schedule_depthwise_conv2d_nchw(outs)
         elif groups == channels and layout == "NHWC" and kernel_layout == "HWOI":
             return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
-        elif layout == "NCHW":
+        elif layout in ["NCHW", "NCHW4c"]:
             return topi.generic.schedule_group_conv2d_nchw(outs)
         else:
             raise ValueError("No compatible schedule")
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index 1f2112979ee7..d32a87ba6b9d 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -7,7 +7,7 @@
 from tvm import autotvm
 
 from .. import nn
-from ..nn import conv2d, conv2d_winograd_without_weight_transform
+from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform
 from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline
 from ..generic import schedule_conv2d_winograd_without_weight_transform
 
@@ -353,12 +353,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     CO, _, KH, KW = get_const_tuple(kernel.shape)
 
     dispatch_ctx = autotvm.DispatchContext.current
+    target = tvm.target.current_target()
 
     if groups == 1:
         # query config of this workload
-        workload = ('conv2d',) + autotvm.task.args_to_workload(
-            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype])
-        target = tvm.target.current_target()
+        workload = autotvm.task.args_to_workload(
+            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d)
         cfg = autotvm.DispatchContext.current.query(target, workload)
 
         if cfg.is_fallback:  # if is fallback, clear query cache and return None
@@ -411,6 +411,36 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+    elif groups != CI:
+        workload = autotvm.task.args_to_workload(
+            [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype],
+            group_conv2d_nchw)
+        cfg = autotvm.DispatchContext.current.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
+        if cfg.template_key == 'int8':
+            assert 'cuda' in target.keys
+            new_layout = 'NCHW4c'
+            new_attrs['layout'] = new_layout
+            new_attrs['out_layout'] = new_layout
+            new_attrs['kernel_layout'] = 'OIHW4o4i'
+            ic_block_factor = oc_block_factor = 4
+
+            # Store the same config for the altered operator (workload)
+            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
+                                       dtype=data.dtype)
+            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,\
+                                         KH, KW, oc_block_factor, ic_block_factor),
+                                         dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
+                group_conv2d_nchw
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
+            return sym.conv2d(*copy_inputs, **new_attrs)
 
     # do nothing for depthwise convolution
     return None

From 02c28d77cbafcf5fcc9580698b810521714b5bb4 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 21 Nov 2018 14:12:30 -0800
Subject: [PATCH 397/529] [TOPI] Fix atlest1d for reduce and squeeze (#2147)

---
 nnvm/include/nnvm/compiler/util.h            |  11 ++
 nnvm/src/top/tensor/reduce.cc                |  34 +++---
 nnvm/src/top/tensor/transform.cc             |   4 +-
 topi/include/topi/detail/fuse.h              |  14 +--
 topi/include/topi/nn/l2_normalize.h          |   2 +-
 topi/include/topi/nn/softmax.h               |   2 +-
 topi/include/topi/reduction.h                | 109 +++++++++++--------
 topi/include/topi/transform.h                |  22 ++--
 topi/python/topi/cuda/reduction.py           |   6 +-
 topi/src/topi.cc                             |   4 +-
 topi/tests/python/test_topi_reduce.py        |   4 +
 topi/tests/python/test_topi_transform.py     |   5 +-
 topi/tests/python_cpp/test_topi_transform.py |   5 +-
 13 files changed, 125 insertions(+), 97 deletions(-)

diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h
index 5d5bc4478530..0f7fb2a5c875 100644
--- a/nnvm/include/nnvm/compiler/util.h
+++ b/nnvm/include/nnvm/compiler/util.h
@@ -28,6 +28,17 @@ inline tvm::Array<tvm::Expr> ShapeToArray(TShape shape) {
   return result;
 }
 
+/*
+ * \brief Helper function to convert TShape to TVM array. Useful for
+ * passing data from NNVM param structures to TOPI ops.
+ *
+ * \param shape The shape to convert
+ *
+ * \return An Array of Expr, where each element is a constant int32
+ */
+inline tvm::Array<tvm::Integer> ShapeToIntArray(TShape shape) {
+  return tvm::Array<tvm::Integer>(ShapeToArray(shape).node_);
+}
 }  // namespace compiler
 }  // namespace nnvm
 #endif  // NNVM_COMPILER_UTIL_H_
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 7b768ac64304..007a3cc6e3fb 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -3,9 +3,6 @@
  * \file reduce.cc
  * \brief reduce operator.
  */
-// Enforce TOPI to use old behavior that reduces to at least 1d
-#define TOPI_REDUCE_ATLEAST1D 1
-
 #include <nnvm/op.h>
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
@@ -20,13 +17,12 @@
 #include "topi/reduction.h"
 #include "topi/transform.h"
 
-static_assert(TOPI_REDUCE_ATLEAST1D, "need to use legacy reduce behavior");
-
 namespace nnvm {
 namespace top {
 using namespace tvm;
 using namespace nnvm::compiler;
 
+
 // reduce
 DMLC_REGISTER_PARAMETER(ReduceParam);
 
@@ -168,9 +164,9 @@ Example::
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::sum(inputs[0], axis, param.keepdims) };
+      topi::sum(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -202,9 +198,9 @@ NNVM_REGISTER_REDUCE_OP(max)
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::max(inputs[0], axis, param.keepdims) };
+      topi::max(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -235,9 +231,9 @@ NNVM_REGISTER_REDUCE_OP(min)
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::min(inputs[0], axis, param.keepdims) };
+      topi::min(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -299,8 +295,8 @@ values over a given axis.
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
-    Tensor out = topi::argmax(inputs[0], axis, param.keepdims);
+    auto axis = ShapeToIntArray(r_axes);
+    Tensor out = topi::argmax(inputs[0], axis, param.keepdims, true);
     if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
     return Array<Tensor>{out};
 });
@@ -322,8 +318,8 @@ values over a given axis.
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
-    Tensor out = topi::argmin(inputs[0], axis, param.keepdims);
+    auto axis = ShapeToIntArray(r_axes);
+    Tensor out = topi::argmin(inputs[0], axis, param.keepdims, true);
     if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
     return Array<Tensor>{out};
 });
@@ -352,7 +348,7 @@ Example::
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
 
     Expr count = make_const(inputs[0]->dtype, 1);
     for (auto& i : r_axes) {
@@ -360,7 +356,7 @@ Example::
     }
 
     return Array<Tensor>{
-      topi::divide(topi::sum(inputs[0], axis, param.keepdims), count) };
+      topi::divide(topi::sum(inputs[0], axis, param.keepdims, true), count) };
 });
 
 NNVM_REGISTER_REDUCE_OP(prod)
@@ -387,9 +383,9 @@ Example::
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::prod(inputs[0], axis, param.keepdims) };
+      topi::prod(inputs[0], axis, param.keepdims, true) };
 });
 
 
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 2f42727d6083..492208ed7a7c 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -756,8 +756,8 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const SqueezeParam& param = nnvm::get<SqueezeParam>(attrs.parsed);
-    auto axis = ShapeToArray(param.axis);
-    return Array<Tensor>{ topi::squeeze(inputs[0], axis) };
+    auto axis = ShapeToIntArray(param.axis);
+    return Array<Tensor>{ topi::squeeze(inputs[0], axis, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
diff --git a/topi/include/topi/detail/fuse.h b/topi/include/topi/detail/fuse.h
index 9ee7fbd1cffd..85ca0f9efacb 100644
--- a/topi/include/topi/detail/fuse.h
+++ b/topi/include/topi/detail/fuse.h
@@ -14,22 +14,16 @@ using namespace tvm;
 
 /*!
  * \brief Fuse all of the given args
- * 
+ *
  * \param stage The stage in which to apply the fuse
  * \param args The iteration variables to be fused
  *
  * \return The fused iteration variable
  */
 inline IterVar Fuse(Stage stage, const Array<IterVar>& args) {
-  CHECK_GE(args.size(), 1) << "Fuse requires at least 1 arg";
-
-  auto fused = args[0];
-  for (size_t i = 1; i < args.size(); ++i) {
-    IterVar out;
-    stage.fuse(fused, args[i], &out);
-    fused = out;
-  }
-  return fused;
+  IterVar res;
+  stage.fuse(args, &res);
+  return res;
 }
 
 }  // namespace detail
diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h
index cda1f3b5c813..6d98a75ec157 100644
--- a/topi/include/topi/nn/l2_normalize.h
+++ b/topi/include/topi/nn/l2_normalize.h
@@ -27,7 +27,7 @@ using namespace tvm;
 */
 inline Tensor l2_normalize(const Tensor& data,
                            float eps,
-                           const Array<Expr>& axis,
+                           const Array<Integer>& axis,
                            std::string name = "tensor",
                            std::string tag = "l2_normalize") {
   CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input";
diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h
index d17f93046e72..8ee747ccd07c 100644
--- a/topi/include/topi/nn/softmax.h
+++ b/topi/include/topi/nn/softmax.h
@@ -40,7 +40,7 @@ inline Tensor softmax(const Tensor &x,
 
   auto k1 = tvm::reduce_axis(Range(0, input_shape[axis]), "k1");
   auto k2 = tvm::reduce_axis(Range(0, input_shape[axis]), "k2");
-  auto reduced_shape = MakeReduceTargetShape({axis}, x, false);
+  auto reduced_shape = MakeReduceTargetShape({axis}, x, false, false);
 
   auto insert_reduce_index = [axis, ndim](const Array<Var> &indices,
                                           const IterVar &reduce_index) {
diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index 777c103ec950..f26d14951fd4 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 #include <string>
-#include <set>
 #include <vector>
 #include <iterator>
 
@@ -20,13 +19,6 @@
 #include "topi/detail/constant_utils.h"
 #include "tvm/tvm.h"
 
-/*!
- * \brief macro flag to enable some legacy behavior which requires
- * reduction result to be at least 1d.
- */
-#ifndef TOPI_REDUCE_ATLEAST1D
-#define TOPI_REDUCE_ATLEAST1D 0
-#endif
 
 namespace topi {
 using namespace tvm;
@@ -42,30 +34,34 @@ using FCommReduce = std::function<
 * \brief Convert a reduction axis which could be empty or have negative
 * elements into a real axis with valid dimension indices.
 *
+* \param ndim Number of dimensions in the target.
+* \param axis The axis parameter.
+*
 * \return A non-empty sorted array of valid dimension indices, with no duplicates.
 * If the input axis is empty, the result will be an axis including all dimensions.
 * If any input element is negative, it will be treated as an offset from the
 * last dimension (same as python indexing rules).
 */
-inline std::vector<int> GetRealAxis(int ndim, const std::vector<int>& axis) {
+inline std::vector<int> GetRealAxis(int ndim, const Array<Integer>& axis) {
   std::vector<int> real_axis;
-  if (axis.size() == 0) {
+  if (!axis.defined() || axis.size() == 0) {
     for (int i = 0; i < ndim; ++i) {
       real_axis.push_back(i);
     }
   } else {
     // Use a set so duplicates are removed and the dims are sorted
-    std::set<int> dims;
-    for (auto ele : axis) {
-      if (ele < 0) {
-        ele += ndim;
-      }
-      if (ele >= ndim) {
-        LOG(ERROR) << ele << " exceeds the maximum dimension " << ndim;
+    for (auto elem : axis) {
+      int64_t val = elem->value;
+      if (val < 0) {
+        val += ndim;
       }
-      dims.emplace(ele);
+      CHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim;
+      CHECK_GE(val, 0);
+      real_axis.push_back(static_cast<int>(val));
     }
-    std::copy(dims.begin(), dims.end(), std::back_inserter(real_axis));
+    std::sort(real_axis.begin(), real_axis.end());
+    real_axis.resize(
+        std::unique(real_axis.begin(), real_axis.end()) - real_axis.begin());
   }
   return real_axis;
 }
@@ -84,7 +80,8 @@ inline Array<IterVar> MakeReduceAxes(const std::vector<int>& real_axis, const Te
 /*! \brief Calculate the target shape for a reduce op */
 inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
                                          const Tensor& data,
-                                         bool keepdims) {
+                                         bool keepdims,
+                                         bool atleast1d) {
   auto ndim = data->shape.size();
   Array<Expr> target_shape;
   if (keepdims) {
@@ -104,7 +101,7 @@ inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
       }
     }
   }
-  if (target_shape.size() == 0 && TOPI_REDUCE_ATLEAST1D) {
+  if (target_shape.size() == 0 && atleast1d) {
     target_shape.push_back(1);
   }
   return target_shape;
@@ -163,18 +160,19 @@ inline Tensor DoCommReduce(const Tensor& data,
  * \param keepdims If this is set to true, the axes which are reduced are
  * left in the result as dimensions with size one. This enables the result
  * to broadcast correctly against the input array.
+ * \param atleast1d Whether the output need to be atleast1d.
  *
  * \return The result tensor.
  */
 inline Tensor CommReduce(const Tensor& data,
-                         const Array<Expr>& axis,
+                         const Array<Integer>& axis,
                          FReduce func,
-                         bool keepdims = false) {
+                         bool keepdims,
+                         bool atleast1d) {
   auto ndim = data->shape.size();
   CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
-  auto axis_val = detail::GetConstIntValues(axis, "axis");
-  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis_val);
-  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims);
+  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
+  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d);
   return DoCommReduce(data, func, target_shape, real_axis,
       keepdims ? std::vector<int>() : real_axis);
 }
@@ -188,19 +186,20 @@ inline Tensor CommReduce(const Tensor& data,
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return The result tensor.
 */
 inline Tensor CommReduceIdx(const Tensor& data,
-                            const Array<Expr>& axis,
+                            const Array<Integer>& axis,
                             FCommReduce func,
-                            bool keepdims = false) {
+                            bool keepdims,
+                            bool atleast1d) {
   auto ndim = data->shape.size();
   CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
-  auto axis_val = detail::GetConstIntValues(axis, "axis");
-  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis_val);
+  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
   auto reduce_axes = MakeReduceAxes(real_axis, data);
-  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims);
+  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d);
 
   auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data]
   (const Array<Var>& indices) {
@@ -311,11 +310,15 @@ inline Expr ProdOp(Expr source, Array<IterVar> axis) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the sum operation
 */
-inline Tensor sum(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
-  return CommReduce(data, axis, tvm::sum, keepdims);
+inline Tensor sum(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, tvm::sum, keepdims, atleast1d);
 }
 
 inline Tensor collapse_sum(const Tensor& data, Array<Expr> target_shape) {
@@ -356,11 +359,15 @@ inline Tensor collapse_sum(const Tensor& data, Array<Expr> target_shape) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the min operation
 */
-inline Tensor min(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
-  return CommReduce(data, axis, MinOp, keepdims);
+inline Tensor min(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, MinOp, keepdims, atleast1d);
 }
 
 /*!
@@ -373,11 +380,15 @@ inline Tensor min(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the max operation
 */
-inline Tensor max(const Tensor& data, Array<Expr> axis, bool keepdims = false) {  // NOLINT(*)
-  return CommReduce(data, axis, MaxOp, keepdims);
+inline Tensor max(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, MaxOp, keepdims, atleast1d);
 }
 
 /*!
@@ -390,10 +401,14 @@ inline Tensor max(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the argmin operation
 */
-inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
+inline Tensor argmin(const Tensor& data,
+                     const Array<Integer>& axis,
+                     bool keepdims = false,
+                     bool atleast1d = false) {
   auto fcombine = [](Array<Var> lhs, Array<Var> rhs) {
     Array<Expr> result;
     result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0]));  // idx
@@ -407,7 +422,7 @@ inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false
     return result;
   };
   auto func = MakeCommReducer(fcombine, fidentity, "argmin");
-  return CommReduceIdx(data, axis, func, keepdims);
+  return CommReduceIdx(data, axis, func, keepdims, atleast1d);
 }
 
 /*!
@@ -420,10 +435,14 @@ inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the argmax operation
 */
-inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
+inline Tensor argmax(const Tensor& data,
+                     const Array<Integer>& axis,
+                     bool keepdims = false,
+                     bool atleast1d = false) {
   auto fcombine = [](Array<Var> lhs, Array<Var> rhs) {
     Array<Expr> result;
     result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0]));  // idx
@@ -437,7 +456,7 @@ inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false
     return result;
   };
   auto func = MakeCommReducer(fcombine, fidentity, "argmax");
-  return CommReduceIdx(data, axis, func, keepdims);
+  return CommReduceIdx(data, axis, func, keepdims, atleast1d);
 }
 
 /*!
@@ -449,11 +468,15 @@ inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the prod operation
 */
-inline Tensor prod(const Tensor& data, Array<Expr> axis, bool keepdims = false) {  // NOLINT(*)
-  return CommReduce(data, axis, ProdOp, keepdims);
+inline Tensor prod(const Tensor& data,
+                   const Array<Integer>& axis,
+                   bool keepdims = false,
+                   bool atleast1d = false) {
+  return CommReduce(data, axis, ProdOp, keepdims, atleast1d);
 }
 
 }  // namespace topi
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index cb09f1cb419e..9bc62b2c0249 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -196,30 +196,34 @@ inline Tensor reshape(const Tensor& x,
 * \param x The input tensor
 * \param axis Indices of the dimensions to remove. If this is empty,
 * all entries with a constant size of 1 will be removed.
+ * \param atleast1d Whether the output need to be atleast1d.
 * \param name The name of the operation
 * \param tag The tag to mark the operation
 *
 * \return A Tensor whose op member is the squeeze operation
 */
 inline Tensor squeeze(const Tensor& x,
-                      Array<Expr> axis,
+                      Array<Integer> axis,
+                      bool atleast1d = false,
                       std::string name = "tensor",
                       std::string tag = kInjective) {
-  auto axis_val = GetConstIntValues(axis, "axis");
   auto ndim = x->shape.size();
-  if (axis_val.size() == 0) {
+  std::vector<int> axis_val;
+  if (!axis.defined() || axis.size() == 0) {
     for (size_t i = 0; i < ndim; ++i) {
       if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) {
         axis_val.push_back(static_cast<int>(i));
       }
     }
   } else {
-    for (size_t i = 0; i < axis_val.size(); ++i) {
-      if (axis_val[i] < 0) {
-        axis_val[i] += static_cast<int>(x->shape.size());
+    for (size_t i = 0; i < axis.size(); ++i) {
+      int64_t val = axis[i]->value;
+      if (val < 0) {
+        val += static_cast<int>(x->shape.size());
       }
-      CHECK_EQ(GetConstInt(x->shape[axis_val[i]]), 1) <<
-        "Dimension " << axis[i] << " must have size 1";
+      CHECK_EQ(GetConstInt(x->shape[val]), 1) <<
+          "Dimension " << val << " must have size 1";
+      axis_val.push_back(val);
     }
   }
 
@@ -231,7 +235,7 @@ inline Tensor squeeze(const Tensor& x,
       out_shape.push_back(x->shape[i]);
     }
   }
-  if (out_shape.size() == 0) {
+  if (out_shape.size() == 0 && atleast1d) {
     out_shape.push_back(1);
   }
 
diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py
index 79fa02156b19..4c5d1a507660 100644
--- a/topi/python/topi/cuda/reduction.py
+++ b/topi/python/topi/cuda/reduction.py
@@ -63,10 +63,12 @@ def _schedule_reduce(op, sch, is_idx_reduce=False):
             sch[temp_val_input].compute_at(sch[real_output], outer_in)
     else:
         if is_idx_reduce:
+            spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis))
+            sch[real_output].bind(spatial_axis, tvm.thread_axis("blockIdx.x"))
             sch[temp_idx_input].compute_at(sch[real_output],
-                                           sch[real_output].op.axis[0])
+                                           spatial_axis)
             sch[temp_val_input].compute_at(sch[real_output],
-                                           sch[real_output].op.axis[0])
+                                           spatial_axis)
     sch[real_output].set_store_predicate(thread_x.equal(0))
     return sch
 
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index b47ba1165eb9..13a5ccad654c 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -59,9 +59,9 @@ using namespace tvm;
 using namespace tvm::runtime;
 
 /*! \brief Canonicalize an argument that may be Array<Expr> or int to Array<Expr> */
-Array<Expr> ArrayOrInt(TVMArgValue arg) {
+Array<Integer> ArrayOrInt(TVMArgValue arg) {
   if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) {
-    Array<Expr> result;
+    Array<Integer> result;
     result.push_back(arg.operator int());
     return result;
   } else {
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 3b3472f538b7..77a33d86ed3e 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -97,6 +97,10 @@ def check_device(device):
 
 
 def test_reduce_map():
+    verify_reduce_map_ele(in_shape=(32,),
+                          axis=0,
+                          keepdims=False,
+                          type="argmax")
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
                         axis=(1, 2, 3),
                         keepdims=True,
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index dc3c3fb70b24..84d4aa6dc952 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -91,10 +91,7 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index 492f1d94c341..b411375b333e 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -100,10 +100,7 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

From 3f220e232e14bcacf2d970a52cf4c43664ab2c22 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Wed, 21 Nov 2018 19:17:33 -0500
Subject: [PATCH 398/529] Reverse shape dims of weight type (#2155)

---
 python/tvm/relay/testing/lstm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py
index 47e68a988dab..b0915e033ccb 100644
--- a/python/tvm/relay/testing/lstm.py
+++ b/python/tvm/relay/testing/lstm.py
@@ -49,7 +49,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
     builder = relay.ScopeBuilder()
 
     input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
     bias_type = relay.TensorType((4*num_hidden,), dtype)
 
     dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype)
@@ -116,7 +116,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
 def get_net(iterations, num_hidden, batch_size=1, dtype="float32"):
     '''Constructs an unrolled RNN with LSTM cells'''
     input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
     bias_type = relay.TensorType((4*num_hidden,), dtype)
 
     state_type = relay.TupleType([input_type, input_type])

From d76c98275b4a2913999861f0677c2d229d6bcc91 Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Thu, 22 Nov 2018 16:35:53 +0900
Subject: [PATCH 399/529] [DOCS] fix link (#2157)

* fix fname in comment

* fix
---
 apps/howto_deploy/cpp_deploy.cc | 2 +-
 docs/faq.md                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc
index 1fd22e5f2b5f..9a6c5ebca703 100644
--- a/apps/howto_deploy/cpp_deploy.cc
+++ b/apps/howto_deploy/cpp_deploy.cc
@@ -1,7 +1,7 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * \brief Example code on load and run TVM module.s
- * \file cpp_deploy_example.cc
+ * \file cpp_deploy.cc
  */
 #include <cstdio>
 #include <dlpack/dlpack.h>
diff --git a/docs/faq.md b/docs/faq.md
index 54df0ced8fa8..9b735e54d5dd 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -4,7 +4,7 @@ This document contains frequently asked questions.
 
 How to Install
 --------------
-See [Installation](http://tvm.ai/install/)
+See [Installation](http://docs.tvm.ai/install/)
 
 TVM's relation to Other IR/DSL Projects
 ---------------------------------------

From ef02becb932a85532b4e1f97d31f2ee1dc0979fb Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 22 Nov 2018 10:32:00 -0800
Subject: [PATCH 400/529] [APPS] add an external dll call example (#2156)

---
 apps/extension/python/tvm_ext/__init__.py |  4 +++-
 apps/extension/src/tvm_ext.cc             |  5 +++++
 apps/extension/tests/test_ext.py          | 20 ++++++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py
index 5045a9ec02e0..25286f67b4f5 100644
--- a/apps/extension/python/tvm_ext/__init__.py
+++ b/apps/extension/python/tvm_ext/__init__.py
@@ -8,7 +8,9 @@
 def load_lib():
     """Load library, the functions will be registered into TVM"""
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    lib = ctypes.CDLL(os.path.join(curr_path, "../../lib/libtvm_ext.so"))
+    # load in as global so the global extern symbol is visible to other dll.
+    lib = ctypes.CDLL(
+        os.path.join(curr_path, "../../lib/libtvm_ext.so"), ctypes.RTLD_GLOBAL)
     return lib
 
 _LIB = load_lib()
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index bb8b4b694187..362ac62dea3d 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -66,6 +66,11 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
   });
 }  // namespace tvm_ext
 
+// External function exposed to runtime.
+extern "C" float TVMTestAddOne(float y) {
+  return y + 1;
+}
+
 // This callback approach allows extension allows tvm to extract
 // This way can be helpful when we want to use a header only
 // minimum version of TVM Runtime.
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index b7b97897a0fa..def30803135e 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -49,7 +49,27 @@ def test_extract_ext():
     assert fdict["mul"](3, 4) == 12
 
 
+def test_extern_call():
+    n = 10
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.compute((n,), lambda *i: tvm.call_extern("float32", "TVMTestAddOne", A(*i)), name='B')
+    s = tvm.create_schedule(B.op)
+
+    def check_llvm():
+        if not tvm.module.enabled("llvm"):
+            return
+        f = tvm.build(s, [A, B], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
+    check_llvm()
+
+
 if __name__ == "__main__":
+    test_extern_call()
     test_ext_dev()
     test_ext_vec()
     test_bind_add()

From de004c5279953c75a3a2e50e120dac693e7e182e Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Fri, 23 Nov 2018 02:41:40 +0800
Subject: [PATCH 401/529] [RELAY][PASS] CombineParallelConv2D (#2089)

---
 python/tvm/relay/build_module.py              |   5 +
 python/tvm/relay/ir_pass.py                   |  16 +
 src/relay/pass/combine_parallel_conv2d.cc     | 328 ++++++++++++++++++
 src/relay/pass/expr_subst.cc                  |  35 ++
 src/relay/pass/expr_subst.h                   |  18 +
 src/relay/pass/pattern_util.h                 |  18 +
 .../test_pass_combine_parallel_conv2d.py      | 138 ++++++++
 7 files changed, 558 insertions(+)
 create mode 100644 src/relay/pass/combine_parallel_conv2d.cc
 create mode 100644 src/relay/pass/expr_subst.cc
 create mode 100644 src/relay/pass/expr_subst.h
 create mode 100644 tests/python/relay/test_pass_combine_parallel_conv2d.py

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 557e4edac681..5a45ac276de9 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -13,6 +13,7 @@
 # List of optimization pass and level when switch on
 OPT_PASS_LEVEL = {
     "SimplifyInference": 0,
+    "CombineParallelConv2D": 1,
     "OpFusion": 1,
     "FoldConstant": 2,
     "FoldScaleAxis": 3,
@@ -144,6 +145,10 @@ def optimize(func, params=None):
         func = ir_pass.infer_type(func)
         func = ir_pass.simplify_inference(func)
 
+    if cfg.pass_enabled("CombineParallelConv2D"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.combine_parallel_conv2d(func)
+
     if cfg.pass_enabled("FoldScaleAxis"):
         func = ir_pass.infer_type(func)
         func = ir_pass.backward_fold_scale_axis(func)
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 9d59980f6127..ef0a59cd3f6d 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -292,3 +292,19 @@ def fuse_ops(expr, opt_level=1):
         Transformed expression, containing fused result.
     """
     return _ir_pass.FuseOps(expr, opt_level)
+
+
+def combine_parallel_conv2d(expr):
+    """Fold multiple conv2d into one.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression
+    """
+    return _ir_pass.CombineParallelConv2D(expr)
diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc
new file mode 100644
index 000000000000..48d5d77990d6
--- /dev/null
+++ b/src/relay/pass/combine_parallel_conv2d.cc
@@ -0,0 +1,328 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file combine_parallel_conv2d.cc
+ * \brief Combine parallel 2d convolutions into a single convolution.
+ *
+ * This pass replaces convolutions that share the same input node and the same
+ * arguments (except that the number of output channels can be different) with a
+ * single convolution. The weight of the new 2d convolution is the concatenation
+ * of the original weights. Elemwise and broadcast ops following conv2d are also
+ * combined if possible.
+ *
+ * This prevents launching multiple kernels in networks with multiple
+ * convolution branches, such as Inception block.
+ */
+
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/relay/op_attr_types.h>
+#include <unordered_map>
+#include <unordered_set>
+#include "./expr_subst.h"
+#include "./pattern_util.h"
+
+
+namespace tvm {
+namespace relay {
+
+using Branch = std::vector<const CallNode*>;
+using Group = std::vector<Branch>;
+
+/*
+  Find parallel branches starting with conv2d as shown below and then group branches by kernel
+  shape and attributes of conv2d. Conv2d can be followed by zero or more elemwise or broadcast ops.
+  Intermediate nodes have exactly one successor. It is possible that branches meet at a point,
+  which should be handled in ParallelConv2DCombiner.
+
+          data
+        /    \
+    conv2d   conv2d
+      |        |
+      op       op
+      |        |
+*/
+class BranchGroupFinder : private ExprVisitor {
+ public:
+  std::vector<Group> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+
+    std::vector<Group> groups;
+    for (const auto& root : conv_roots_) {
+      const auto& convs = children_map_.at(root);
+      for (const CallNode* conv : convs) {
+        auto&& branch = CreateBranch(conv);
+        // add the branch to a group, or create a new group
+        auto it = std::find_if(groups.begin(), groups.end(), [&](const Group& group) {
+          CHECK(!group.empty() && !group[0].empty());
+          return IsCompatibleConv2D(conv, group[0][0]);
+        });
+        if (it != groups.end()) {
+          it->push_back(branch);
+        } else {
+          groups.emplace_back();
+          // each group has at least one branch
+          groups.back().push_back(branch);
+        }
+      }
+    }
+    return groups;
+  }
+
+ private:
+  std::unordered_set<Expr, NodeHash, NodeEqual> conv_roots_;
+  std::unordered_map<Expr, std::vector<const CallNode*>, NodeHash, NodeEqual> children_map_;
+
+  // Two 2d convolutions can be combined if they have the same attributes or
+  // only have different output channels.
+  bool IsCompatibleConv2D(const CallNode* a, const CallNode* b) {
+    AttrsEqual eq;
+    static const Layout kOIHW("OIHW");
+    const auto* attrs_a = a->attrs.as<Conv2DAttrs>();
+    const auto* attrs_b = b->attrs.as<Conv2DAttrs>();
+    CHECK(attrs_a);
+    CHECK(attrs_b);
+    const auto* tweight_a = a->args[1]->type_as<TensorTypeNode>();
+    const auto* tweight_b = b->args[1]->type_as<TensorTypeNode>();
+    const auto shape_a = ConvertLayout(tweight_a->shape, attrs_a->weight_layout, kOIHW);
+    const auto shape_b = ConvertLayout(tweight_b->shape, attrs_b->weight_layout, kOIHW);
+
+    return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) &&
+           eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) &&
+           eq(attrs_a->data_layout, attrs_b->data_layout) &&
+           eq(attrs_a->weight_layout, attrs_b->weight_layout) &&
+           eq(attrs_a->out_dtype, attrs_b->out_dtype) &&
+           eq(attrs_a->out_layout, attrs_b->out_layout) && eq(shape_a[2], shape_b[2]) &&
+           eq(shape_a[3], shape_b[3]);
+  }
+
+  // Create a branch starting from conv2d.
+  Branch CreateBranch(const CallNode* conv) {
+    static auto fpattern = Op::GetAttr<TOpPattern>("TOpPattern");
+    // each branch has at least one element, the first element is always conv2d
+    Branch branch{conv};
+    auto it = children_map_.find(GetRef<Expr>(branch.back()));
+    while (it != children_map_.end() && it->second.size() == 1) {
+      const CallNode* call = it->second[0];
+      auto pattern = fpattern[Downcast<Op>(call->op)];
+      if (pattern <= kBroadcast) {
+        branch.push_back(it->second[0]);
+        it = children_map_.find(GetRef<Expr>(branch.back()));
+      } else {
+        break;
+      }
+    }
+    return branch;
+  }
+
+  void VisitExpr_(const CallNode* n) final {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    ExprVisitor::VisitExpr_(n);
+    if (n->op.same_as(conv2d) && n->attrs.as<Conv2DAttrs>()->groups == 1) {
+      conv_roots_.insert(n->args[0]);
+      children_map_[n->args[0]].push_back(n);
+    } else {
+      for (size_t i = 0; i < n->args.size(); i++) {
+        children_map_[n->args[i]].push_back(n);
+      }
+    }
+  }
+};
+
+class ParallelConv2DCombiner {
+ public:
+  Expr Combine(const Expr& expr) {
+    auto groups = BranchGroupFinder().Find(expr);
+    for (const Group& group : groups) {
+      if (group.size() < 2) continue;
+      CombineBranches(group);
+    }
+    return ExprSubst(expr, std::move(subst_map_));
+  }
+
+ private:
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map_;
+
+  std::tuple<Expr, IndexExpr> TransformWeight(const Group& branches) {
+    int64_t num_filters = 0;  // number of filters of the transformed weight
+    Array<Expr> weights;
+    for (const auto& branch : branches) {
+      auto conv2d = branch[0];
+      weights.push_back(conv2d->args[1]);
+      auto channels = GetConv2DSuperChannelsDim(conv2d);
+      num_filters += channels;
+    }
+    auto index = branches[0][0]->attrs.as<Conv2DAttrs>()->weight_layout.find('O');
+    CHECK_NE(index, std::string::npos);
+    return std::make_tuple(MakeConcatenate(TupleNode::make(weights), index),
+                           MakeConstScalar(Int(32), num_filters));
+  }
+
+  Call MakeCombinedConv2D(const Group& branches) {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    Expr data = branches[0][0]->args[0];
+    Expr new_weight;
+    IndexExpr new_channels;
+    std::tie(new_weight, new_channels) = TransformWeight(branches);
+
+    const CallNode* group_root = branches[0][0];
+    const auto* attrs = group_root->attrs.as<Conv2DAttrs>();
+    CHECK(attrs);
+    const auto new_attrs = make_node<Conv2DAttrs>();
+    new_attrs->strides = attrs->strides;
+    new_attrs->padding = attrs->padding;
+    new_attrs->dilation = attrs->dilation;
+    new_attrs->groups = attrs->groups;
+    new_attrs->kernel_size = attrs->kernel_size;
+    new_attrs->data_layout = attrs->data_layout;
+    new_attrs->weight_layout = attrs->weight_layout;
+    new_attrs->out_layout = attrs->out_layout;
+    new_attrs->out_dtype = attrs->out_dtype;
+    new_attrs->channels = new_channels;
+
+    return CallNode::make(conv2d, {data, new_weight}, Attrs{new_attrs}, {});
+  }
+
+  bool IsArgCompatible(const CallNode* a, const CallNode* b, size_t index, size_t channel_pos) {
+    AttrsEqual eq;
+    auto ta = a->args[index]->type_as<TensorTypeNode>();
+    auto tb = b->args[index]->type_as<TensorTypeNode>();
+    auto toutput_a = a->type_as<TensorTypeNode>();
+    auto toutput_b = b->type_as<TensorTypeNode>();
+
+    if (!eq(ta->dtype, tb->dtype) || ta->shape.size() != tb->shape.size())
+      return false;
+
+    // Position of the 'C' dimension in the argument
+    size_t arg_channel_pos = channel_pos - toutput_a->shape.size() + ta->shape.size();
+
+    // Channel super-dimension shoule be present and not broadcasted
+    if ((arg_channel_pos > channel_pos) ||  // size_t overflow
+        !eq(ta->shape[arg_channel_pos], toutput_a->shape[channel_pos]) ||
+        !eq(tb->shape[arg_channel_pos], toutput_b->shape[channel_pos]))
+      return false;
+
+    for (size_t i = 0; i < ta->shape.size(); i++) {
+      if (i == arg_channel_pos) continue;
+      if (!eq(ta->shape[i], tb->shape[i]))
+        return false;
+    }
+    return true;
+  }
+
+  // Check if ops in depth-th level can be combined
+  bool CheckLevel(const Group& branches, size_t depth, size_t channel_pos, size_t parent_index) {
+    const CallNode* call = branches[0][depth];
+    AttrsEqual attrs_equal;
+    // check if all branches in current depth can be combined
+    for (auto it = branches.begin() + 1; it != branches.end(); it++) {
+      const Branch& branch = *it;
+      if (!branch[depth]->op.same_as(call->op) ||
+          !attrs_equal(branch[depth]->attrs, call->attrs) ||
+          branch[depth]->args.size() != call->args.size()) {
+        return false;
+      }
+
+      if (branch[depth]->args[parent_index].get() != branch[depth - 1])
+        return false;
+
+      // Check args
+      for (size_t i = 0; i < call->args.size(); i++) {
+        if (i == parent_index) continue;
+
+        if (!IsArgCompatible(call, branch[depth], i, channel_pos) ||
+            !attrs_equal(call->attrs, branch[depth]->attrs)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Combine args and make the combined CallNode
+  Call MakeCombinedCall(const Expr& data, const Group& branches, size_t depth, size_t channel_pos,
+                        size_t parent_index) {
+    Array<Expr> new_args;
+    const CallNode* call = branches[0][depth];
+    size_t ndim = call->type_as<TensorTypeNode>()->shape.size();
+
+    for (size_t i = 0; i < call->args.size(); i++) {
+      if (i == parent_index) {
+        new_args.push_back(data);
+        continue;
+      }
+      size_t arg_ndim = call->args[i]->type_as<TensorTypeNode>()->shape.size();
+      size_t arg_channel_pos = channel_pos - ndim + arg_ndim;
+      Array<Expr> tuple;
+      for (const auto& branch : branches) {
+        tuple.push_back(branch[depth]->args[i]);
+      }
+      auto concat = MakeConcatenate(TupleNode::make(tuple), arg_channel_pos);
+      new_args.push_back(std::move(concat));
+    }
+    return CallNode::make(call->op, new_args, call->attrs, {});
+  }
+
+  // Replace output of each branch with slices of the combined output
+  void UpdateGroupOutput(const Expr& data, const Group& branches, size_t depth,
+                         size_t channel_pos) {
+    int64_t index = 0;
+    for (const auto& branch : branches) {
+      const CallNode* conv2d = branch[0];
+      int64_t channels = GetConv2DSuperChannelsDim(conv2d);
+      Array<Integer> begin;
+      Array<Integer> end;
+      for (size_t i = 0; i < channel_pos; i++) {
+        begin.push_back(0);
+        end.push_back(NullValue<Integer>());
+      }
+      begin.push_back(index);
+      index += channels;
+      end.push_back(index);
+      auto slice = MakeStridedSlice(data, std::move(begin), std::move(end), Array<Integer>{});
+      subst_map_[GetRef<Expr>(branch[depth])] = slice;
+    }
+  }
+
+  // Combine branches in a group. Conv2d in different branches in the same group are safe to
+  // combine. Subsequent ops may or may not be combined. We start from conv2d and try to
+  // combine ops from all branches in the same depth.
+  void CombineBranches(const Group& branches) {
+    Call combined = MakeCombinedConv2D(branches);
+    auto conv_param = combined->attrs.as<Conv2DAttrs>();
+    const std::string& layout =
+        conv_param->out_layout == "" ? conv_param->data_layout : conv_param->out_layout;
+    size_t channel_pos = layout.find('C');
+    CHECK_NE(channel_pos, std::string::npos);
+    auto it = std::min_element(branches.begin(), branches.end(),
+                               [](const Branch& branch_a,
+                                  const Branch& branch_b) {
+                                    return branch_a.size() < branch_b.size();
+                                  });
+    size_t depth = it->size();
+    size_t i;
+    // starting from 1 to skip the conv2d
+    for (i = 1; i < depth; i++) {
+      size_t parent_index;
+      for (parent_index = 0; parent_index < branches[0][i]->args.size(); parent_index++) {
+        if (branches[0][i]->args[parent_index].get() == branches[0][i - 1]) break;
+      }
+      CHECK_NE(parent_index, branches[0][i]->args.size());
+      if (!CheckLevel(branches, i, channel_pos, parent_index)) break;
+      combined = MakeCombinedCall(combined, branches, i, channel_pos, parent_index);
+    }
+    UpdateGroupOutput(combined, branches, i - 1, channel_pos);
+  }
+};
+
+Expr CombineParallelConv2D(const Expr& expr) { return ParallelConv2DCombiner().Combine(expr); }
+
+TVM_REGISTER_API("relay._ir_pass.CombineParallelConv2D")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = CombineParallelConv2D(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.cc b/src/relay/pass/expr_subst.cc
new file mode 100644
index 000000000000..586f748abef5
--- /dev/null
+++ b/src/relay/pass/expr_subst.cc
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file expr_subst.h
+ * \brief Utility functions for substituting expressions.
+ */
+
+#include <tvm/relay/expr_functor.h>
+#include "./expr_subst.h"
+
+namespace tvm {
+namespace relay {
+
+class ExprSubstituter : public ExprMutator {
+ public:
+  explicit ExprSubstituter(std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map)
+      : subst_map_(subst_map) {}
+
+  Expr VisitExpr(const Expr& expr) final {
+    auto it = subst_map_.find(expr);
+    if (it != subst_map_.end()) {
+      return (*it).second;
+    }
+    return ExprMutator::VisitExpr(expr);
+  }
+
+ private:
+  tvm::Map<Expr, Expr> subst_map_;
+};
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map) {
+  return ExprSubstituter(std::move(subst_map)).Mutate(expr);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.h b/src/relay/pass/expr_subst.h
new file mode 100644
index 000000000000..67892b3a0af7
--- /dev/null
+++ b/src/relay/pass/expr_subst.h
@@ -0,0 +1,18 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file expr_subst.h
+ * \brief Utility functions for substituting expressions.
+ */
+#ifndef TVM_RELAY_PASS_EXPR_SUBST_H_
+#define TVM_RELAY_PASS_EXPR_SUBST_H_
+#include <tvm/relay/expr.h>
+#include <unordered_map>
+
+namespace tvm {
+namespace relay {
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_EXPR_SUBST_H_
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 1c855d9a53cb..38ae923c5274 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -11,6 +11,7 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/attrs/transform.h>
+#include <string>
 #include "../op/layout.h"
 
 
@@ -120,6 +121,19 @@ inline bool IsDepthwiseConv2D(const Call& call,
       is_const_int(wshape[1], 1);
 }
 
+/*!
+ * \brief Get super-dimension of output channels of conv2d
+ * \param call The conv2d call.
+ * \return Super-dimension size of output channels of conv2d.
+ */
+inline int64_t GetConv2DSuperChannelsDim(const CallNode* call) {
+    auto param = call->attrs.as<Conv2DAttrs>();
+    auto tweight = call->args[1]->type_as<TensorTypeNode>();
+    auto index = param->weight_layout.find('O');
+    CHECK_NE(index, std::string::npos);
+    auto channels = as_const_int(tweight->shape[index]);
+    return *channels;
+}
 
 /*!
  * \brief Create a Constant with a scalar
@@ -172,6 +186,10 @@ inline Expr ReshapeLike(Expr lhs, Expr rhs) {
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
 }
 
+Expr MakeConcatenate(Expr data, int axis);
+
+Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
new file mode 100644
index 000000000000..31dfe095f682
--- /dev/null
+++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py
@@ -0,0 +1,138 @@
+from tvm import relay
+import numpy as np
+
+
+def test_combine_parallel_conv2d():
+    """Simple testcase."""
+    def before(x, w1, w2, w3, w4):
+        args = [x, w1, w2, w3, w4]
+        y1 = relay.nn.conv2d(x, w1)
+        y2 = relay.nn.conv2d(x, w2)
+        # y3 cannot be combined
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.nn.conv2d(x, w4)
+        y = relay.Tuple((y1, y2, y3, y4))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, w1, w2, w3, w4]
+        w = relay.concatenate((w1, w2, w4), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.strided_slice(y, [0, channels1 + channels2],
+                                 [None, channels1 + channels2 + channels4])
+        y = relay.Tuple((y1, y2, y3, y4))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2, channels3, channels4):
+        x =  relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        w3 = relay.var("w3", shape=(channels3, in_c, 3, 3))
+        w4 = relay.var("w4", shape=(channels4, in_c, 1, 1))
+
+        y_before = before(x, w1, w2, w3, w4)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 4, 4, 4)
+    check((1, 4, 16, 16), 4, 8, 4, 7)
+
+
+def test_combine_parallel_conv2d_scale_relu():
+    """Testcase of combining conv2d + scale + relu"""
+    def before(x, w1, w2, scale1, scale2, bias):
+        args = [x, w1, w2, scale1, scale2, bias]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y2 = relay.nn.relu(y2)
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2, bias]
+        w = relay.concatenate((w1, w2), axis=0)
+        scale = relay.concatenate((scale1, scale2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y = relay.multiply(y, scale)
+        y = relay.nn.relu(y)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(channels1, 1, 1))
+        scale2 = relay.var("scale2", shape=(channels2, 1, 1))
+        bias = relay.var("bias", shape=(channels2, 1, 1))
+        y_before = before(x, w1, w2, scale1, scale2, bias)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+
+def test_combine_parallel_conv2d_scale():
+    """Testcase of un-combinable scale"""
+    def before(x, w1, w2, scale1, scale2):
+        args = [x, w1, w2, scale1, scale2]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2]
+        w = relay.concatenate((w1, w2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(1,))
+        scale2 = relay.var("scale2", shape=(1,))
+        y_before = before(x, w1, w2, scale1, scale2)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+if __name__ == "__main__":
+    test_combine_parallel_conv2d()
+    test_combine_parallel_conv2d_scale_relu()
+    test_combine_parallel_conv2d_scale()

From fc1a82375c515b4c623aee10f38db48efc6cebdb Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sun, 25 Nov 2018 04:14:41 +0530
Subject: [PATCH 402/529] [RELAY]Testing Inception, Squeezenet, VGG port
 (#2013)

---
 python/tvm/relay/testing/__init__.py       |   3 +
 python/tvm/relay/testing/inception_v3.py   | 284 +++++++++++++++++++++
 python/tvm/relay/testing/squeezenet.py     | 151 +++++++++++
 python/tvm/relay/testing/vgg.py            | 117 +++++++++
 tests/python/relay/test_ir_text_printer.py |  15 ++
 5 files changed, 570 insertions(+)
 create mode 100644 python/tvm/relay/testing/inception_v3.py
 create mode 100644 python/tvm/relay/testing/squeezenet.py
 create mode 100644 python/tvm/relay/testing/vgg.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 43160d64549c..47a04b531922 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -7,4 +7,7 @@
 from . import dcgan
 from . import mobilenet
 from . import lstm
+from . import inception_v3
+from . import squeezenet
+from . import vgg
 from .config import ctx_list
diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py
new file mode 100644
index 000000000000..96684c5d6e1d
--- /dev/null
+++ b/python/tvm/relay/testing/inception_v3.py
@@ -0,0 +1,284 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
+arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+# pylint: disable=invalid-name,missing-docstring,unused-argument
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = layers.conv2d(
+        data=data,
+        channels=int(num_filter),
+        kernel_size=kernel,
+        strides=stride,
+        padding=pad,
+        name='%s%s_conv1' % (name, suffix))
+
+    bn = layers.batch_norm_infer(data=conv, epsilon=2e-5, name='%s%s_bn' % (name, suffix))
+    act = relay.nn.relu(data=bn)
+    return act
+
+def Pooling(data, kernel, stride, pad, pool_type, name):
+    if pool_type == 'max':
+        return relay.nn.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad)
+    elif pool_type == 'avg':
+        return relay.nn.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad,
+                                   count_include_pad=True)
+    else:
+        raise ValueError("Invalid pooling type: " + pool_type)
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+
+    cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
+    concat = relay.concatenate((tower_1x1, tower_5x5, tower_3x3, cproj), axis=0)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
+                      name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                      name=('%s_tower' % name), suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
+                      name=('max_pool_%s_pool' % name))
+    concat = relay.concatenate((tower_3x3, tower_d3x3, pooling), axis=0)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
+                 name=('%s_tower_2' % name), suffix='_conv')
+    # concat
+    concat = relay.concatenate((tower_1x1, tower_d7, tower_q7, cproj), axis=0)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name),
+                     suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                        name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2),
+                        name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = relay.concatenate((tower_3x3, tower_d7_3x3, pooling), axis=0)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
+                      name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
+                      name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name),
+                 suffix='_conv')
+    # concat
+    concat = relay.concatenate(
+        (tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj), axis=0)
+    return concat
+
+def get_net(batch_size,
+            num_classes,
+            image_shape,
+            dtype):
+    """Get network a Inception v3 network.
+
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+    """
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data",
+                     shape=data_shape,
+                     dtype=dtype)
+
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                   name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                    name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+
+    # pool
+    pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0),
+                   name="global_pool")
+
+    flatten = relay.nn.batch_flatten(pool)
+    fc1 = relay.nn.dense(flatten, relay.var("fc1_weight"), units=num_classes)
+    fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias"))
+    inception_v3 = relay.nn.softmax(data=fc1)
+    args = relay.ir_pass.free_vars(inception_v3)
+    return relay.Function(args, inception_v3)
+
+def get_workload(batch_size=1, num_classes=1000,
+                 image_shape=(3, 299, 299), dtype="float32"):
+    """Get benchmark workload for InceptionV3
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_classes, image_shape, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/squeezenet.py b/python/tvm/relay/testing/squeezenet.py
new file mode 100644
index 000000000000..fa55cafbf2b4
--- /dev/null
+++ b/python/tvm/relay/testing/squeezenet.py
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = relay.concatenate((left, right), axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = layers.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                        padding=(padding, padding), name="conv2d")
+    net = relay.nn.relu(net)
+    return net
+
+# Net
+def get_net(batch_size, image_shape, num_classes, version, dtype):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    image_shape : tuple, optional
+        The input image shape
+
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    data_shape = (batch_size,) + image_shape
+    net = relay.var("data", shape=data_shape, dtype=dtype)
+    if version == '1.0':
+        net = layers.conv2d(net,
+                            channels=96,
+                            kernel_size=(7, 7),
+                            strides=(2, 2),
+                            padding=(3, 3),
+                            name="conv2d")
+        net = relay.nn.bias_add(net, relay.var("dense1_bias"))
+        net = relay.nn.relu(net)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 32, 128, 128)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 64, 256, 256)
+    else:
+        net = layers.conv2d(net,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            strides=(2, 2),
+                            padding=(1, 1),
+                            name="conv2d")
+        net = relay.nn.relu(net)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256)
+    net = relay.nn.dropout(net, rate=0.5)
+    net = layers.conv2d(net, channels=num_classes, kernel_size=(1, 1), name="conv2d")
+    net = relay.nn.relu(net)
+    net = relay.nn.global_avg_pool2d(net)
+    net = relay.nn.batch_flatten(net)
+    net = relay.nn.softmax(net)
+    args = relay.ir_pass.free_vars(net)
+    return relay.Function(args, net)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.0',
+                 image_shape=(3, 224, 224), dtype="float32"):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+
+    net = get_net(batch_size, image_shape, num_classes, version, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/vgg.py b/python/tvm/relay/testing/vgg.py
new file mode 100644
index 000000000000..7ec6669f6346
--- /dev/null
+++ b/python/tvm/relay/testing/vgg.py
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+from tvm import relay
+from .init import create_workload
+from . import layers as wrapper
+
+def get_feature(internel_layer, layers, filters, batch_norm=False):
+    """Get VGG feature body as stacks of convoltions."""
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internel_layer = wrapper.conv2d(
+                data=internel_layer, kernel_size=(3, 3), padding=(1, 1),
+                channels=filters[i], name="conv%s_%s"%(i + 1, j + 1))
+            if batch_norm:
+                internel_layer = wrapper.batch_norm_infer(
+                    data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internel_layer = relay.nn.relu(data=internel_layer)
+        internel_layer = relay.nn.max_pool2d(
+            data=internel_layer, pool_size=(2, 2), strides=(2, 2))
+    return internel_layer
+
+def get_classifier(input_data, num_classes):
+    """Get VGG classifier layers as fc layers."""
+    flatten = relay.nn.batch_flatten(data=input_data)
+    fc6 = wrapper.dense_add_bias(data=flatten, units=4096, name="fc6")
+    relu6 = relay.nn.relu(data=fc6)
+    drop6 = relay.nn.dropout(data=relu6, rate=0.5)
+    fc7 = wrapper.dense_add_bias(data=drop6, units=4096, name="fc7")
+    relu7 = relay.nn.relu(data=fc7)
+    drop7 = relay.nn.dropout(data=relu7, rate=0.5)
+    fc8 = wrapper.dense_add_bias(data=drop7, units=num_classes, name="fc8")
+    return fc8
+
+def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_norm=False):
+    """
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    image_shape : tuple, optional
+        The input image shape
+
+    num_classes : int, optional
+        Number of claseses
+
+    dtype : str, optional
+        The data type
+
+    num_layers : int
+        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+
+    batch_norm : bool, default False
+        Use batch normalization.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    symbol = relay.nn.softmax(data=classifier)
+    args = relay.ir_pass.free_vars(symbol)
+    return relay.Function(args, symbol)
+
+def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype="float32"):
+    """Get benchmark workload for VGG nets.
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, image_shape, num_classes, dtype)
+    return create_workload(net)
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 30130fd7bcac..f6a1236c89e6 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -124,6 +124,18 @@ def test_lstm():
     net, params = tvm.relay.testing.lstm.get_workload(4, 4)
     net.astext()
 
+def test_inception_v3():
+    net, params = tvm.relay.testing.inception_v3.get_workload(batch_size=1)
+    net.astext()
+
+def test_squeezenet():
+    for version in ['1.0', '1.1']:
+        net, params = tvm.relay.testing.squeezenet.get_workload(batch_size=1, version=version)
+        net.astext()
+
+def test_vgg():
+    net, params = tvm.relay.testing.vgg.get_workload(batch_size=1)
+    net.astext()
 
 if __name__ == "__main__":
     do_print[0] = True
@@ -132,6 +144,9 @@ def test_lstm():
     test_mlp()
     test_dqn()
     test_dcgan()
+    test_squeezenet()
+    test_inception_v3()
+    test_vgg()
     test_func()
     test_env()
     test_meta_data()

From fd330f17e2244af545e53ff29c663b4e5f15eebb Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Sat, 24 Nov 2018 22:33:14 -0500
Subject: [PATCH 403/529] [Relay][Op] Add compute, schedule, and tests for
 expand_dims and squeeze (#2133)

---
 python/tvm/relay/op/_transform.py    | 45 +++++++++++++++++++++++++++-
 tests/python/relay/test_op_level1.py | 17 +++++++++++
 tests/python/relay/test_op_level3.py | 17 +++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 01814e0f73e0..cd32aea38d90 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -1,8 +1,51 @@
 #pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
 from __future__ import absolute_import
+import topi
+import topi.cuda
+from tvm import container
 from . import op as _reg
-from .op import schedule_injective, OpPattern
+from .op import (schedule_injective, register_compute, register_schedule,
+                 register_pattern, OpPattern)
+
+schedule_broadcast = schedule_injective
+
+# squeeze
+@register_compute("squeeze")
+def squeeze_compiler(attrs, inputs, output_type, target):
+    """Compiler for squeeze dims."""
+    assert len(inputs) == 1
+
+    if attrs.axis is None:
+        axis = None
+    elif isinstance(attrs.axis, container.Array):
+        axis = tuple(attrs.axis)
+    else:
+        axis = int(attrs.axis)
+
+    return [topi.squeeze(inputs[0], axis)]
+
+register_pattern("squeeze", OpPattern.INJECTIVE)
+register_schedule("squeeze", schedule_injective)
+
+# expand_dims
+@register_compute("expand_dims")
+def expand_dims_compiler(attrs, inputs, output_type, target):
+    """Compiler for expand_dims."""
+    assert len(inputs) == 1
+
+    new_axis = int(attrs.num_newaxis)
+    assert new_axis >= 0
+
+    # axis should be in range [-data.ndim - 1, data.ndim]
+    axis = int(attrs.axis)
+    assert axis >= -len(inputs[0].shape) - 1
+    assert axis <= len(inputs[0].shape)
+
+    return [topi.expand_dims(inputs[0], axis, new_axis)]
+
+_reg.register_schedule("expand_dims", schedule_broadcast)
+_reg.register_pattern("expand_dims", OpPattern.BROADCAST)
 
 # strided_slice
 _reg.register_schedule("strided_slice", schedule_injective)
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 35844ddd4a3f..d28aa0a56941 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -90,6 +90,22 @@ def check_binary_op(opfunc, ref):
         check_binary_op(opfunc, ref)
 
 
+def test_expand_dims():
+    # based on topi test
+    def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
+        x = relay.Var("x", relay.TensorType(dshape, dtype))
+        func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
+        for target, ctx in ctx_list():
+            data = np.random.uniform(size=dshape).astype(dtype)
+            ref_res = data.reshape(oshape)
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    verify_expand_dims((3, 10), 'float32', (3, 10, 1, 1), 2, 2)
+    verify_expand_dims((3, 10), 'float32', (1, 3, 10), -3, 1)
+
+
 def test_bias_add():
     xshape=(10, 2, 3, 4)
     bshape=(2,)
@@ -295,6 +311,7 @@ def test_dense():
     test_binary_op()
     test_expand_dims_infer_type()
     test_concatenate()
+    test_expand_dims()
     test_softmax()
     test_log_softmax()
     test_dropout()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 6f8fbd551293..f6951b5ab734 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -60,6 +60,22 @@ def test_clip():
     np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 
+def test_squeeze():
+    def verify_squeeze(shape, dtype, axis):
+        x = relay.var("x", relay.TensorType(shape, dtype))
+        squeeze = relay.squeeze(x, axis=axis)
+
+        np_axis = tuple(axis) if axis is not None else None
+
+        data = np.random.random_sample(shape).astype(dtype)
+        intrp = create_executor()
+        op_res = intrp.evaluate(squeeze, { x : relay.const(data) })
+        ref_res = np.squeeze(data, axis=np_axis)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    verify_squeeze((1, 3, 2, 5), "float32", None)
+    verify_squeeze((1, 3, 1), "float32", [0])
+    verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
 
 
 def test_transpose_infer_type():
@@ -308,6 +324,7 @@ def test_infer_type_prelu():
     test_full_like()
     test_infer_type_leaky_relu()
     test_infer_type_prelu()
+    test_squeeze()
     test_squeeze_infer_type()
     test_squeeze_bad_axes_infer_type()
     test_split_infer_type()

From 0fff6b8ea441da7b82219b07fd317b3b86241106 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 26 Nov 2018 01:48:22 +0900
Subject: [PATCH 404/529] Compare relay and numpy outputs in graph runtime test
 (#2164)

---
 tests/python/relay/test_backend_graph_runtime.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
index 7baa906abacc..59970dee38f9 100644
--- a/tests/python/relay/test_backend_graph_runtime.py
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -30,6 +30,7 @@ def check_rts(expr, args, expected_result, mod=None):
     eval_result = intrp.evaluate(expr)(*args)
     rts_result = graph.evaluate(expr)(*args)
     tvm.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
+    tvm.testing.assert_allclose(eval_result.asnumpy(), expected_result)
 
 def test_add_op_scalar():
     """

From 29928a260d90dd38691943df1c33658b0dfe225c Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sun, 25 Nov 2018 22:31:44 +0530
Subject: [PATCH 405/529] Relay reshape reshape_like compute and schedule
 (#2159)

---
 python/tvm/relay/op/_transform.py    |  8 +++++
 src/relay/op/tensor/transform.cc     | 18 +++++++++--
 tests/python/relay/test_op_level3.py | 47 +++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index cd32aea38d90..c05fbe8ec61e 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -53,3 +53,11 @@ def expand_dims_compiler(attrs, inputs, output_type, target):
 # slice_like
 _reg.register_schedule("slice_like", schedule_injective)
 _reg.register_pattern("slice_like", OpPattern.INJECTIVE)
+
+# reshape
+_reg.register_schedule("reshape", schedule_injective)
+_reg.register_pattern("reshape", OpPattern.INJECTIVE)
+
+# reshape_like
+_reg.register_schedule("reshape_like", schedule_injective)
+_reg.register_pattern("reshape_like", OpPattern.INJECTIVE)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index a9e0a969fc5b..52363e8af92a 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -376,7 +376,15 @@ Example::
 .set_attrs_type_key("relay.attrs.ReshapeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
-.add_type_rel("Reshape", ReshapeRel);
+.add_type_rel("Reshape", ReshapeRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* param = attrs.as<ReshapeAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{ topi::reshape(inputs[0], param->newshape) };
+});
 
 
 /*!
@@ -431,7 +439,13 @@ the input array into an output array with the same shape as the second input arr
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("shape_like", "Tensor", "Shape tensor.")
 .set_support_level(3)
-.add_type_rel("ReshapeLike", ReshapeLikeRel);
+.add_type_rel("ReshapeLike", ReshapeLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  return Array<Tensor>{ topi::reshape(inputs[0], inputs[1]->shape) };
+});
 
 
 // Take
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index f6951b5ab734..43c11c4509d1 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -123,8 +123,28 @@ def test_reshape_infer_type():
     assert yy.checked_type == relay.TensorType(
         (n, t, 2000), "float32")
 
+def test_reshape():
+    def verify_reshape(shape, oshape):
+        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
+        ref_res = np.reshape(x_data, oshape)
 
-def test_reshape_like():
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        z = relay.reshape(x, newshape=ref_res.shape)
+        zz = relay.ir_pass.infer_type(z)
+        assert "newshape=" in z.astext()
+        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
+
+        func = relay.Function([x], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_reshape((2, 3, 4), (8, 3))
+    verify_reshape((4, 7), (2, 7, 2))
+
+def test_reshape_like_infer_type():
     # concrete shape
     x = relay.var("x", relay.TensorType((1, 2, 3), "float32"))
     y = relay.var("y", relay.TensorType((1,6), "float32"))
@@ -141,6 +161,29 @@ def test_reshape_like():
     assert zz.checked_type == relay.TensorType((1, 8, 8), "float32")
 
 
+def test_reshape_like():
+    def verify_reshape_like(shape, oshape):
+        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
+        y_data = np.random.uniform(low=-1, high=1, size=oshape).astype("float32")
+        ref_res = np.reshape(x_data, y_data.shape)
+
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        y = relay.var("x", relay.TensorType(oshape, "float32"))
+        z = relay.reshape_like(x, y)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32")
+
+        func = relay.Function([x, y], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+    verify_reshape_like((2, 3, 4), (1, 8, 3))
+    verify_reshape_like((4, 7), (2, 7, 2))
+
 def test_take_infer_type():
     def verify_take(dshape, indices_shape, oshape, axis=None):
         x = relay.var("x", relay.TensorType(dshape, "float32"))
@@ -318,6 +361,8 @@ def test_infer_type_prelu():
     test_clip()
     test_transpose_infer_type()
     test_reshape_infer_type()
+    test_reshape()
+    test_reshape_like_infer_type()
     test_reshape_like()
     test_take_infer_type()
     test_full()

From 30ea64f9d516c3d334c58a4230ce69d33b0b3aab Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 25 Nov 2018 12:54:03 -0800
Subject: [PATCH 406/529] [RELAY][FRONTEND] Initial MXNet frontend support.
 (#2163)

---
 docs/langref/relay_op.rst                     |   6 +-
 include/tvm/relay/attrs/nn.h                  |   2 +-
 include/tvm/relay/attrs/transform.h           |   2 +-
 nnvm/src/top/tensor/transform.cc              |   4 +-
 python/tvm/relay/__init__.py                  |   4 +-
 python/tvm/relay/backend/compile_engine.py    |  14 +-
 .../relay/backend/graph_runtime_codegen.py    |   2 +-
 python/tvm/relay/build_module.py              |   3 +-
 python/tvm/relay/frontend/__init__.py         |   4 +
 python/tvm/relay/frontend/common.py           | 129 ++++
 python/tvm/relay/frontend/mxnet.py            | 606 ++++++++++++++++++
 python/tvm/relay/op/__init__.py               |   1 +
 python/tvm/relay/op/_reduce.py                |  19 +
 python/tvm/relay/op/_tensor.py                |   3 +-
 python/tvm/relay/op/_transform.py             |  62 +-
 python/tvm/relay/op/nn/_nn.py                 |   2 +
 python/tvm/relay/op/nn/nn.py                  |   4 +-
 python/tvm/relay/testing/inception_v3.py      |  10 +-
 python/tvm/relay/testing/squeezenet.py        |  69 +-
 python/tvm/relay/testing/vgg.py               |  39 +-
 src/lang/attr_functor.h                       |   1 +
 src/relay/backend/graph_plan_memory.cc        |   5 +-
 src/relay/op/op_common.h                      |   2 +-
 src/relay/op/tensor/reduce.cc                 | 172 ++++-
 src/relay/op/tensor/transform.cc              | 178 ++++-
 src/relay/pass/fold_scale_axis.cc             |   9 +-
 .../frontend/mxnet/model_zoo/__init__.py      |  59 ++
 .../python/frontend/mxnet/model_zoo/dcgan.py  |  66 ++
 tests/python/frontend/mxnet/model_zoo/dqn.py  |  27 +
 .../frontend/mxnet/model_zoo/inception_v3.py  | 170 +++++
 tests/python/frontend/mxnet/model_zoo/mlp.py  |  40 ++
 .../python/frontend/mxnet/model_zoo/resnet.py | 199 ++++++
 .../frontend/mxnet/model_zoo/squeezenet.py    |  76 +++
 tests/python/frontend/mxnet/model_zoo/vgg.py  |  85 +++
 tests/python/frontend/mxnet/test_forward.py   | 214 +++++++
 tests/python/frontend/mxnet/test_graph.py     | 101 +++
 tests/python/relay/test_op_level3.py          |   2 +-
 topi/include/topi/transform.h                 |  25 +-
 topi/include/topi/vision/yolo/region.h        |   2 +-
 39 files changed, 2228 insertions(+), 190 deletions(-)
 create mode 100644 python/tvm/relay/frontend/__init__.py
 create mode 100644 python/tvm/relay/frontend/common.py
 create mode 100644 python/tvm/relay/frontend/mxnet.py
 create mode 100644 python/tvm/relay/op/_reduce.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/__init__.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/dcgan.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/dqn.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/inception_v3.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/mlp.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/resnet.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/squeezenet.py
 create mode 100644 tests/python/frontend/mxnet/model_zoo/vgg.py
 create mode 100644 tests/python/frontend/mxnet/test_forward.py
 create mode 100644 tests/python/frontend/mxnet/test_graph.py

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 95581a54e5a1..e7fda319cb9c 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -82,6 +82,7 @@ This level enables additional math and transform operators.
    tvm.relay.reshape_like
    tvm.relay.copy
    tvm.relay.transpose
+   tvm.relay.squeeze
    tvm.relay.floor
    tvm.relay.ceil
    tvm.relay.trunc
@@ -114,7 +115,7 @@ This level enables additional math and transform operators.
    tvm.relay.less_equal
    tvm.relay.maximum
    tvm.relay.minimum
-   tvm.relay.pow
+   tvm.relay.power
    tvm.relay.where
    tvm.relay.argmax
    tvm.relay.argmin
@@ -196,6 +197,7 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.reshape
 .. autofunction:: tvm.relay.reshape_like
 .. autofunction:: tvm.relay.copy
+.. autofunction:: tvm.relay.squeeze
 .. autofunction:: tvm.relay.transpose
 .. autofunction:: tvm.relay.take
 .. autofunction:: tvm.relay.zeros
@@ -220,7 +222,7 @@ Level 4 Definitions
 .. autofunction:: tvm.relay.less_equal
 .. autofunction:: tvm.relay.maximum
 .. autofunction:: tvm.relay.minimum
-.. autofunction:: tvm.relay.pow
+.. autofunction:: tvm.relay.power
 .. autofunction:: tvm.relay.where
 .. autofunction:: tvm.relay.argmax
 .. autofunction:: tvm.relay.argmin
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 33f18a89e3e8..817ee04bd844 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -89,7 +89,7 @@ struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
   int axis;
 
   TVM_DECLARE_ATTRS(SoftmaxAttrs, "relay.attrs.SoftmaxAttrs") {
-      TVM_ATTR_FIELD(axis).set_default(1)
+      TVM_ATTR_FIELD(axis).set_default(-1)
           .describe("The axis to sum over when computing softmax.");
   }
 };
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 7a8129180c4d..39cd82de83e2 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -62,7 +62,7 @@ struct TransposeAttrs : public tvm::AttrsNode<TransposeAttrs> {
 
 /*! \brief Attributes used in reshape operators */
 struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
-  Array<IndexExpr> newshape;
+  Array<Integer> newshape;
   TVM_DECLARE_ATTRS(ReshapeAttrs, "relay.attrs.ReshapeAttrs") {
     TVM_ATTR_FIELD(newshape)
         .describe("The new shape. Should be compatible with the original shape.");
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 492208ed7a7c..6d8b75118a77 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -420,9 +420,9 @@ along which to split the array.
       return Array<Tensor>{
         topi::split_sections(inputs[0], param.indices_or_sections[0], param.axis) };
     } else {
-      Array<Expr> indices;
+      Array<Integer> indices;
       for (auto i : param.indices_or_sections) {
-        indices.push_back(tvm::make_const(tvm::Int(32), i));
+        indices.push_back(static_cast<int>(i));
       }
       return Array<Tensor>{ topi::split(inputs[0], indices, param.axis) };
     }
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 92e1e72fdac2..6b071f65a794 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -7,7 +7,7 @@
 from . import expr
 from . import module
 from . import ir_pass
-from .build_module import build, create_executor
+from .build_module import build, build_config, create_executor
 
 # Root operators
 from .op import Op
@@ -17,6 +17,7 @@
 from . import nn
 from . import vision
 from . import image
+from . import frontend
 from . import backend
 
 from .scope_builder import ScopeBuilder
@@ -40,6 +41,7 @@
 scalar_type = ty.scalar_type
 
 # Expr
+Expr = expr.Expr
 Constant = expr.Constant
 Tuple = expr.Tuple
 Var = expr.Var
diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index a02579e2ac7a..1f7ab18677c4 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -72,8 +72,18 @@ def lower(self, source_func, target=None):
         cached_func: CachedFunc
             The result of lowering.
         """
-        key = _get_cache_key(source_func, target)
-        return _backend._CompileEngineLower(self, key)
+        # pylint: disable=broad-except
+        try:
+            key = _get_cache_key(source_func, target)
+            return _backend._CompileEngineLower(self, key)
+        except Exception:
+            import traceback
+            msg = traceback.format_exc()
+            msg += "Error during compile func\n"
+            msg += "--------------------------\n"
+            msg += source_func.astext(show_meta_data=False)
+            msg += "--------------------------\n"
+            raise RuntimeError(msg)
 
     def jit(self, source_func, target=None):
         """JIT a source_func to a tvm.Function.
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
index 50568b58607b..4351fea6b459 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -357,4 +357,4 @@ def _get_unique_name(self, name):
             return name
         index = self._name_map[name]
         self._name_map[name] += 1
-        return self.get_unique_name(name + str(index))
+        return self._get_unique_name(name + str(index))
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 5a45ac276de9..d67bc89702d3 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -13,7 +13,7 @@
 # List of optimization pass and level when switch on
 OPT_PASS_LEVEL = {
     "SimplifyInference": 0,
-    "CombineParallelConv2D": 1,
+    "CombineParallelConv2D": 4,
     "OpFusion": 1,
     "FoldConstant": 2,
     "FoldScaleAxis": 3,
@@ -157,7 +157,6 @@ def optimize(func, params=None):
 
     if cfg.pass_enabled("FoldConstant"):
         func = ir_pass.fold_constant(func)
-
     return func
 
 
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
new file mode 100644
index 000000000000..28766b9ae3be
--- /dev/null
+++ b/python/tvm/relay/frontend/__init__.py
@@ -0,0 +1,4 @@
+"""Relay frontends."""
+from __future__ import absolute_import
+
+from .mxnet import from_mxnet
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
new file mode 100644
index 000000000000..8e037d4bc554
--- /dev/null
+++ b/python/tvm/relay/frontend/common.py
@@ -0,0 +1,129 @@
+"""Common utilities"""
+from __future__ import absolute_import as _abs
+
+
+class RequiredAttr(object):
+    """Dummpy class to represent required attr"""
+    pass
+
+
+class StrAttrsDict(object):
+    """Helper class to parse attrs stored as Dict[str, str].
+
+    Parameters
+    ----------
+    attrs : Dict[str, str]
+        The attributes to be used.
+    """
+    def __init__(self, attrs):
+        self.attrs = attrs
+
+    def get_float(self, key, default=RequiredAttr()):
+        """Get float attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            return float(self.attrs[key])
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int(self, key, default=RequiredAttr()):
+        """Get int attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            val = self.attrs[key]
+            if val == "None":
+                return None
+            return int(val)
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_str(self, key, default=RequiredAttr()):
+        """Get str attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            return self.attrs[key]
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int_tuple(self, key, default=RequiredAttr()):
+        """Get int tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(int(x.strip()) for x in tshape.strip('()').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_bool(self, key, default=RequiredAttr()):
+        """Get bool tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            val = self.attrs[key]
+            return val.strip().lower() in ['true', '1', 't', 'y', 'yes']
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
new file mode 100644
index 000000000000..9d1bd0deffa9
--- /dev/null
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -0,0 +1,606 @@
+# pylint: disable=invalid-name, import-self, len-as-condition
+"""MXNet symbol frontend."""
+from __future__ import absolute_import as _abs
+
+import json
+from .. import ir_pass
+from .. import expr as _expr
+from .. import op as _op
+from ... import nd as _nd
+from .common import StrAttrsDict
+
+__all__ = ['from_mxnet']
+
+
+def _get_relay_op(op_name):
+    op = getattr(_op, op_name)
+    if not op:
+        raise RuntimeError("Unable to map op_name {} to relay".format(op_name))
+    return op
+
+
+def _warn_not_used(attr, op='nnvm'):
+    import warnings
+    err = "{} is ignored in {}.".format(attr, op)
+    warnings.warn(err)
+
+
+def _rename(new_op):
+    if isinstance(new_op, str):
+        new_op = _get_relay_op(new_op)
+    # attrs are ignored.
+    def impl(inputs, _):
+        return new_op(*inputs)
+    return impl
+
+
+def _reshape(inputs, attrs):
+    if attrs.get_bool("reverse", False):
+        raise RuntimeError("reshape do not support option reverse")
+    shape = attrs.get_int_tuple("shape")
+    return _op.reshape(inputs[0], newshape=shape)
+
+
+def _init_op(new_op):
+    """Init ops like zeros/ones"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 0
+        shape = attrs.get_int_tuple("shape")
+        dtype = attrs.get_str("dtype", "float32")
+        return new_op(shape=shape, dtype=dtype)
+    return _impl
+
+
+def _softmax_op(new_op):
+    """softmax/log_softmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", -1)
+        return new_op(inputs[0], axis=axis)
+    return _impl
+
+
+def _reduce(new_op):
+    """Reduction ops like sum/min/max"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int_tuple("axis", [])
+        keepdims = attrs.get_bool("keepdims", False)
+        # use None for reduce over all axis.
+        axis = None if len(axis) == 0 else axis
+        return new_op(inputs[0], axis=axis, keepdims=keepdims)
+    return _impl
+
+
+def _arg_reduce(new_op):
+    """Arg Reduction ops like argmin/argmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", None)
+        keepdims = attrs.get_bool("keepdims", False)
+        res = new_op(inputs[0], axis=[axis], keepdims=keepdims)
+        # cast to dtype.
+        res = res.astype("float32")
+        return res
+    return _impl
+
+
+def _cast(inputs, attrs):
+    """Type cast"""
+    dtype = attrs.get_str("dtype")
+    return _op.cast(inputs[0], dtype=dtype)
+
+
+def _clip(inputs, attrs):
+    a_min = attrs.get_float("a_min")
+    a_max = attrs.get_float("a_max")
+    return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
+
+
+def _transpose(inputs, attrs):
+    axes = attrs.get_int_tuple("axes", None)
+    # translate default case
+    axes = None if len(axes) == 0 else axes
+    return _op.transpose(inputs[0], axes=axes)
+
+
+def _upsampling(inputs, attrs):
+    scale = attrs.get_int("scale")
+    return _op.nn.upsampling(inputs[0], scale=scale)
+
+
+def _elemwise_sum(inputs, _):
+    assert len(inputs) > 0
+    res = inputs[0]
+    for x in inputs[1:]:
+        res = _op.add(res, x)
+    return res
+
+
+def _binop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(inputs[0], scalar)
+    return _impl
+
+
+def _rbinop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(scalar, inputs[0])
+    return _impl
+
+# All the functions with _mx prefix specific to MXNet.
+# The functions without _mx prefix can be reused for
+# NNVMv1 conversion to _op.
+
+def _mx_fully_connected(inputs, attrs):
+    import mxnet as mx
+    units = attrs.get_int("num_hidden")
+    use_bias = not attrs.get_bool("no_bias", False)
+    try:
+        _ = mx.sym.FullyConnected(mx.sym.var("x"), num_hidden=1, flatten=True)
+        has_flatten = True
+    except mx.base.MXNetError:
+        # no flatten attribute in old mxnet
+        has_flatten = False
+    use_flatten = attrs.get_bool("flatten", True)
+    if has_flatten and use_flatten:
+        inputs[0] = _op.nn.batch_flatten(inputs[0])
+    res = _op.nn.dense(inputs[0], inputs[1], units=units)
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2])
+    return res
+
+
+def _get_channel_axis(layout, op_name):
+    if layout == "NCHW":
+        return 1
+    elif layout == "NHWC":
+        return 3
+    raise RuntimeError("layout: {} is not supported in {}".format(layout, op_name))
+
+
+def _mx_activations(inputs, attrs):
+    act_type = attrs.get_str("act_type")
+    assert len(inputs) == 1
+    if act_type == "sigmoid":
+        return _op.sigmoid(inputs[0])
+    elif act_type == "tanh":
+        return _op.tanh(inputs[0])
+    elif act_type == "relu":
+        return _op.nn.relu(inputs[0])
+    elif act_type == "softrelu":
+        def _stable_softrelu(x):
+            # log(1 + exp(-abs(x))) + relu(x)
+            one = _expr.const(1, dtype="float32")
+            exp_neg_abs_x = _op.exp(_op.negative(_op.abs(x)))
+            return _op.add(_op.log(_op.add(one, exp_neg_abs_x)),
+                           _op.nn.relu(x))
+        return _stable_softrelu(inputs[0])
+    raise RuntimeError("Do not support act_type: {}".format(act_type))
+
+
+def _mx_conv2d(inputs, attrs):
+    kernel_size = attrs.get_int_tuple("kernel")
+    if len(kernel_size) != 2:
+        raise RuntimeError("non-2d kernel is not supported in conv2d")
+    data_layout = attrs.get_str("layout", "NCHW")
+    channel_axis = _get_channel_axis(data_layout, "conv2d")
+
+    if "kernel_layout" in attrs.attrs:
+        weight_layout = attrs.get_str("kernel_layout")
+    else:
+        weight_layout = "HWIO" if data_layout == "NHWC" else "OIHW"
+
+    new_attrs = {}
+    new_attrs["channels"] = attrs.get_int("num_filter")
+    new_attrs["kernel_size"] = kernel_size
+    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
+    new_attrs["groups"] = attrs.get_int("num_group", 1)
+    new_attrs["data_layout"] = data_layout
+    new_attrs["weight_layout"] = weight_layout
+    use_bias = not attrs.get_bool("no_bias", False)
+    res = _op.nn.conv2d(inputs[0], inputs[1], **new_attrs)
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
+    return res
+
+
+def _mx_conv2d_transpose(inputs, attrs):
+    if "target_shape" in attrs.attrs:
+        raise RuntimeError("target_shape is not supported in conv2d_transpose")
+    kernel_size = attrs.get_int_tuple("kernel")
+    if len(kernel_size) != 2:
+        raise RuntimeError("non-2d kernel is not supported in conv2d")
+    data_layout = attrs.get_str("layout", "NCHW")
+    channel_axis = _get_channel_axis(data_layout, "conv2d_transpose")
+
+    if "kernel_layout" in attrs.attrs:
+        weight_layout = attrs.get_str("kernel_layout")
+    else:
+        weight_layout = "HWIO" if data_layout == "NHWC" else "OIHW"
+
+    new_attrs = {}
+    new_attrs["channels"] = attrs.get_int("num_filter")
+    new_attrs["kernel_size"] = kernel_size
+    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+    new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0, 0))
+    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
+    new_attrs["groups"] = attrs.get_int("num_group", 1)
+    new_attrs["data_layout"] = data_layout
+    new_attrs["weight_layout"] = weight_layout
+    use_bias = not attrs.get_bool("no_bias", False)
+    res = _op.nn.conv2d_transpose(inputs[0], inputs[1], **new_attrs)
+
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
+    return res
+
+
+def _mx_pooling(inputs, attrs):
+    global_pool = attrs.get_bool("global_pool", False)
+    pool_type = attrs.get_str("pool_type")
+
+    def _pool2d(new_op, is_avg):
+        kernel_size = attrs.get_int_tuple("kernel")
+        if len(kernel_size) != 2:
+            raise RuntimeError("non-2d kernel is not supported in pool2d")
+        new_attrs = {}
+        new_attrs["pool_size"] = kernel_size
+        new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+        new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+        new_attrs["ceil_mode"] = (attrs.get_str("pooling_convention", "valid") == "full")
+        if is_avg:
+            new_attrs["count_include_pad"] = attrs.get_bool("count_include_pad", True)
+        return new_op(inputs[0], **new_attrs)
+
+    if pool_type == "max":
+        if global_pool:
+            return _op.nn.global_max_pool2d(inputs[0])
+        return _pool2d(_op.nn.max_pool2d, False)
+    elif pool_type == "avg":
+        if global_pool:
+            return _op.nn.global_avg_pool2d(inputs[0])
+        return _pool2d(_op.nn.avg_pool2d, True)
+    raise RuntimeError("Do not support pool_type:{}".format(pool_type))
+
+
+def _mx_dropout(inputs, attrs):
+    rate = attrs.get_float("p", 0.5)
+    return _op.nn.dropout(inputs[0], rate=rate)
+
+
+def _mx_batch_norm(inputs, attrs):
+    if attrs.get_bool("output_mean_var", False):
+        raise RuntimeError("batch_norm do not support output_mean_var")
+    if attrs.get_bool("use_global_stats", False):
+        _warn_not_used("use_global_stats", "batch_norm")
+    new_attrs = {}
+    new_attrs["axis"] = attrs.get_int("axis", 1)
+    new_attrs["epsilon"] = attrs.get_float("eps", 0.001)
+    new_attrs["center"] = True
+    new_attrs["scale"] = not attrs.get_bool("fix_gamma", False)
+    return _op.nn.batch_norm(*inputs, **new_attrs)
+
+
+def _mx_split(inputs, attrs):
+    axis = attrs.get_int("axis", 1)
+    new_attrs = {}
+    new_attrs["indices_or_sections"] = attrs.get_int("num_outputs")
+    new_attrs["axis"] = axis
+    res = _op.split(inputs[0], **new_attrs)
+    if attrs.get_bool("squeeze_axis", False):
+        return tuple([_op.squeeze(x, axis=[axis]) for x in res])
+    return res
+
+
+def _mx_softmax_activation(inputs, attrs):
+    mode = attrs.get_str("mode", "instance")
+    axis = 0 if mode == "instance" else 1
+    return _op.nn.softmax(inputs[0], axis=axis)
+
+
+def _mx_softmax_output(inputs, attrs):
+    if attrs.get_bool("multi_output", False):
+        return _op.nn.softmax(inputs[0], axis=1)
+    return _op.nn.softmax(inputs[0])
+
+
+def _mx_concat(inputs, attrs):
+    axis = attrs.get_int("dim", 1)
+    return _op.concatenate(tuple(inputs), axis=axis)
+
+
+def _mx_expand_dims(inputs, attrs):
+    axis = attrs.get_int("axis")
+    return _op.expand_dims(inputs[0], axis=axis)
+
+
+def _mx_leaky_relu(inputs, attrs):
+    act_type = attrs.get_str("act_type")
+    if act_type == "leaky":
+        return _op.nn.leaky_relu(inputs[0], alpha=attrs.get_float("slope", 0.25))
+    elif act_type == "prelu":
+        assert len(inputs) == 2
+        return _op.nn.prelu(*inputs)
+    elif act_type == "elu":
+        # -slope * relu(1-exp(x)) + relu(x)
+        slope = attrs.get_float("slope", 0.25)
+        one = _expr.const(1, dtype="float32")
+        x = inputs[0]
+        mslope = _op.nn.relu(_op.subtract(one, _op.exp(x)))
+        mslope = _op.multiply(mslope, _expr.const(-slope, dtype="float32"))
+        return _op.add(mslope, _op.nn.relu(x))
+    elif act_type == "rrelu":
+        # NOTE this is only converted for inference.
+        lower_bound = attrs.get_float("lower_bound")
+        upper_bound = attrs.get_float("upper_bound")
+        alpha = (lower_bound + upper_bound) / 2.0
+        return _op.nn.leaky_relu(inputs[0], alpha=alpha)
+    raise RuntimeError("act_type: {} is not supported".format(act_type))
+
+
+def _mx_lrn(inputs, attrs):
+    new_attrs = {}
+    new_attrs["alpha"] = attrs.get_float("alpha", 0.0001)
+    new_attrs["beta"] = attrs.get_float("beta", 0.75)
+    new_attrs["bias"] = attrs.get_float("knorm", 2)
+    # NCHW format and normalization along channel axis
+    new_attrs["axis"] = 1
+    new_attrs["size"] = attrs.get_int("nsize")
+    assert len(inputs) == 1
+    return _op.nn.lrn(inputs[0], **new_attrs)
+
+
+# Note: due to attribute conversion constraint
+# ops in the identity set must be attribute free
+_identity_list = [
+    "log",
+    "exp",
+    "sigmoid",
+    "tanh",
+    "exp",
+    "negative",
+    "reshape_like",
+    "slice_like",
+    "zeros_like",
+    "ones_like",
+]
+
+_convert_map = {
+    "_copy"         : _rename(_op.copy),
+    "relu"          : _rename(_op.nn.relu),
+    "broadcast_add" : _rename(_op.add),
+    "broadcast_sub" : _rename(_op.subtract),
+    "broadcast_mul" : _rename(_op.multiply),
+    "broadcast_div" : _rename(_op.divide),
+    "elemwise_add"  : _rename(_op.add),
+    "elemwise_sub"  : _rename(_op.subtract),
+    "elemwise_mul"  : _rename(_op.multiply),
+    "elemwise_div"  : _rename(_op.divide),
+    "flatten"       : _rename(_op.nn.batch_flatten),
+    "Flatten"       : _rename(_op.nn.batch_flatten),
+    "_plus_scalar"  : _binop_scalar(_op.add),
+    "__add_scalar__": _binop_scalar(_op.add),
+    "__sub_scalar__": _binop_scalar(_op.subtract),
+    "_minus_scalar" : _binop_scalar(_op.subtract),
+    "__mul_scalar__": _binop_scalar(_op.multiply),
+    "_mul_scalar"   : _binop_scalar(_op.multiply),
+    "__div_scalar__": _binop_scalar(_op.divide),
+    "_div_scalar"   : _binop_scalar(_op.divide),
+    "__pow_scalar__": _binop_scalar(_op.power),
+    "_rminus_scalar": _rbinop_scalar(_op.subtract),
+    "__rsub_scalar__": _rbinop_scalar(_op.subtract),
+    "_rdiv_scalar"  : _rbinop_scalar(_op.divide),
+    "__rdiv_scalar__"  : _rbinop_scalar(_op.divide),
+    "__rpow_scalar__": _rbinop_scalar(_op.power),
+    # reduction ops
+    "max"           : _reduce(_op.max),
+    "min"           : _reduce(_op.min),
+    "sum"           : _reduce(_op.sum),
+    "max_axis"      : _reduce(_op.max),
+    "min_axis"      : _reduce(_op.min),
+    "sum_axis"      : _reduce(_op.sum),
+    "argmax"        : _arg_reduce(_op.argmax),
+    "argmin"        : _arg_reduce(_op.argmin),
+    # init ops
+    "_ones"         : _init_op(_op.ones),
+    "_zeros"        : _init_op(_op.zeros),
+    # softmax
+    "softmax"       : _softmax_op(_op.nn.softmax),
+    "log_softmax"   : _softmax_op(_op.nn.log_softmax),
+    "Softmax"       : _softmax_op(_op.nn.softmax),
+    # per op specialization
+    "Reshape"       : _reshape,
+    "reshape"       : _reshape,
+    "Cast"          : _cast,
+    "clip"          : _clip,
+    "transpose"     : _transpose,
+    "UpSampling"    : _upsampling,
+    "add_n"         : _elemwise_sum,
+    # MXNet specific implementations
+    "FullyConnected": _mx_fully_connected,
+    "Activation"    : _mx_activations,
+    "Convolution"   : _mx_conv2d,
+    "Convolution_v1": _mx_conv2d,
+    "Deconvolution" : _mx_conv2d_transpose,
+    "Pooling"       : _mx_pooling,
+    "Pooling_v1"    : _mx_pooling,
+    "Dropout"       : _mx_dropout,
+    "BatchNorm"     : _mx_batch_norm,
+    "BatchNorm_v1"  : _mx_batch_norm,
+    "LRN"           : _mx_lrn,
+    "SliceChannel"  : _mx_split,
+    "split"         : _mx_split,
+    "expand_dims"   : _mx_expand_dims,
+    "Concat"        : _mx_concat,
+    "concat"        : _mx_concat,
+    "LeakyReLU"     : _mx_leaky_relu,
+    "SoftmaxOutput" : _mx_softmax_output,
+    "SoftmaxActivation" : _mx_softmax_activation,
+    # List of missing operators that are present in NNVMv1
+    # TODO(tvm-tvm): support all operators.
+    #
+    # "broadcast_to",
+    # "gather_nd",
+    # "_contrib_MultiBoxPrior" : _rename("multibox_prior"),
+    # "_contrib_MultiBoxDetection" : _contrib_multibox_detection,
+    # "Crop"          : _crop_like,
+
+}
+
+# set identity list
+_convert_map.update({k : _rename(k) for k in _identity_list})
+
+
+def _from_mxnet_impl(symbol, shape_dict, dtype_info):
+    """Convert mxnet symbol to nnvm implementation.
+
+    Reconstruct a relay Function by traversing the mxnet symbol.
+
+    Parameters
+    ----------
+    symbol : mxnet.sym.Symbol
+        Incompatible symbol from mxnet.
+        The op_name and attrs inside are not always compatible.
+
+    shape_dict : dict
+        Known parameter shapes
+
+    dtype_info : dict or str.
+        Known parameter dtypes
+
+    Returns:
+    -------
+    nnvm.sym.Symbol
+        Converted symbol
+    """
+    assert symbol is not None
+    jgraph = json.loads(symbol.tojson())
+    jnodes = jgraph["nodes"]
+    node_map = {}
+
+    for nid, node in enumerate(jnodes):
+        children = [node_map[e[0]][e[1]] for e in node["inputs"]]
+        attrs = StrAttrsDict(node.get("attrs", {}))
+        node_name = node["name"]
+        op_name = node["op"]
+        if op_name == "null":
+            shape = shape_dict[node_name] if node_name in shape_dict else None
+            if isinstance(dtype_info, dict):
+                dtype = dtype_info[node_name] if node_name in dtype_dict else "float32"
+            else:
+                dtype = dtype_info
+            node_map[nid] = [_expr.var(node_name, shape=shape, dtype=dtype)]
+        elif op_name in _convert_map:
+            res = _convert_map[op_name](children, attrs)
+            if isinstance(res, (_expr.TupleWrapper, tuple, list)):
+                pass
+            elif isinstance(res, _expr.Expr):
+                res = [res]
+            else:
+                raise RuntimeError("unexpected type %s" % type(res))
+            node_map[nid] = res
+        else:
+            raise RuntimeError("{} is not supported in relay frontend".format(op_name))
+
+    outputs = [node_map[e[0]][e[1]] for e in jgraph["heads"]]
+    outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
+    func = _expr.Function(ir_pass.free_vars(outputs), outputs)
+    return func
+
+
+def _update_shape_dtype(shape, dtype, params):
+    """Update shape dtype given params information"""
+    shape = {} if shape is None else shape
+    if not params:
+        return shape, dtype
+    shape = shape.copy()
+    shape.update({k : v.shape for k, v in params.items()})
+    if isinstance(dtype, str):
+        for k, v in params.items():
+            if v.dtype != dtype:
+                raise ValueError(
+                    "%s: dtype not expected %s vs %s" % (k, dtype, v.dtype))
+    else:
+        dtype = dtype.copy()
+        dtype.update({k : str(v.dtype) for k, v in params.items()})
+    return shape, dtype
+
+
+def from_mxnet(symbol,
+               shape=None,
+               dtype="float32",
+               arg_params=None,
+               aux_params=None):
+    """Convert from MXNet"s model into compatible relay Function.
+
+    Parameters
+    ----------
+    symbol : mxnet.Symbol or mxnet.gluon.HybridBlock
+        MXNet symbol.
+
+    shape : dict of str to tuple, optional
+        The input shape to the graph
+
+    dtype : str or dict of str to str
+        The input types to the graph
+
+    arg_params : dict of str to mx.NDArray
+        The argument parameters in mxnet
+
+    aux_params : dict of str to mx.NDArray
+        The auxiliary parameters in mxnet
+
+    Returns
+    -------
+    sym : nnvm.Symbol
+        Compatible nnvm symbol
+
+    params : dict of str to tvm.NDArray
+        The parameter dict to be used by nnvm
+    """
+    try:
+        import mxnet as mx
+    except ImportError as e:
+        raise ImportError("{}. MXNet is required to parse symbols.".format(e))
+
+    if isinstance(symbol, mx.sym.Symbol):
+        params = {}
+        arg_params = arg_params if arg_params else {}
+        aux_params = aux_params if aux_params else {}
+        for k, v in arg_params.items():
+            params[k] = _nd.array(v.asnumpy())
+        for k, v in aux_params.items():
+            params[k] = _nd.array(v.asnumpy())
+        shape, dtype = _update_shape_dtype(shape, dtype, params)
+        sym = _from_mxnet_impl(symbol, shape, dtype)
+    elif isinstance(symbol, mx.gluon.HybridBlock):
+        if args_params is not None or aux_params is not None:
+            raise ValueError("arg_params and aux_params ae not used when importing HybridBlock")
+        params = {}
+        for k, v in symbol.collect_params().items():
+            params[k] = tvm.nd.array(v.data().asnumpy())
+        data = mx.sym.Variable("data")
+        sym = symbol(data)
+        shape, dtype = _update_shape_dtype(shape, dtype, params)
+        sym = _from_mxnet_impl(sym, shape, dtype)
+    elif isinstance(symbol, mx.gluon.Block):
+        raise NotImplementedError("Only Hybrid Blocks are supported now.")
+    else:
+        msg = "mxnet.Symbol or gluon.HybridBlock expected, got {}".format(type(symbol))
+        raise ValueError(msg)
+    return sym, params
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 30aef433d7c6..b32db4c23f3e 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -14,6 +14,7 @@
 # operator registry
 from . import _tensor
 from . import _transform
+from . import _reduce
 from ..expr import Expr
 from ..base import register_relay_node
 
diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py
new file mode 100644
index 000000000000..fd18c0e71d53
--- /dev/null
+++ b/python/tvm/relay/op/_reduce.py
@@ -0,0 +1,19 @@
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+
+import topi
+from . import op as _reg
+
+
+def _schedule_reduce(_, outs, target):
+    """Generic schedule for reduce"""
+    with target:
+        return topi.generic.schedule_reduce(outs)
+
+
+_reg.register_schedule("argmax", _schedule_reduce)
+_reg.register_schedule("argmin", _schedule_reduce)
+_reg.register_schedule("sum", _schedule_reduce)
+_reg.register_schedule("max", _schedule_reduce)
+_reg.register_schedule("prod", _schedule_reduce)
+_reg.register_schedule("mean", _schedule_reduce)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 7aef4d4377af..4832a195f9e8 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -273,4 +273,5 @@ def concatenate_compute(attrs, inputs, output_type, target):
     return [topi.concatenate(inputs, axis=attrs.axis)]
 
 register_schedule("concatenate", schedule_injective)
-register_pattern("concatenate", OpPattern.INJECTIVE)
+# TODO(tqchen): renable concat as injective
+register_pattern("concatenate", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index c05fbe8ec61e..d087526b7b88 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -1,63 +1,17 @@
-#pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
+# pylint: disable=invalid-name
 from __future__ import absolute_import
-import topi
-import topi.cuda
-from tvm import container
 from . import op as _reg
-from .op import (schedule_injective, register_compute, register_schedule,
-                 register_pattern, OpPattern)
 
-schedule_broadcast = schedule_injective
+schedule_injective = _reg.schedule_injective
+schedule_broadcast = _reg.schedule_injective
 
-# squeeze
-@register_compute("squeeze")
-def squeeze_compiler(attrs, inputs, output_type, target):
-    """Compiler for squeeze dims."""
-    assert len(inputs) == 1
-
-    if attrs.axis is None:
-        axis = None
-    elif isinstance(attrs.axis, container.Array):
-        axis = tuple(attrs.axis)
-    else:
-        axis = int(attrs.axis)
-
-    return [topi.squeeze(inputs[0], axis)]
-
-register_pattern("squeeze", OpPattern.INJECTIVE)
-register_schedule("squeeze", schedule_injective)
-
-# expand_dims
-@register_compute("expand_dims")
-def expand_dims_compiler(attrs, inputs, output_type, target):
-    """Compiler for expand_dims."""
-    assert len(inputs) == 1
-
-    new_axis = int(attrs.num_newaxis)
-    assert new_axis >= 0
-
-    # axis should be in range [-data.ndim - 1, data.ndim]
-    axis = int(attrs.axis)
-    assert axis >= -len(inputs[0].shape) - 1
-    assert axis <= len(inputs[0].shape)
-
-    return [topi.expand_dims(inputs[0], axis, new_axis)]
 
+_reg.register_schedule("squeeze", schedule_injective)
 _reg.register_schedule("expand_dims", schedule_broadcast)
-_reg.register_pattern("expand_dims", OpPattern.BROADCAST)
-
-# strided_slice
-_reg.register_schedule("strided_slice", schedule_injective)
-
-# slice_like
-_reg.register_schedule("slice_like", schedule_injective)
-_reg.register_pattern("slice_like", OpPattern.INJECTIVE)
-
-# reshape
 _reg.register_schedule("reshape", schedule_injective)
-_reg.register_pattern("reshape", OpPattern.INJECTIVE)
-
-# reshape_like
 _reg.register_schedule("reshape_like", schedule_injective)
-_reg.register_pattern("reshape_like", OpPattern.INJECTIVE)
+_reg.register_schedule("cast", schedule_broadcast)
+_reg.register_schedule("strided_slice", schedule_injective)
+_reg.register_schedule("slice_like", schedule_injective)
+_reg.register_schedule("split", schedule_injective)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index b48bfde97f33..9c988b86e8bc 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -1,5 +1,7 @@
 #pylint: disable=invalid-name, unused-argument
 """Backend compiler related feature registration"""
+from __future__ import absolute_import
+
 import topi
 from topi.util import get_const_int, get_const_tuple
 from .. import op as reg
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 61c930436167..63b1e206e72c 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -145,7 +145,7 @@ def conv2d_transpose(data,
                                   weight_layout, output_padding, out_dtype)
 
 
-def softmax(data, axis=1):
+def softmax(data, axis=-1):
     r"""Computes softmax.
 
     .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
@@ -169,7 +169,7 @@ def softmax(data, axis=1):
     return _make.softmax(data, axis)
 
 
-def log_softmax(data, axis):
+def log_softmax(data, axis=-1):
     r"""Computes log softmax.
 
     .. math::
diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py
index 96684c5d6e1d..491b221fbe0a 100644
--- a/python/tvm/relay/testing/inception_v3.py
+++ b/python/tvm/relay/testing/inception_v3.py
@@ -54,7 +54,7 @@ def Inception7A(data,
                       name=('%s_pool_%s_pool' % (pool, name)))
 
     cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
-    concat = relay.concatenate((tower_1x1, tower_5x5, tower_3x3, cproj), axis=0)
+    concat = relay.concatenate((tower_1x1, tower_5x5, tower_3x3, cproj), axis=1)
     return concat
 
 # First Downsample
@@ -72,7 +72,7 @@ def Inception7B(data,
                       name=('%s_tower' % name), suffix='_conv_2')
     pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
                       name=('max_pool_%s_pool' % name))
-    concat = relay.concatenate((tower_3x3, tower_d3x3, pooling), axis=0)
+    concat = relay.concatenate((tower_3x3, tower_d3x3, pooling), axis=1)
     return concat
 
 def Inception7C(data,
@@ -101,7 +101,7 @@ def Inception7C(data,
     cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
                  name=('%s_tower_2' % name), suffix='_conv')
     # concat
-    concat = relay.concatenate((tower_1x1, tower_d7, tower_q7, cproj), axis=0)
+    concat = relay.concatenate((tower_1x1, tower_d7, tower_q7, cproj), axis=1)
     return concat
 
 def Inception7D(data,
@@ -124,7 +124,7 @@ def Inception7D(data,
     pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
                       name=('%s_pool_%s_pool' % (pool, name)))
     # concat
-    concat = relay.concatenate((tower_3x3, tower_d7_3x3, pooling), axis=0)
+    concat = relay.concatenate((tower_3x3, tower_d7_3x3, pooling), axis=1)
     return concat
 
 def Inception7E(data,
@@ -153,7 +153,7 @@ def Inception7E(data,
                  suffix='_conv')
     # concat
     concat = relay.concatenate(
-        (tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj), axis=0)
+        (tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj), axis=1)
     return concat
 
 def get_net(batch_size,
diff --git a/python/tvm/relay/testing/squeezenet.py b/python/tvm/relay/testing/squeezenet.py
index fa55cafbf2b4..c7b8e8db166b 100644
--- a/python/tvm/relay/testing/squeezenet.py
+++ b/python/tvm/relay/testing/squeezenet.py
@@ -31,19 +31,21 @@
 from . import layers
 
 # Helpers
-def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
-    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels, prefix):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0, "%s_input" % prefix)
 
-    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
-    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0, "%s_left" % prefix)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1, "%s_right" % prefix)
     # NOTE : Assume NCHW layout here
     net = relay.concatenate((left, right), axis=1)
-
     return net
 
-def _make_fire_conv(net, channels, kernel_size, padding=0):
-    net = layers.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
-                        padding=(padding, padding), name="conv2d")
+def _make_fire_conv(net, channels, kernel_size, padding=0, prefix=""):
+    net = layers.conv2d(net,
+                        channels=channels,
+                        kernel_size=(kernel_size, kernel_size),
+                        padding=(padding, padding), name="%s_conv" % prefix)
+    net = relay.nn.bias_add(net, relay.var("%s_conv_bias" % prefix))
     net = relay.nn.relu(net)
     return net
 
@@ -75,41 +77,44 @@ def get_net(batch_size, image_shape, num_classes, version, dtype):
                             kernel_size=(7, 7),
                             strides=(2, 2),
                             padding=(3, 3),
-                            name="conv2d")
-        net = relay.nn.bias_add(net, relay.var("dense1_bias"))
+                            name="conv1")
+        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
         net = relay.nn.relu(net)
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 16, 64, 64)
-        net = _make_fire(net, 16, 64, 64)
-        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 16, 64, 64, "fire1")
+        net = _make_fire(net, 16, 64, 64, "fire2")
+        net = _make_fire(net, 32, 128, 128, "fire3")
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 32, 128, 128)
-        net = _make_fire(net, 48, 192, 192)
-        net = _make_fire(net, 48, 192, 192)
-        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 32, 128, 128, "fire4")
+        net = _make_fire(net, 48, 192, 192, "fire5")
+        net = _make_fire(net, 48, 192, 192, "fire6")
+        net = _make_fire(net, 64, 256, 256, "fire7")
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256, "fire8")
     else:
         net = layers.conv2d(net,
                             channels=64,
                             kernel_size=(3, 3),
                             strides=(2, 2),
                             padding=(1, 1),
-                            name="conv2d")
+                            name="conv1")
+        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
         net = relay.nn.relu(net)
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 16, 64, 64)
-        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64, "fire1")
+        net = _make_fire(net, 16, 64, 64, "fire2")
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 32, 128, 128)
-        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128, "fire3")
+        net = _make_fire(net, 32, 128, 128, "fire4")
         net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
-        net = _make_fire(net, 48, 192, 192)
-        net = _make_fire(net, 48, 192, 192)
-        net = _make_fire(net, 64, 256, 256)
-        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 48, 192, 192, "fire5")
+        net = _make_fire(net, 48, 192, 192, "fire6")
+        net = _make_fire(net, 64, 256, 256, "fire7")
+        net = _make_fire(net, 64, 256, 256, "fire8")
     net = relay.nn.dropout(net, rate=0.5)
-    net = layers.conv2d(net, channels=num_classes, kernel_size=(1, 1), name="conv2d")
+    net = layers.conv2d(
+        net, channels=num_classes, kernel_size=(1, 1), name="conv_final")
+    net = relay.nn.bias_add(net, relay.var("conv_final_bias"))
     net = relay.nn.relu(net)
     net = relay.nn.global_avg_pool2d(net)
     net = relay.nn.batch_flatten(net)
@@ -117,8 +122,12 @@ def get_net(batch_size, image_shape, num_classes, version, dtype):
     args = relay.ir_pass.free_vars(net)
     return relay.Function(args, net)
 
-def get_workload(batch_size=1, num_classes=1000, version='1.0',
-                 image_shape=(3, 224, 224), dtype="float32"):
+
+def get_workload(batch_size=1,
+                 num_classes=1000,
+                 version='1.0',
+                 image_shape=(3, 224, 224),
+                 dtype="float32"):
     """Get benchmark workload for SqueezeNet
 
     Parameters
diff --git a/python/tvm/relay/testing/vgg.py b/python/tvm/relay/testing/vgg.py
index 7ec6669f6346..811de33c579a 100644
--- a/python/tvm/relay/testing/vgg.py
+++ b/python/tvm/relay/testing/vgg.py
@@ -24,20 +24,24 @@
 from .init import create_workload
 from . import layers as wrapper
 
-def get_feature(internel_layer, layers, filters, batch_norm=False):
+
+def get_feature(internal_layer, layers, filters, batch_norm=False):
     """Get VGG feature body as stacks of convoltions."""
     for i, num in enumerate(layers):
         for j in range(num):
-            internel_layer = wrapper.conv2d(
-                data=internel_layer, kernel_size=(3, 3), padding=(1, 1),
-                channels=filters[i], name="conv%s_%s"%(i + 1, j + 1))
+            internal_layer = wrapper.conv2d(
+                data=internal_layer, kernel_size=(3, 3), padding=(1, 1),
+                channels=filters[i], name="conv%s_%s" % (i + 1, j + 1))
+            internal_layer = relay.nn.bias_add(
+                internal_layer, relay.var("conv%s_%s_bias" % (i + 1, j + 1)))
             if batch_norm:
-                internel_layer = wrapper.batch_norm_infer(
-                    data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
-            internel_layer = relay.nn.relu(data=internel_layer)
-        internel_layer = relay.nn.max_pool2d(
-            data=internel_layer, pool_size=(2, 2), strides=(2, 2))
-    return internel_layer
+                internal_layer = wrapper.batch_norm_infer(
+                    data=internal_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internal_layer = relay.nn.relu(data=internal_layer)
+        internal_layer = relay.nn.max_pool2d(
+            data=internal_layer, pool_size=(2, 2), strides=(2, 2))
+    return internal_layer
+
 
 def get_classifier(input_data, num_classes):
     """Get VGG classifier layers as fc layers."""
@@ -51,6 +55,7 @@ def get_classifier(input_data, num_classes):
     fc8 = wrapper.dense_add_bias(data=drop7, units=num_classes, name="fc8")
     return fc8
 
+
 def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_norm=False):
     """
     Parameters
@@ -68,7 +73,7 @@ def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_no
         The data type
 
     num_layers : int
-        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
 
     batch_norm : bool, default False
         Use batch normalization.
@@ -88,7 +93,12 @@ def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_no
     args = relay.ir_pass.free_vars(symbol)
     return relay.Function(args, symbol)
 
-def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype="float32"):
+
+def get_workload(batch_size,
+                 num_classes=1000,
+                 image_shape=(3, 224, 224),
+                 dtype="float32",
+                 num_layers=11):
     """Get benchmark workload for VGG nets.
 
     Parameters
@@ -105,6 +115,9 @@ def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype=
     dtype : str, optional
         The data type
 
+    num_layers : int
+        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
+
     Returns
     -------
     net : nnvm.Symbol
@@ -113,5 +126,5 @@ def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype=
     params : dict of str to NDArray
         The parameters.
     """
-    net = get_net(batch_size, image_shape, num_classes, dtype)
+    net = get_net(batch_size, image_shape, num_classes, dtype, num_layers)
     return create_workload(net)
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index 9257ad3b5490..69b7ec1f6e60 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -163,6 +163,7 @@ class AttrsHashHandler :
    * \param node The node to be hashed.
    */
   size_t Hash(const NodeRef& node) {
+    if (!node.defined()) return 0;
     return this->VisitAttr(node);
   }
 
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index f3c3e2935d22..5001e2cd4fea 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -31,7 +31,10 @@ class StorageAllocaBaseVisitor : public ExprVisitor {
     for (Var param : func->params) {
       CreateToken(param.operator->(), false);
     }
-    this->VisitExpr(func->body);
+    // must always keep output alive.
+    for (StorageToken* tok : GetToken(func->body)) {
+      tok->ref_counter += 1;
+    }
   }
 
   void VisitExpr_(const ConstantNode* op) final {
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
index 4c814bc1614f..5bb2f24cae81 100644
--- a/src/relay/op/op_common.h
+++ b/src/relay/op/op_common.h
@@ -16,7 +16,7 @@ namespace tvm {
 namespace relay {
 
 template<typename T>
-std::vector<T> AsVector(const Array<T> &array) {
+inline std::vector<T> AsVector(const Array<T> &array) {
     std::vector<T> result;
     result.reserve(array.size());
     for (const T& ele : array) {
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index 0a955fad631b..95c26c3ab7e4 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -5,6 +5,8 @@
  */
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
+#include <topi/elemwise.h>
+#include <topi/reduction.h>
 #include <numeric>
 #include <limits>
 #include "../op_common.h"
@@ -15,12 +17,12 @@ namespace relay {
 
 /*! \brief Attributes for Reduce operators */
 struct ReduceAttrs : public tvm::AttrsNode<ReduceAttrs> {
-  Array<IndexExpr> axis;
+  Array<Integer> axis;
   bool keepdims;
   bool exclude;
 
   TVM_DECLARE_ATTRS(ReduceAttrs, "relay.attrs.ReduceAttrs") {
-    TVM_ATTR_FIELD(axis).set_default(NullValue<Array<IndexExpr>>())
+    TVM_ATTR_FIELD(axis).set_default(NullValue<Array<Integer>>())
         .describe(R"code(The axis or axes along which to perform the reduction.
 
       The default, `axis=()`, will compute over all elements into a
@@ -50,7 +52,7 @@ struct ReduceAttrs : public tvm::AttrsNode<ReduceAttrs> {
 * \return r_axes The new reduced axes of the output.
 */
 inline std::vector<int64_t> GetReduceAxes(const uint32_t indim,
-                                          const Array<IndexExpr>& inaxis,
+                                          const Array<Integer>& inaxis,
                                           bool exclude) {
   if (!inaxis.defined()) {
     std::vector<int64_t> r_axes(indim);
@@ -60,9 +62,7 @@ inline std::vector<int64_t> GetReduceAxes(const uint32_t indim,
 
   std::vector<int64_t> in_axes;
   for (auto i : inaxis) {
-    const int64_t* k = as_const_int(i);
-    CHECK(k != nullptr) << "Reduce axis need to be constant, cannot be symbolic";
-    int64_t axis = k[0];
+    int64_t axis = i->value;
     if (axis < 0) {
       axis = axis + indim;
     }
@@ -97,6 +97,53 @@ inline std::vector<int64_t> GetReduceAxes(const uint32_t indim,
   return r_axes;
 }
 
+
+// Get axis under exclude condition.
+Array<Integer> GetExcludeAxes(size_t indim,
+                              const Array<Integer>& inaxis) {
+  std::vector<bool> axis_flag(indim, true);
+  for (auto i : inaxis) {
+    int64_t axis = i->value;
+    if (axis < 0) {
+      axis = axis + static_cast<int64_t>(indim);
+    }
+    // Check out of bounds error
+    CHECK_GE(axis, 0)
+      << "Axis out of bounds in reduce operator.";
+    CHECK_LT(axis, static_cast<int64_t>(indim))
+      << "Axis out of bounds in reduce operator.";
+    axis_flag[axis] = false;
+  }
+
+  Array<Integer> r_axes;
+
+  for (size_t i = 0; i < axis_flag.size(); ++i) {
+    if (axis_flag[i]) {
+      r_axes.push_back(static_cast<int>(i));
+    }
+  }
+  return r_axes;
+}
+
+
+template<typename F>
+Array<Tensor> ReduceCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target,
+                            F f) {
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+  auto axes = param->axis;
+  if (param->exclude) {
+    axes = GetExcludeAxes(inputs[0]->shape.size(), param->axis);
+  }
+  if (axes.size() == 0) {
+    return { topi::identity(inputs[0]) };
+  }
+  return { f(inputs[0], axes, param->keepdims, false) };
+}
+
 /*!
 * \brief ReduceShapeImpl get the outshape for the reduction operator
 * \param in_shape Shape of input data.
@@ -200,7 +247,7 @@ bool ReduceRel(const Array<Type>& types,
   TVM_REGISTER_API("relay.op._make." OpName)                       \
   .set_body([](const TVMArgs& args, TVMRetValue* rv) {             \
     auto make_func = [](Expr data,                                 \
-                        Array<IndexExpr> axis,                     \
+                        Array<Integer> axis,                       \
                         bool keepdims,                             \
                         bool exclude) {                            \
       auto attrs = make_node<ReduceAttrs>();                       \
@@ -217,6 +264,14 @@ bool ReduceRel(const Array<Type>& types,
   .add_argument("data", "Tensor", "The input tensor.")
 
 
+Array<Tensor> ArgMaxCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::argmax);
+}
+
+
 RELAY_REGISTER_REDUCE_OP("argmax")
 .describe(R"code(Creates an operation that finds the indices of the maximum
 values over a given axis.
@@ -224,8 +279,17 @@ values over a given axis.
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("ArgReduce", ArgReduceRel);
+.add_type_rel("ArgReduce", ArgReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ArgMaxCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
 
+Array<Tensor> ArgMinCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::argmin);
+}
 
 RELAY_REGISTER_REDUCE_OP("argmin")
 .describe(R"code(Creates an operation that finds the indices of the minimum
@@ -234,7 +298,16 @@ values over a given axis.
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("ArgReduce", ArgReduceRel);
+.add_type_rel("ArgReduce", ArgReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ArgMinCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+Array<Tensor> SumCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::sum);
+}
 
 
 RELAY_REGISTER_REDUCE_OP("sum")
@@ -257,16 +330,35 @@ Example::
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("Reduce", ReduceRel);
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", SumCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
 
 
+Array<Tensor> MaxCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::max);
+}
+
 RELAY_REGISTER_REDUCE_OP("max")
 .describe(R"code(Computes the max of array elements over given axes.
 
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("Reduce", ReduceRel);
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MaxCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> MinCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::min);
+}
 
 
 RELAY_REGISTER_REDUCE_OP("min")
@@ -275,11 +367,20 @@ RELAY_REGISTER_REDUCE_OP("min")
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("Reduce", ReduceRel);
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MinCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
 
 
-RELAY_REGISTER_REDUCE_OP("mean")
-.describe(R"code(Computes the mean of array elements over given axes.
+Array<Tensor> ProdCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::prod);
+}
+
+RELAY_REGISTER_REDUCE_OP("prod")
+.describe(R"code(Computes the products of array elements over given axes.
 
 Example::
 
@@ -287,20 +388,40 @@ Example::
           [[1,4],[4,3],[5,2]],
           [[7,1],[7,2],[7,3]]]
 
-  mean(data)
-  [3.22]
+  mean(data, axis=1)
+  [35562240]
 
   mean(data, axis=[1,2])
-  [ 2.  3.16666667  4.5]
+  [ 36  480  2058]
 
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("Reduce", ReduceRel);
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ProdCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
 
 
-RELAY_REGISTER_REDUCE_OP("prod")
-.describe(R"code(Computes the products of array elements over given axes.
+Array<Tensor> MeanCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  IndexExpr count = make_const(inputs[0]->dtype, 1);
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+  auto axes = param->axis;
+  for (int64_t i : GetReduceAxes(inputs[0]->shape.size(),
+                                 param->axis,
+                                 param->exclude)) {
+    count *= inputs[0]->shape[i];
+  }
+  auto res = ReduceCompute(attrs, inputs, out_type, target, topi::sum);
+  return {topi::divide(res[0], count)};
+}
+
+
+RELAY_REGISTER_REDUCE_OP("mean")
+.describe(R"code(Computes the mean of array elements over given axes.
 
 Example::
 
@@ -308,16 +429,17 @@ Example::
           [[1,4],[4,3],[5,2]],
           [[7,1],[7,2],[7,3]]]
 
-  mean(data, axis=1)
-  [35562240]
+  mean(data)
+  [3.22]
 
   mean(data, axis=[1,2])
-  [ 36  480  2058]
+  [ 2.  3.16666667  4.5]
 
 )code" TVM_ADD_FILELINE)
 .set_attrs_type_key("relay.attrs.ReduceAttrs")
 .set_support_level(4)
-.add_type_rel("Reduce", ReduceRel);
-
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MeanCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 52363e8af92a..83a4c9067f43 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -8,9 +8,10 @@
 #include <tvm/ir_operator.h>
 #include <tvm/ir.h>
 #include <topi/transform.h>
+#include <topi/elemwise.h>
 #include <vector>
 #include "../op_common.h"
-
+#include "../../../arithmetic/compute_expr.h"
 
 namespace tvm {
 namespace relay {
@@ -37,6 +38,16 @@ bool CastRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> CastCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const CastAttrs *param = attrs.as<CastAttrs>();
+  CHECK(param != nullptr);
+  DataType dtype = param->dtype;
+  return { topi::cast(inputs[0], dtype) };
+}
+
 Expr MakeCast(Expr data,
               DataType dtype) {
   auto attrs = make_node<CastAttrs>();
@@ -58,8 +69,9 @@ RELAY_REGISTER_OP("cast")
 .set_attrs_type_key("relay.attrs.CastAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
-.add_type_rel("Cast", CastRel);
-
+.add_type_rel("Cast", CastRel)
+.set_attr<FTVMCompute>("FTVMCompute", CastCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
 
 // relay.expand_dims
 TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
@@ -104,6 +116,15 @@ bool ExpandDimsRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> ExpandDimsCompute(const Attrs& attrs,
+                                const Array<Tensor>& inputs,
+                                const Type& out_type,
+                                const Target& target) {
+  const ExpandDimsAttrs *param = attrs.as<ExpandDimsAttrs>();
+  CHECK(param != nullptr);
+  return { topi::expand_dims(inputs[0], param->axis, param->num_newaxis) };
+}
+
 Expr MakeExpandDims(Expr data,
                     int axis,
                     int num_newaxis) {
@@ -129,7 +150,9 @@ RELAY_REGISTER_OP("expand_dims")
 .set_attrs_type_key("relay.attrs.ExpandDimsAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
-.add_type_rel("ExpandDims", ExpandDimsRel);
+.add_type_rel("ExpandDims", ExpandDimsRel)
+.set_attr<FTVMCompute>("FTVMCompute", ExpandDimsCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 TVM_REGISTER_NODE_TYPE(ConcatenateAttrs);
 
@@ -303,13 +326,81 @@ bool ReshapeRel(const Array<Type>& types,
         << types[0];
     return false;
   }
+
   const auto* param = attrs.as<ReshapeAttrs>();
-  reporter->Assign(types[1], TensorTypeNode::make(param->newshape, data->dtype));
+  Array<IndexExpr> oshape;
+  size_t src_idx = 0;
+  int infer_idx = -1;
+
+  for (size_t i = 0; i < param->newshape.size(); ++i) {
+    int svalue = param->newshape[i]->value;
+    // special flag handling for shape inference.
+    if (svalue > 0) {
+      oshape.push_back(param->newshape[i]);
+      ++src_idx;
+    } else if (svalue == 0) {
+      // keep same
+      CHECK_LT(src_idx, data->shape.size());
+      oshape.push_back(data->shape[src_idx++]);
+    } else if (svalue == -1) {
+      // inference based on rest
+      CHECK_LT(infer_idx, 0)
+          << "One and only one dim can be inferred";
+      infer_idx = i;
+      oshape.push_back(1);
+      ++src_idx;
+    } else if (svalue == -2) {
+      // copy all remaining dims from source
+      while (src_idx < data->shape.size()) {
+        oshape.push_back(data->shape[src_idx++]);
+      }
+    } else if (svalue == -3) {
+      // merge two dims from source
+      CHECK_LT(src_idx + 1, data->shape.size());
+      IndexExpr d1 = data->shape[src_idx++];
+      IndexExpr d2 = data->shape[src_idx++];
+      oshape.push_back(d1 * d2);
+    } else if (svalue == -4) {
+      // split the source dim s into two dims
+      // read the left dim and then the right dim (either can be -1)
+      CHECK_LT(i + 2, param->newshape.size());
+      CHECK_LT(src_idx, data->shape.size());
+      IndexExpr d0 = data->shape[src_idx++];
+      Integer d1 = param->newshape[++i];
+      Integer d2 = param->newshape[++i];
+      if (d1->value == -1) {
+        CHECK(d2->value != -1)
+            << "Split dims cannot both be -1.";
+        oshape.push_back(d0 / d2);
+        oshape.push_back(d2);
+      } else {
+        CHECK_EQ(d2->value, -1);
+        oshape.push_back(d1);
+        oshape.push_back(d0 / d1);
+      }
+    }
+  }
+
+  if (infer_idx >= 0) {
+    IndexExpr new_size = arith::ComputeReduce<tvm::ir::Mul>(oshape, 1);
+    IndexExpr old_size = arith::ComputeReduce<tvm::ir::Mul>(data->shape, 1);
+    oshape.Set(infer_idx, old_size / new_size);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
   return true;
 }
 
+Array<Tensor> ReshapeCompute(const Attrs& attrs,
+                             const Array<Tensor>& inputs,
+                             const Type& out_type,
+                             const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::reshape(inputs[0], out_ttype->shape) };
+}
+
 Expr MakeReshape(Expr data,
-                 Array<IndexExpr> newshape) {
+                 Array<Integer> newshape) {
   auto attrs = make_node<ReshapeAttrs>();
   attrs->newshape = std::move(newshape);
   static const Op& op = Op::Get("reshape");
@@ -377,14 +468,8 @@ Example::
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
 .add_type_rel("Reshape", ReshapeRel)
-.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
-                                         const Array<Tensor>& inputs,
-                                         const Type& out_type,
-                                         const Target& target) {
-  const auto* param = attrs.as<ReshapeAttrs>();
-  CHECK(param != nullptr);
-  return Array<Tensor>{ topi::reshape(inputs[0], param->newshape) };
-});
+.set_attr<FTVMCompute>("FTVMCompute", ReshapeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 
 /*!
@@ -440,12 +525,8 @@ the input array into an output array with the same shape as the second input arr
 .add_argument("shape_like", "Tensor", "Shape tensor.")
 .set_support_level(3)
 .add_type_rel("ReshapeLike", ReshapeLikeRel)
-.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
-                                         const Array<Tensor>& inputs,
-                                         const Type& out_type,
-                                         const Target& target) {
-  return Array<Tensor>{ topi::reshape(inputs[0], inputs[1]->shape) };
-});
+.set_attr<FTVMCompute>("FTVMCompute", ReshapeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 
 // Take
@@ -788,6 +869,7 @@ TVM_REGISTER_API("relay.op._make.squeeze")
     runtime::detail::unpack_call<Expr, 2>(MakeSqueeze, args, rv);
   });
 
+
 bool SqueezeRel(const Array<Type>& types,
                 int num_inputs,
                 const Attrs& attrs,
@@ -816,7 +898,13 @@ bool SqueezeRel(const Array<Type>& types,
       original_shape.push_back(std::pair<IndexExpr, bool>(e, true));
     }
     for (const auto& e : param->axis) {
-      original_shape.at(e->value).second = false;
+      int64_t axis_val = e->value;
+      if (axis_val < 0) {
+        axis_val += static_cast<int64_t>(original_shape.size());
+      }
+      CHECK_GE(axis_val, 0);
+      CHECK_LT(axis_val, original_shape.size());
+      original_shape.at(axis_val).second = false;
     }
     for (const auto p : original_shape) {
       if (p.second) {
@@ -832,6 +920,16 @@ bool SqueezeRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> SqueezeCompute(const Attrs& attrs,
+                             const Array<Tensor>& inputs,
+                             const Type& out_type,
+                             const Target& target) {
+  const SqueezeAttrs *param = attrs.as<SqueezeAttrs>();
+  CHECK(param != nullptr);
+  return { topi::squeeze(inputs[0], param->axis) };
+}
+
+
 RELAY_REGISTER_OP("squeeze")
 .describe(R"code(Squeeze the input tensor at the dimensions given by axes
 
@@ -842,7 +940,10 @@ RELAY_REGISTER_OP("squeeze")
 .set_attrs_type_key("relay.attrs.SqueezeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
-.add_type_rel("Squeeze", SqueezeRel);
+.add_type_rel("Squeeze", SqueezeRel)
+.set_attr<FTVMCompute>("FTVMCompute", SqueezeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
 
 // Have no idea how to assert the constraint.
 // CollapseSumLike: <A, B> -> B where BroadCast(A, B) = A
@@ -1034,8 +1135,8 @@ Array<Tensor> StridedSliceCompute(const Attrs& attrs,
 
 
 TVM_REGISTER_API("relay.op._make.strided_slice")
-  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 4>(MakeStridedSlice, args, rv);
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 4>(MakeStridedSlice, args, rv);
   });
 
 
@@ -1082,7 +1183,7 @@ bool SplitRel(const Array<Type>& types,
   // `types` contains: [data, result]
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
-  CHECK(data != nullptr);
+  if (data == nullptr) return false;
   CHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty";
   const auto param = attrs.as<SplitAttrs>();
   CHECK(param != nullptr);
@@ -1131,6 +1232,23 @@ bool SplitRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> SplitCompute(const Attrs& attrs,
+                           const Array<Tensor>& inputs,
+                           const Type& out_type,
+                           const Target& target) {
+  const auto param = attrs.as<SplitAttrs>();
+  CHECK(param != nullptr);
+
+  if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
+    int64_t num_sections = sections->value;
+    return Array<Tensor>{
+      topi::split_sections(inputs[0], num_sections, param->axis) };
+  } else {
+    auto indices = Downcast<Array<Integer> >(param->indices_or_sections);
+    return Array<Tensor>{ topi::split(inputs[0], indices, param->axis) };
+  }
+}
+
 Expr MakeSplit(Expr data,
                NodeRef indices_or_sections,
                int axis) {
@@ -1165,7 +1283,9 @@ the entries indicate where along axis the array is split.
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
-.add_type_rel("Split", SplitRel);
+.add_type_rel("Split", SplitRel)
+.set_attr<FTVMCompute>("FTVMCompute", SplitCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 
 TVM_REGISTER_NODE_TYPE(SliceLikeAttrs);
@@ -1249,12 +1369,11 @@ Array<Integer> GetIntArray(Array<IndexExpr> arr) {
   return Array<Integer>(arr.node_);
 }
 
-template<typename AttrType>
 Array<Tensor> SliceLikeCompute(const Attrs& attrs,
                                const Array<Tensor>& inputs,
                                const Type& out_type,
                                const Target& target) {
-  const auto* param = attrs.as<AttrType>();
+  const auto* param = attrs.as<SliceLikeAttrs>();
   CHECK(param != nullptr);
   Array<IndexExpr> src_shape = inputs[0]->shape;
   Array<IndexExpr> target_shape = inputs[1]->shape;
@@ -1312,7 +1431,8 @@ RELAY_REGISTER_OP("slice_like")
 .add_argument("shape_like", "Tensor", "Shape tensor.")
 .set_support_level(10)
 .add_type_rel("SliceLike", SliceLikeRel)
-.set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute<SliceLikeAttrs>);
+.set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index 96fe030c2d03..bcb91e7e5737 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -344,6 +344,7 @@ Expr MultiplyForwardRewrite(const Call& ref_call,
                             const Array<Expr>& new_args,
                             const AxesSet& expected_out_axes) {
   if (!expected_out_axes.defined()) return Expr();
+  if (expected_out_axes.size() == 0) return Expr();
   // TODO(tvm-team) allow same axes accumulation
   // not as important because it is less common in nn.
   const auto* slhs = new_args[0].as<ScaledExprNode>();
@@ -681,7 +682,9 @@ AxesSet AddSubBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
     // add of two elements.
     return in_axes[0];
   } else {
-    return NullValue<AxesSet>();
+    auto res = NullValue<AxesSet>();
+    CHECK(!res.defined());
+    return res;
   }
 }
 
@@ -751,14 +754,14 @@ Expr MultiplyBackwardTransform(const Call& call,
   const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
   AxesSet lhs_axes = transformer->GetExpectedAxes(call->args[0]);
   AxesSet rhs_axes = transformer->GetExpectedAxes(call->args[1]);
-  if (lhs_axes.defined()) {
+  if (lhs_axes.defined() && lhs_axes.size() != 0) {
     // NOTE we won't recursively call mutating on scale part.
     // since there  won't be scale chance within scale part.
     Expr rhs = call->args[1];
     if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs)) {
       return transformer->Transform(call->args[0], lhs_axes, rhs);
     }
-  } else if (rhs_axes.defined()) {
+  } else if (rhs_axes.defined() && rhs_axes.size() != 0) {
     Expr lhs = call->args[0];
     if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs)) {
       return transformer->Transform(call->args[1], rhs_axes, lhs);
diff --git a/tests/python/frontend/mxnet/model_zoo/__init__.py b/tests/python/frontend/mxnet/model_zoo/__init__.py
new file mode 100644
index 000000000000..eba8f8df0bba
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/__init__.py
@@ -0,0 +1,59 @@
+"""MXNet model zoo for testing purposes."""
+from __future__ import absolute_import
+from . import mlp, vgg, resnet, dqn, inception_v3, squeezenet, dcgan
+import tvm.relay.testing
+
+# mlp
+def mx_mlp():
+    num_class = 10
+    return mlp.get_symbol(num_class)
+
+def relay_mlp():
+    num_class = 10
+    return tvm.relay.testing.mlp.get_workload(1, num_class)[0]
+
+# vgg
+def mx_vgg(num_layers):
+    num_class = 1000
+    return vgg.get_symbol(num_class, num_layers)
+
+def relay_vgg(num_layers):
+    num_class = 1000
+    return tvm.relay.testing.vgg.get_workload(
+        1, num_class, num_layers=num_layers)[0]
+
+# resnet
+def mx_resnet(num_layers):
+    num_class = 1000
+    return resnet.get_symbol(num_class, num_layers, '3,224,224')
+
+def relay_resnet(num_layers):
+    num_class = 1000
+    return tvm.relay.testing.resnet.get_workload(
+        1, num_class, num_layers=num_layers)[0]
+
+
+# dqn
+mx_dqn = dqn.get_symbol
+
+def relay_dqn():
+    return tvm.relay.testing.dqn.get_workload(1)[0]
+
+# squeezenet
+def mx_squeezenet(version):
+    return squeezenet.get_symbol(version=version)
+
+def relay_squeezenet(version):
+    return tvm.relay.testing.squeezenet.get_workload(1, version=version)[0]
+
+# inception
+mx_inception_v3 = inception_v3.get_symbol
+
+def relay_inception_v3():
+    return tvm.relay.testing.inception_v3.get_workload(1)[0]
+
+# dcgan generator
+mx_dcgan = dcgan.get_symbol
+
+def relay_dcgan(batch_size):
+    return tvm.relay.testing.dcgan.get_workload(batch_size=batch_size)[0]
diff --git a/tests/python/frontend/mxnet/model_zoo/dcgan.py b/tests/python/frontend/mxnet/model_zoo/dcgan.py
new file mode 100644
index 000000000000..8af030b6b184
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/dcgan.py
@@ -0,0 +1,66 @@
+# pylint: disable=unused-argument
+"""
+The MXNet symbol of DCGAN generator
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+
+import mxnet as mx
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = mx.sym.Deconvolution(data,
+                               kernel=kshape,
+                               stride=stride,
+                               pad=(pad_y, pad_x),
+                               adj=(adj_y, adj_x),
+                               num_filter=oshape[0],
+                               no_bias=True,
+                               name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = mx.sym.BatchNorm(net, eps=eps, name="%s_bn" % prefix)
+    net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
+    return net
+
+def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
+    """get symbol of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = mx.sym.Variable("data") if code is None else code
+    net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
+    net = mx.sym.Activation(net, act_type='relu')
+    # 4 x 4
+    net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    net = deconv2d(
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    net = mx.sym.Activation(net, act_type='tanh')
+    return net
diff --git a/tests/python/frontend/mxnet/model_zoo/dqn.py b/tests/python/frontend/mxnet/model_zoo/dqn.py
new file mode 100644
index 000000000000..e037511efdf2
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/dqn.py
@@ -0,0 +1,27 @@
+"""
+The mxnet symbol of Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al.
+"Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+import mxnet as mx
+
+def get_symbol(num_action=18):
+    data = mx.sym.Variable(name='data')
+    net = mx.sym.Convolution(data, kernel=(8, 8), stride=(4, 4),
+                             num_filter=32, name='conv1')
+    net = mx.sym.Activation(net, act_type='relu', name='relu1')
+    net = mx.sym.Convolution(net, kernel=(4, 4), stride=(2, 2),
+                             num_filter=64, name='conv2')
+    net = mx.sym.Activation(net, act_type='relu', name='relu2')
+    net = mx.sym.Convolution(net, kernel=(3, 3), stride=(1, 1),
+                             num_filter=64, name='conv3')
+    net = mx.sym.Activation(net, act_type='relu', name='relu3')
+    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc4')
+    net = mx.sym.Activation(net, act_type='relu', name='relu4')
+    net = mx.sym.FullyConnected(net, num_hidden=num_action, name='fc5', flatten=False)
+
+    return net
diff --git a/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/tests/python/frontend/mxnet/model_zoo/mlp.py b/tests/python/frontend/mxnet/model_zoo/mlp.py
new file mode 100644
index 000000000000..922b208749bf
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/mlp.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+a simple multilayer perceptron
+"""
+import mxnet as mx
+
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.sym.Flatten(data=data)
+    try:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128, flatten=False)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, flatten=False)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes, flatten=False)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    except:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    return mlp
diff --git a/tests/python/frontend/mxnet/model_zoo/resnet.py b/tests/python/frontend/mxnet/model_zoo/resnet.py
new file mode 100644
index 000000000000..3f9a870d31c0
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/resnet.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+'''
+import mxnet as mx
+import numpy as np
+
+def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False):
+    """Return ResNet Unit symbol for building ResNet
+    Parameters
+    ----------
+    data : str
+        Input data
+    num_filter : int
+        Number of output channels
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+    stride : tuple
+        Stride used in convolution
+    dim_match : Boolean
+        True means channel number between input and output is the same, otherwise means differ
+    name : str
+        Base name of the operators
+    workspace : int
+        Workspace used in convolution operator
+    """
+    if bottle_neck:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
+        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
+        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
+                                   workspace=workspace, name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv3 + shortcut
+    else:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv2 + shortcut
+
+def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False):
+    """Return ResNet symbol of
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+    num_stages : int
+        Number of stage
+    filter_list : list
+        Channel size of each stage
+    num_classes : int
+        Ouput size of symbol
+    dataset : str
+        Dataset type, only cifar10 and imagenet supports
+    workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision (float32 or float16)
+    """
+    num_unit = len(units)
+    assert(num_unit == num_stages)
+    data = mx.sym.Variable(name='data')
+    if dtype == 'float32':
+        # data = mx.sym.identity(data=data, name='id')
+        data = data
+    else:
+        if dtype == 'float16':
+            data = mx.sym.Cast(data=data, dtype=np.float16)
+    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
+    (nchannel, height, width) = image_shape
+    if height <= 32:            # such as cifar10
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
+                                  no_bias=True, name="conv0", workspace=workspace)
+    else:                       # often expected to be 224 such as imagenet
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
+                                  no_bias=True, name="conv0", workspace=workspace)
+        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
+        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
+        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max')
+
+    for i in range(num_stages):
+        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
+                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace,
+                             memonger=memonger)
+        for j in range(units[i]-1):
+            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
+                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
+    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
+    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
+    flat = mx.sym.Flatten(data=pool1)
+    try:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', flatten=False)
+    except:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
+    if dtype == 'float16':
+        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
+    return mx.sym.softmax(data=fc1, name='softmax')
+
+def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    image_shape = [int(l) for l in image_shape.split(',')]
+    (nchannel, height, width) = image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+
+    return resnet(units       = units,
+                  num_stages  = num_stages,
+                  filter_list = filter_list,
+                  num_classes = num_classes,
+                  image_shape = image_shape,
+                  bottle_neck = bottle_neck,
+                  workspace   = conv_workspace,
+                  dtype       = dtype)
diff --git a/tests/python/frontend/mxnet/model_zoo/squeezenet.py b/tests/python/frontend/mxnet/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..deb896a21385
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/squeezenet.py
@@ -0,0 +1,76 @@
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+import mxnet as mx
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = mx.sym.concat(left, right, dim=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = mx.sym.Convolution(net, num_filter=channels, kernel=(kernel_size, kernel_size),
+                             pad=(padding, padding))
+    net = mx.sym.Activation(net, act_type='relu')
+    return net
+
+# Net
+def get_symbol(num_classes=1000, version='1.0', **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    net = mx.sym.Variable("data")
+    if version == '1.0':
+        net = mx.sym.Convolution(net, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 64, 256, 256)
+    else:
+        net = mx.sym.Convolution(net, num_filter=64, kernel=(3, 3), stride=(2, 2), pad=(1, 1))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256)
+    net = mx.sym.Dropout(net, p=0.5)
+    net = mx.sym.Convolution(net, num_filter=num_classes, kernel=(1, 1))
+    net = mx.sym.Activation(net, act_type='relu')
+    net = mx.sym.Pooling(data=net, global_pool=True, kernel=(13, 13), pool_type='avg')
+    net = mx.sym.flatten(net)
+    return mx.sym.softmax(net)
diff --git a/tests/python/frontend/mxnet/model_zoo/vgg.py b/tests/python/frontend/mxnet/model_zoo/vgg.py
new file mode 100644
index 000000000000..68215bb80aaa
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/vgg.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+
+import mxnet as mx
+import numpy as np
+
+def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs):
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1))
+            if batch_norm:
+                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1))
+        internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1))
+    return internel_layer
+
+def get_classifier(input_data, num_classes, **kwargs):
+    flatten = mx.sym.Flatten(data=input_data, name="flatten")
+    try:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6", flatten=False)
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7", flatten=False)
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8", flatten=False)
+    except:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6")
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7")
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8")
+    return fc8
+
+def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs):
+    """
+    Parameters
+    ----------
+    num_classes : int, default 1000
+        Number of classification classes.
+    num_layers : int
+        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+    batch_norm : bool, default False
+        Use batch normalization.
+    dtype: str, float32 or float16
+        Data precision.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data = mx.sym.Variable(name="data")
+    if dtype == 'float16':
+        data = mx.sym.Cast(data=data, dtype=np.float16)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    if dtype == 'float16':
+        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)
+    symbol = mx.sym.softmax(data=classifier, name='softmax')
+    return symbol
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
new file mode 100644
index 000000000000..81a12b041ed7
--- /dev/null
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -0,0 +1,214 @@
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime
+from tvm.relay.testing.config import ctx_list
+from tvm import relay
+import mxnet as mx
+
+from mxnet import gluon
+from mxnet.gluon.model_zoo import vision
+import model_zoo
+
+
+def verify_mxnet_frontend_impl(mx_symbol,
+                               data_shape=(1, 3, 224, 224),
+                               out_shape=(1, 1000),
+                               gluon_impl=False,
+                               name=None,
+                               dtype='float32'):
+    """Use name different from test to avoid let nose pick it up"""
+    if gluon_impl:
+        def get_gluon_output(name, x):
+            net = vision.get_model(name)
+            net.collect_params().initialize(mx.init.Xavier())
+            net_sym = gluon.nn.SymbolBlock(outputs=net(mx.sym.var('data')),
+                                           inputs=mx.sym.var('data'),
+                                           params=net.collect_params())
+            out = net_sym(mx.nd.array(x.astype(dtype))).asnumpy()
+            return out, net_sym
+    else:
+        def get_mxnet_output(symbol, x, dtype='float32'):
+            from collections import namedtuple
+            Batch = namedtuple('Batch', ['data'])
+            mod = mx.mod.Module(symbol, label_names=None)
+            mod.bind(data_shapes=[('data', x.shape)], for_training=False)
+            mod.init_params()
+            mod.forward(Batch([mx.nd.array(x.astype(dtype))]))
+            out = mod.get_outputs()[0].asnumpy()
+            args, auxs = mod.get_params()
+            return out, args, auxs
+
+    def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
+        shape_dict = {"data": x.shape}
+        if gluon_impl:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict)
+        else:
+            new_sym, params = relay.frontend.from_mxnet(symbol,
+                                                        shape_dict,
+                                                        arg_params=args,
+                                                        aux_params=auxs)
+        with relay.build_config(opt_level=3):
+            graph, lib, params = relay.build(new_sym, target, params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        # set inputs
+        m.set_input("data", tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+        # get outputs
+        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
+        return out.asnumpy()
+
+    # random input
+    x = np.random.uniform(size=data_shape)
+    if gluon_impl:
+        gluon_out, gluon_sym = get_gluon_output(name, x)
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+    else:
+        mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
+        assert "data" not in args
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mlp():
+    mlp = model_zoo.mx_mlp()
+    verify_mxnet_frontend_impl(mlp,
+                               data_shape=(1, 1, 28, 28),
+                               out_shape=(1, 10))
+
+def test_forward_vgg():
+    for n in [11]:
+        mx_sym = model_zoo.mx_vgg(n)
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_resnet():
+    for n in [18]:
+        mx_sym = model_zoo.mx_resnet(18)
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_elu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='elu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_rrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_prelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='prelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_softrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.Activation(data, act_type='softrelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_fc_flatten():
+    # test flatten=True option in mxnet 0.11.1
+    data = mx.sym.var('data')
+    try:
+        mx_sym = mx.sym.FullyConnected(data, num_hidden=100, flatten=True)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+        mx_sym = mx.sym.FullyConnected(mx.sym.Flatten(data), num_hidden=100, flatten=False)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+    except:
+        pass
+
+def test_forward_clip():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicity
+    mx_sym = mx.sym.clip(data, a_min=0, a_max=1)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_split():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1))
+
+def test_forward_split_squeeze():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1))
+
+def test_forward_expand_dims():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.expand_dims(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4))
+
+def test_forward_pooling():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+def test_forward_lrn():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
+    verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
+
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
+
+
+if __name__ == '__main__':
+    test_forward_mlp()
+    test_forward_vgg()
+    test_forward_resnet()
+    test_forward_elu()
+    test_forward_rrelu()
+    test_forward_prelu()
+    test_forward_softrelu()
+    test_forward_fc_flatten()
+    test_forward_clip()
+    test_forward_split()
+    test_forward_split_squeeze()
+    test_forward_expand_dims()
+    test_forward_pooling()
+    test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
diff --git a/tests/python/frontend/mxnet/test_graph.py b/tests/python/frontend/mxnet/test_graph.py
new file mode 100644
index 000000000000..c2bed8829b81
--- /dev/null
+++ b/tests/python/frontend/mxnet/test_graph.py
@@ -0,0 +1,101 @@
+import mxnet as mx
+from tvm import relay
+import model_zoo
+
+def compare_graph(f1, f2):
+    f1 = relay.ir_pass.infer_type(f1)
+    f2 = relay.ir_pass.infer_type(f2)
+    assert relay.ir_pass.alpha_equal(f1, f2)
+
+def test_mlp():
+    shape = {"data": (1, 1, 28, 28)}
+    mx_fun = model_zoo.mx_mlp()
+    from_mx_fun, _ = relay.frontend.from_mxnet(mx_fun, shape=shape)
+    relay_fun = model_zoo.relay_mlp()
+    compare_graph(from_mx_fun, relay_fun)
+
+
+def test_vgg():
+    shape = {"data": (1, 3, 224, 224)}
+    for n in [11, 13, 16, 19]:
+        mx_sym = model_zoo.mx_vgg(n)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape=shape)
+        relay_sym = model_zoo.relay_vgg(n)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_resnet():
+    shape = {"data": (1, 3, 224, 224)}
+    for n in [18, 34, 50, 101]:
+        mx_sym = model_zoo.mx_resnet(n)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape=shape)
+        relay_sym = model_zoo.relay_resnet(n)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_squeezenet():
+    shape = {"data": (1, 3, 224, 224)}
+    for version in ['1.0', '1.1']:
+        mx_sym = model_zoo.mx_squeezenet(version)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+        relay_sym = model_zoo.relay_squeezenet(version)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_inception_v3():
+    shape = {"data": (1, 3, 299, 299)}
+    mx_sym = model_zoo.mx_inception_v3()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_inception_v3()
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_dqn():
+    shape = {"data": (1, 4, 84, 84)}
+    mx_sym = model_zoo.mx_dqn()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_dqn()
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_dcgan():
+    shape = {"data": (2, 100)}
+    mx_sym = model_zoo.mx_dcgan()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_dcgan(batch_size=2)
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_multi_outputs():
+    xshape = (10, 27)
+    yshape = (10, 9)
+
+    def mx_compose(F, **kwargs):
+        x = F.sym.Variable("x")
+        y = F.sym.Variable("y")
+        z = F.sym.split(x, **kwargs)
+        return F.sym.broadcast_sub(F.sym.broadcast_add(z[0], z[2]), y)
+
+    def relay_compose(F, **kwargs):
+        x = F.var("x", shape=xshape)
+        y = F.var("y", shape=yshape)
+        z = F.split(x, **kwargs)
+        z = F.subtract(F.add(z[0], z[2]), y)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    mx_sym = mx_compose(mx, num_outputs=3, axis=1)
+    from_mx_sym, _ = relay.frontend.from_mxnet(
+        mx_sym, shape={"x":xshape, "y":yshape})
+    relay_sym = relay_compose(relay, indices_or_sections=3, axis=1)
+    compare_graph(from_mx_sym, relay_sym)
+
+
+if __name__ == "__main__":
+    test_mlp()
+    test_resnet()
+    test_vgg()
+    test_multi_outputs()
+    test_dqn()
+    test_dcgan()
+    test_squeezenet()
+    test_inception_v3()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 43c11c4509d1..806b63b7c6f5 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -115,7 +115,7 @@ def test_squeeze_bad_axes_infer_type():
 
 
 def test_reshape_infer_type():
-    n, t, d1, d2 = tvm.var("n"), tvm.var("t"), 100, 20
+    n, t, d1, d2 = 10, 20, 100, 20
     x = relay.var("x", relay.TensorType((n, t, d1, d2), "float32"))
     y = relay.reshape(x, newshape=(n, t, 2000))
     assert "newshape=" in y.astext()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 9bc62b2c0249..ef7f1221a70c 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -332,7 +332,7 @@ inline Tensor concatenate(const Array<Tensor>& inputs,
 * \return A Tensor whose op member is the split operation
 */
 inline Array<Tensor> split(const Tensor& x,
-                           Array<Expr> split_indices,
+                           Array<Integer> split_indices,
                            int axis,
                            std::string name = "tensor",
                            std::string tag = kInjective) {
@@ -342,14 +342,15 @@ inline Array<Tensor> split(const Tensor& x,
   CHECK_LT(axis, x->shape.size()) << "axis out of bounds";
 
   auto src_axis_size = static_cast<int>(GetConstInt(x->shape[axis]));
-
-  auto split_indices_val = GetConstIntValues(split_indices, "split_indices");
-  CHECK(std::is_sorted(split_indices_val.begin(), split_indices_val.end())) <<
-    "split_indices must be sorted";
-
   std::vector<int> begin_ids;
   begin_ids.push_back(0);
-  std::copy(split_indices_val.begin(), split_indices_val.end(), std::back_inserter(begin_ids));
+
+  for (Integer idx : split_indices) {
+    int val = static_cast<int>(idx->value);
+    CHECK_GT(val, begin_ids.back())
+        << "split_indices must be sorted";
+    begin_ids.push_back(val);
+  }
 
   Array< Array<Expr> > out_shapes;
   for (size_t i = 0; i < begin_ids.size(); ++i) {
@@ -508,10 +509,10 @@ inline Tensor strided_slice(const Tensor& x,
 * \return A Tensor whose op member is the split operation
 */
 inline Array<Tensor> split_sections(const Tensor& x,
-                           int num_sections,
-                           int axis,
-                           std::string name = "tensor",
-                           std::string tag = kInjective) {
+                                    int num_sections,
+                                    int axis,
+                                    std::string name = "tensor",
+                                    std::string tag = kInjective) {
   if (axis < 0) {
     axis += static_cast<int>(x->shape.size());
   }
@@ -524,7 +525,7 @@ inline Array<Tensor> split_sections(const Tensor& x,
     << "num_sections must be an integer factor of the size of axis " << axis
     << " (" << src_axis_size << ")";
 
-  Array<Expr> split_indices;
+  Array<Integer> split_indices;
   auto seg_size = src_axis_size / num_sections;
   for (int i = 0; i < num_sections; ++i) {
     // region at index 0 is added by split()
diff --git a/topi/include/topi/vision/yolo/region.h b/topi/include/topi/vision/yolo/region.h
index 88553fc29b8a..7d303f445ac4 100644
--- a/topi/include/topi/vision/yolo/region.h
+++ b/topi/include/topi/vision/yolo/region.h
@@ -53,7 +53,7 @@ inline Tensor region(const Tensor &data,
                                      input_shape[2],
                                      input_shape[3]};
   auto data_block = reshape(data, intermediate_shape);
-  Array <Expr> split_indices;
+  Array <Integer> split_indices;
   for (int i = 1; i < split_size; ++i) {
     split_indices.push_back(i);
   }

From 2377f6d1c104ac8c0573c40478b1b11d677efae4 Mon Sep 17 00:00:00 2001
From: kice <wslikerqs@gmail.com>
Date: Sun, 25 Nov 2018 17:39:26 -0500
Subject: [PATCH 407/529] Fix str decoding error on non-English Windows (#2158)

---
 python/tvm/_ffi/base.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 0a91c41127e4..2579f22e44af 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -18,7 +18,11 @@
     numeric_types = integer_types + (float, np.float32)
     # this function is needed for python3
     # to convert ctypes.char_p .value back to python str
-    py_str = lambda x: x.decode('utf-8')
+    if sys.platform == "win32":
+        encoding = 'cp' + str(ctypes.cdll.kernel32.GetACP())
+        py_str = lambda x: x.decode(encoding)
+    else:
+        py_str = lambda x: x.decode('utf-8')
 else:
     string_types = (basestring,)
     integer_types = (int, long, np.int32)

From ed8725dbe619139209216263a0d71faee075c192 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 25 Nov 2018 16:50:47 -0800
Subject: [PATCH 408/529] [COMMUNITY] @phisiart -> Committer (#2165)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index cbdcf396e9b4..29b1c222455c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,6 +18,7 @@ We do encourage everyone to work anything they are interested in.
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
+- [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
 - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
 - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi

From 1a7e01e5e7817b2bc65d3658ad9a22cd4d110843 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 25 Nov 2018 18:27:29 -0800
Subject: [PATCH 409/529] [RELAY][OP] Move computes to cxx, enable concat as
 injective (#2166)

---
 .../relay/backend/graph_runtime_codegen.py    |  16 +-
 python/tvm/relay/frontend/mxnet.py            |   4 +-
 python/tvm/relay/op/_tensor.py                | 193 +-----------------
 src/relay/backend/compile_engine.cc           |  37 ++--
 src/relay/backend/interpreter.cc              |  27 ++-
 src/relay/op/tensor/binary.cc                 |  69 +++++--
 src/relay/op/tensor/unary.cc                  |  56 +++--
 tests/python/relay/test_op_level1.py          |  14 +-
 8 files changed, 145 insertions(+), 271 deletions(-)

diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
index 4351fea6b459..0da9b81269aa 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -236,18 +236,14 @@ def visit_call(self, call):
             self.lowered_funcs.add(loweredf)
 
         inputs = []
-        tuple_arg_count = 0
+        # flatten tuple in the call.
         for arg in call.args:
+            res = self.visit(arg)
             if isinstance(arg.checked_type, TupleType):
-                tuple_arg_count += 1
-            inputs.append(self.visit(arg))
-        # We need to specially handle tuple inputs and
-        # tuple output cases.
-        # Tuple input function(e.g. concat)
-        if tuple_arg_count:
-            assert len(call.args) == 1
-            assert isinstance(inputs[0], tuple)
-            inputs = list(inputs[0])
+                assert isinstance(res, tuple)
+                inputs += res
+            else:
+                inputs.append(res)
 
         inputs = [x.to_json() for x in inputs]
         op_name = cached_func.func_name
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 9d1bd0deffa9..b0b1e700987c 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -589,11 +589,11 @@ def from_mxnet(symbol,
         shape, dtype = _update_shape_dtype(shape, dtype, params)
         sym = _from_mxnet_impl(symbol, shape, dtype)
     elif isinstance(symbol, mx.gluon.HybridBlock):
-        if args_params is not None or aux_params is not None:
+        if arg_params is not None or aux_params is not None:
             raise ValueError("arg_params and aux_params ae not used when importing HybridBlock")
         params = {}
         for k, v in symbol.collect_params().items():
-            params[k] = tvm.nd.array(v.data().asnumpy())
+            params[k] = _nd.array(v.data().asnumpy())
         data = mx.sym.Variable("data")
         sym = symbol(data)
         shape, dtype = _update_shape_dtype(shape, dtype, params)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 4832a195f9e8..774e091baefc 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -5,223 +5,37 @@
 from .op import register_compute, register_schedule, register_pattern
 from .op import schedule_injective, OpPattern
 
+
 schedule_broadcast = schedule_injective
 schedule_elemwise = schedule_injective
 
-# log
-@register_compute("log")
-def log_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.log(inputs[0])]
-
 register_schedule("log", schedule_broadcast)
-
-# exp
-@register_compute("exp")
-def exp_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.exp(inputs[0])]
-
 register_schedule("exp", schedule_broadcast)
-
-# sqrt
-@register_compute("sqrt")
-def sqrt_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.sqrt(inputs[0])]
-
 register_schedule("sqrt", schedule_broadcast)
-
-# sigmoid
-@register_compute("sigmoid")
-def sigmoid_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.sigmoid(inputs[0])]
-
 register_schedule("sigmoid", schedule_broadcast)
-
-# floor
-@register_compute("floor")
-def floor_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.floor(inputs[0])]
-
 register_schedule("floor", schedule_broadcast)
-
-# ceil
-@register_compute("ceil")
-def ceil_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.ceil(inputs[0])]
-
 register_schedule("ceil", schedule_broadcast)
-
-# trunc
-@register_compute("trunc")
-def trunc_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.trunc(inputs[0])]
-
 register_schedule("trunc", schedule_broadcast)
-
-# round
-@register_compute("round")
-def round_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.round(inputs[0])]
-
 register_schedule("round", schedule_broadcast)
-
-# abs
-@register_compute("abs")
-def abs_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.abs(inputs[0])]
-
 register_schedule("abs", schedule_broadcast)
-
-# tanh
-@register_compute("tanh")
-def tanh_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.tanh(inputs[0])]
-
 register_schedule("tanh", schedule_broadcast)
-
-# negative
-@register_compute("negative")
-def negative_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 1
-    return [topi.negative(inputs[0])]
-
 register_schedule("negative", schedule_broadcast)
 
-# add
-@register_compute("add")
-def add_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.add(inputs[0], inputs[1])]
-
-register_schedule("add", schedule_injective)
-
-# subtract
-@register_compute("subtract")
-def subtract_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.subtract(inputs[0], inputs[1])]
-
+register_schedule("add", schedule_broadcast)
 register_schedule("subtract", schedule_broadcast)
-
-# multiply
-@register_compute("multiply")
-def multiply_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.multiply(inputs[0], inputs[1])]
-
 register_schedule("multiply", schedule_broadcast)
-
-# divide
-@register_compute("divide")
-def divide_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.divide(inputs[0], inputs[1])]
-
 register_schedule("divide", schedule_broadcast)
-
-# power
-@register_compute("power")
-def power_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.power(inputs[0], inputs[1])]
-
 register_schedule("power", schedule_injective)
-
-# mod
-@register_compute("mod")
-def mod_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.mod(inputs[0], inputs[1])]
-
 register_schedule("mod", schedule_broadcast)
-
-# equal
-@register_compute("equal")
-def equal_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.equal(inputs[0], inputs[1])]
-
 register_schedule("equal", schedule_broadcast)
-
-# not_equal
-@register_compute("not_equal")
-def not_equal_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.not_equal(inputs[0], inputs[1])]
-
 register_schedule("not_equal", schedule_broadcast)
-
-# less
-@register_compute("less")
-def less_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.less(inputs[0], inputs[1])]
-
 register_schedule("less", schedule_broadcast)
-
-# less equal
-@register_compute("less_equal")
-def less_equal_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.less_equal(inputs[0], inputs[1])]
-
 register_schedule("less_equal", schedule_broadcast)
-
-# greater
-@register_compute("greater")
-def greater_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.greater(inputs[0], inputs[1])]
-
 register_schedule("greater", schedule_broadcast)
-
-# greater equal
-@register_compute("greater_equal")
-def greater_equal_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.greater_equal(inputs[0], inputs[1])]
-
 register_schedule("greater_equal", schedule_broadcast)
-
-# maximum
-@register_compute("maximum")
-def maximum_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.maximum(inputs[0], inputs[1])]
-
 register_schedule("maximum_compute", schedule_injective)
-
-# minimum
-@register_compute("minimum")
-def minimum_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.minimum(inputs[0], inputs[1])]
-
 register_schedule("minimum", schedule_injective)
-
-# right shift
-@register_compute("right_shift")
-def right_shift_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.right_shift(inputs[0], inputs[1])]
-
 register_schedule("right_shift", schedule_injective)
-
-# left shift
-@register_compute("left_shift")
-def left_shift_compute(attrs, inputs, output_type, target):
-    assert len(inputs) == 2
-    return [topi.left_shift(inputs[0], inputs[1])]
-
 register_schedule("left_shift", schedule_injective)
 
 # zeros
@@ -273,5 +87,4 @@ def concatenate_compute(attrs, inputs, output_type, target):
     return [topi.concatenate(inputs, axis=attrs.axis)]
 
 register_schedule("concatenate", schedule_injective)
-# TODO(tqchen): renable concat as injective
-register_pattern("concatenate", OpPattern.OPAQUE)
+register_pattern("concatenate", OpPattern.INJECTIVE)
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index b10e9f2e2ea3..8cb1279a1435 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -56,30 +56,26 @@ class ScheduleGetter :
         Op::GetAttr<FTVMSchedule>("FTVMSchedule");
     auto cache_node = make_node<CachedFuncNode>();
     cache_node->target = target_;
-
-    if (prim_func->params.size() == 1 &&
-        prim_func->params[0]->checked_type().as<TupleTypeNode>()) {
-      // Handle tuple input type by flattening them.
-      // This is the current calling convention of tuple input.
+    for (Var param : prim_func->params) {
       Array<tvm::Tensor> inputs;
-      for (Type field : prim_func->params[0]->type_as<TupleTypeNode>()->fields) {
-        const auto* ttype = field.as<TensorTypeNode>();
-        CHECK(ttype != nullptr);
+      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
         tvm::Tensor tensor = tvm::placeholder(
             GetShape(ttype->shape), ttype->dtype);
         cache_node->inputs.push_back(tensor);
         inputs.push_back(tensor);
+      } else {
+        // flatten tuple of tensor type.
+        const auto* tuple_type = param->type_as<TupleTypeNode>();
+        for (Type field : tuple_type->fields) {
+          const auto* ttype = field.as<TensorTypeNode>();
+          CHECK(ttype != nullptr);
+          tvm::Tensor tensor = tvm::placeholder(
+              GetShape(ttype->shape), ttype->dtype);
+          cache_node->inputs.push_back(tensor);
+          inputs.push_back(tensor);
+        }
       }
-      memo_[prim_func->params[0]] = inputs;
-
-    } else {
-      for (Var param : prim_func->params) {
-        const auto* ttype = param->type_as<TensorTypeNode>();
-        tvm::Tensor tensor = tvm::placeholder(
-            GetShape(ttype->shape), ttype->dtype);
-        cache_node->inputs.push_back(tensor);
-        memo_[param] = Array<Tensor>({tensor});
-      }
+      memo_[param] = inputs;
     }
     readable_name_stream_ << "fused";
     cache_node->outputs = this->VisitExpr(prim_func->body);
@@ -161,8 +157,9 @@ class ScheduleGetter :
 
     int op_pattern = fpattern[op];
     if (op_pattern >= kCommReduce) {
-      CHECK(!master_op_.defined())
-          << "Two complicated op in a primitive function";
+      CHECK(!master_op_.defined() || master_op_patetrn_ < kCommReduce)
+          << "Two complicated op in a primitive function "
+          << " master=" << master_op_ << " current=" << op;
     }
     if (op_pattern >= master_op_patetrn_) {
       master_op_ = op;
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index db96a3ad4de1..5bef4a22f371 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -212,7 +212,7 @@ class Interpreter :
     // Marshal the arguments.
     // Handle tuple input/output by flattening them.
     size_t arg_len = 0;
-    for (size_t i = 0; i < args.size(); i++) {
+    for (size_t i = 0; i < args.size(); ++i) {
       if (args[i].as<TensorValueNode>()) {
         ++arg_len;
       } else {
@@ -242,22 +242,19 @@ class Interpreter :
         << context_ << ", but get " << arg_ctx;
     };
 
-    if (func->params.size() == 1 &&
-        func->params[0]->checked_type().as<TupleTypeNode>()) {
-      // handle tuple input.
-      const TupleValueNode* tuple = args[0].as<TupleValueNode>();
-      CHECK(tuple);
-      for (size_t i = 0; i < tuple->fields.size(); ++i) {
-        fset_input(i, tuple->fields[i]);
-      }
-    } else {
-      CHECK_EQ(num_inputs, args.size());
-      // Decide the target context.
-      // Primitive functions always sit in the same context.
-      for (size_t i = 0; i < args.size(); i++) {
-        fset_input(i, args[i]);
+    int arg_counter = 0;
+    for (Value arg : args) {
+      if (arg.as<TensorValueNode>()) {
+        fset_input(arg_counter++,  arg);
+      } else {
+        const TupleValueNode* tuple = arg.as<TupleValueNode>();
+        CHECK(tuple != nullptr);
+        for (size_t i = 0; i < tuple->fields.size(); ++i) {
+          fset_input(arg_counter++, tuple->fields[i]);
+        }
       }
     }
+
     // TVM's calling convention is that the final argument is the output
     // buffer. To preserve the illusion of being a functional language
     // we need to allocate space for the output buffer based on the
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 171824fcd3ae..3f28bd52cd4b 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -5,54 +5,75 @@
  */
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
+#include <topi/broadcast.h>
 #include "../type_relations.h"
 #include "../op_common.h"
 
 namespace tvm {
 namespace relay {
 
+#define RELAY_BINARY_COMPUTE(FTOPI)                        \
+  [] (const Attrs& attrs,                                  \
+      const Array<Tensor>& inputs,                         \
+      const Type& out_type,                                \
+      const Target& target) -> Array<Tensor> {             \
+    CHECK_EQ(inputs.size(), 2U);                           \
+    return {FTOPI(inputs[0], inputs[1])};                  \
+  }                                                        \
+
+
 // Addition
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "add")
 .describe("Elementwise add with with broadcasting")
-.set_support_level(1);
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::add));
 
 // Subtraction
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "subtract")
 .describe("Elementwise substract with broadcasting")
-.set_support_level(1);
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::subtract));
 
 // Right shift
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "right_shift")
 .describe("Elementwise right shift with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::right_shift));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "left_shift")
 .describe("Elementwise left shift with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::left_shift));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "maximum")
 .describe("Elementwise maximum of two tensors with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::maximum));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "minimum")
 .describe("Elementwise minimum of two tensors with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::minimum));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "divide")
 .describe("Elementwise divide with broadcasting")
-.set_support_level(1);
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::divide));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "multiply")
 .describe("Elementwise multiply with broadcasting")
-.set_support_level(1);
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::multiply));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "power")
 .describe("Elementwise power with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::power));
 
 RELAY_REGISTER_BINARY_OP("relay.op._make.", "mod")
 .describe("Elementwise mod with broadcasting")
-.set_support_level(1);
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::mod));
 
 // Comparisons
 #define RELAY_REGISTER_CMP_OP(OpName)                               \
@@ -70,22 +91,38 @@ RELAY_REGISTER_BINARY_OP("relay.op._make.", "mod")
 
 RELAY_REGISTER_CMP_OP("equal")
 .describe("Elementwise equal compare with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::equal));
+
+
 RELAY_REGISTER_CMP_OP("not_equal")
 .describe("Elementwise not equal with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::not_equal));
+
+
 RELAY_REGISTER_CMP_OP("less")
 .describe("Elementwise less than with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::less));
+
+
 RELAY_REGISTER_CMP_OP("less_equal")
 .describe("Elementwise less than or equal compare with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::less_equal));
+
+
 RELAY_REGISTER_CMP_OP("greater")
 .describe("Elementwise greater than compare with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::greater));
+
+
 RELAY_REGISTER_CMP_OP("greater_equal")
 .describe("Elementwise greater than or equal compare with broadcasting")
-.set_support_level(4);
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::greater_equal));
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 22f97e8f0d54..6c94fe2adcc2 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -5,12 +5,21 @@
  */
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
+#include <topi/elemwise.h>
 #include "../type_relations.h"
 #include "../op_common.h"
 
 namespace tvm {
 namespace relay {
 
+#define RELAY_UNARY_COMPUTE(FTOPI)                      \
+  [] (const Attrs& attrs,                               \
+      const Array<Tensor>& inputs,                      \
+      const Type& out_type,                             \
+      const Target& target) -> Array<Tensor> {          \
+    return {FTOPI(inputs[0])};                          \
+  }                                                     \
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "log")
 .describe(R"code(Returns the log input array, computed element-wise.
@@ -20,7 +29,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "log")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::log));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "exp")
 .describe(R"code(Returns the exp input array, computed element-wise.
@@ -30,7 +41,8 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "exp")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::exp));
 
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "sqrt")
@@ -41,7 +53,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "sqrt")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sqrt));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "zeros_like")
 .describe(R"code(Returns an array of zeros, with same type and shape as the input.
@@ -49,6 +63,7 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "zeros_like")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel);
 
+
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "ones_like")
 .describe(R"code(Returns an array of ones, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
@@ -63,13 +78,17 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "sigmoid")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sigmoid));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "copy")
 .describe(R"code(Copy a tensor.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::identity));
+
 
 // Clip
 struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
@@ -107,7 +126,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "floor")
 .describe(R"code(Returns the floor of input array, computed element-wise.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::floor));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "ceil")
 .describe(R"code(Returns the ceil of input array, computed element-wise.
@@ -117,7 +138,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "ceil")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::ceil));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "trunc")
 .describe(R"code(Returns the trunc of input array, computed element-wise.
@@ -127,7 +150,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "trunc")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::trunc));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "round")
 .describe(R"code(Returns the round of input array, computed element-wise.
@@ -137,7 +162,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "round")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::round));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "abs")
 .describe(R"code(Returns the abs of input array, computed element-wise.
@@ -147,7 +174,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "abs")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::abs));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "tanh")
 .describe(R"code(Returns the tanh of input array, computed element-wise.
@@ -157,7 +186,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "tanh")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::tanh));
+
 
 RELAY_REGISTER_UNARY_OP("relay.op._make.", "negative")
 .describe(R"code(Returns the numeric negative of input array, computed element-wise.
@@ -167,7 +198,8 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "negative")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel);
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::negative));
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index d28aa0a56941..6a1662b65170 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -188,20 +188,22 @@ def test_concatenate():
 
     x = relay.var("x", shape=(10, 5))
     y = relay.var("y", shape=(10, 5))
+    t = relay.var("z", shape=())
     z = relay.concatenate((x, y), axis=1)
-
+    z = relay.add(z, t)
     # Check result.
-    func = relay.Function([x, y], z)
+    func = relay.Function([x, y, t], z)
     x_data = np.random.rand(10, 5).astype('float32')
     y_data = np.random.rand(10, 5).astype('float32')
-    ref_res = np.concatenate((x_data, y_data), axis=1)
+    t_data = np.random.uniform(size=()).astype('float32')
+    ref_res = np.concatenate((x_data, y_data), axis=1) + t_data
 
     for target, ctx in ctx_list():
         intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
         intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
-        op_res1 = intrp1.evaluate(func)(x_data, y_data)
+        op_res1 = intrp1.evaluate(func)(x_data, y_data, t_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=0.01)
-        op_res2 = intrp2.evaluate(func)(x_data, y_data)
+        op_res2 = intrp2.evaluate(func)(x_data, y_data, t_data)
         tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=0.01)
 
 def test_dropout():
@@ -306,11 +308,11 @@ def test_dense():
 
 
 if __name__ == "__main__":
+    test_concatenate()
     test_bias_add()
     test_unary_op()
     test_binary_op()
     test_expand_dims_infer_type()
-    test_concatenate()
     test_expand_dims()
     test_softmax()
     test_log_softmax()

From 22950abb9575cd7adec126ab51ae49c0606d8ef2 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Mon, 26 Nov 2018 11:41:33 +0530
Subject: [PATCH 410/529] [RELAY]sch and compute for reduce ops (#2091)

---
 python/tvm/relay/op/_reduce.py       |  1 +
 tests/python/relay/test_op_level4.py | 60 +++++++++++++++++++++++-----
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py
index fd18c0e71d53..5c720256bbd6 100644
--- a/python/tvm/relay/op/_reduce.py
+++ b/python/tvm/relay/op/_reduce.py
@@ -15,5 +15,6 @@ def _schedule_reduce(_, outs, target):
 _reg.register_schedule("argmin", _schedule_reduce)
 _reg.register_schedule("sum", _schedule_reduce)
 _reg.register_schedule("max", _schedule_reduce)
+_reg.register_schedule("min", _schedule_reduce)
 _reg.register_schedule("prod", _schedule_reduce)
 _reg.register_schedule("mean", _schedule_reduce)
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index dd12dc7cff3a..e5da48f107eb 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -106,8 +106,11 @@ def test_where():
     assert zz.checked_type == relay.TensorType((3, 4), "float32")
 
 
-def verify_reduce(test_func, data, axis, keepdims, exclude, output):
-    x = relay.var("x", relay.TensorType(data, "float32"))
+def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32"):
+    test_func = funcs[0]
+    ref_func = funcs[1]
+
+    x = relay.var("x", relay.TensorType(data, dtype))
     z = test_func(x, axis, keepdims, exclude)
     zz = relay.ir_pass.infer_type(z)
     if axis:
@@ -116,25 +119,60 @@ def verify_reduce(test_func, data, axis, keepdims, exclude, output):
         assert "keepdims=" in z.astext()
     if exclude:
         assert "exclude=" in z.astext()
-    out_type = "int32" if test_func in [relay.argmin, relay.argmax] else "float32"
+    out_type = "int32" if test_func in [relay.argmin, relay.argmax] else dtype
     assert zz.checked_type == relay.ty.TensorType(output, out_type)
 
+    if all(isinstance(v, tvm.expr.Var) == 1 for v in data) or len(output) == 0:
+        return
+
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(size=data).astype(dtype)
+    if ref_func in [np.sum]:
+        ref_res = ref_func(x_data + 0, axis=axis, dtype=dtype, keepdims=keepdims)
+    elif ref_func in [np.max, np.min, np.mean, np.prod]:
+        ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
+    else: #argmin/argmax
+        if axis and len(axis) > 1:
+            return
+        ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
 def test_reduce_functions():
+    def _with_keepdims(func):
+        def _wrapper(data, axis=None, keepdims=False):
+            if not keepdims:
+                return func(data, axis=axis)
+            else:
+                if axis is not None:
+                    axis = axis[0]
+                    out_shape = list(data.shape)
+                    out_shape[axis] = 1
+                else:
+                    out_shape = [1 for _ in range(len(data.shape))]
+                return func(data, axis=axis).reshape(out_shape)
+        return _wrapper
+
     d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
-    for func in [relay.sum,
-                 relay.max,
-                 relay.min,
-                 relay.mean,
-                 relay.prod,
-                 relay.argmin,
-                 relay.argmax]:
+    for func in [[relay.sum, np.sum],
+                 [relay.max, np.max],
+                 [relay.min, np.min],
+                 [relay.mean, np.mean],
+                 [relay.prod, np.prod],
+                 [relay.argmin, _with_keepdims(np.argmin)],
+                 [relay.argmax, _with_keepdims(np.argmax)]]:
         verify_reduce(func, (d1, d2, d3, d4), (2,), True, False, (d1, d2, 1, d4))
         verify_reduce(func, (d1, d2, d3), (1,), True, False, (d1, 1, d3))
         verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1))
         verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3))
         verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4))
         verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ())
-        verify_reduce(func, (4, 4, 3), None, True, False, (1, 1, 1))
         verify_reduce(func, (4, 4, 3), None, False, True, ())
         verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,))
         verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,))

From 9b77dffcf25f6d83f097a430466abc7565e68fbf Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Mon, 26 Nov 2018 18:25:17 +0000
Subject: [PATCH 411/529] [PASS] PostOrderVisit (#2169)

---
 include/tvm/relay/attrs/transform.h | 13 +++++++++++++
 include/tvm/relay/expr_functor.h    |  8 ++++++++
 python/tvm/relay/ir_pass.py         | 13 +++++++++++++
 src/relay/ir/expr_functor.cc        | 30 +++++++++++++++++++++++++++++
 src/relay/op/tensor/unary.cc        | 16 +++------------
 5 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 39cd82de83e2..3e56106df0c2 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -151,6 +151,19 @@ struct SliceLikeAttrs : public tvm::AttrsNode<SliceLikeAttrs> {
   }
 };
 
+// Clip
+struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
+  double a_min;
+  double a_max;
+
+  TVM_DECLARE_ATTRS(ClipAttrs, "relay.attrs.ClipAttrs") {
+  TVM_ATTR_FIELD(a_min)
+    .describe("The minimum clip value.");
+  TVM_ATTR_FIELD(a_max)
+    .describe("The maximum clip value.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 1681f9b87d2f..60b18218a313 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -182,6 +182,14 @@ class ExprMutator
   std::unordered_map<Expr, Expr, NodeHash, NodeEqual> memo_;
 };
 
+/*!
+ * \brief recursively visit the ir in post DFS order node, apply fvisit
+ * Each node is guaranteed to be visited only once.
+ * \param node The ir to be visited.
+ * \param fvisit The visitor function to be applied.
+ */
+void PostOrderVisit(const NodeRef& node, std::function<void(const NodeRef&)> fvisit);
+
 /*
  * \brief Bind function parameters or free variables.
  *
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index ef0a59cd3f6d..6297e366070f 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -10,6 +10,19 @@
 from .expr import Expr
 from .ty import Type
 
+def post_order_visit(expr, fvisit):
+    """Recursively visit the ir in post DFS order node,
+    apply fvisit. Each node is guaranteed to be visited
+    only once.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+    fvisit : function
+        The visitor function to be applied.
+    """
+    return _ir_pass.post_order_visit(expr, fvisit)
 
 def infer_type(expr, mod=None):
     """Infer the type of expr under the context of mod.
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 5e3ee1761c38..bacbfea7c063 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -228,6 +228,36 @@ void ExprVisitor::VisitExpr_(const TupleGetItemNode* op) {
 
 void ExprVisitor::VisitType(const Type& t) { return; }
 
+
+// visitor to implement apply
+class ExprApplyVisit : public ExprVisitor {
+ public:
+  explicit ExprApplyVisit(std::function<void(const Expr&)> f) : f_(f) {}
+  void VisitExpr(const Expr& e) final {
+    if (visited_.count(e.get()) != 0) return;
+    visited_.insert(e.get());
+    ExprVisitor::VisitExpr(e);
+    f_(e);
+  }
+
+ private:
+  std::function<void(const Expr&)> f_;
+  std::unordered_set<const Node*> visited_;
+};
+
+void PostOrderVisit(const Expr& e, std::function<void(const Expr&)> fvisit) {
+  ExprApplyVisit(fvisit).VisitExpr(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.post_order_visit")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    PackedFunc f = args[1];
+    PostOrderVisit(args[0], [f](const Expr& n) {
+        f(n);
+      });
+  });
+
+
 // Implement bind.
 class ExprBinder : public ExprMutator {
  public:
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 6c94fe2adcc2..fef0302a0507 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -5,6 +5,7 @@
  */
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
+#include <tvm/relay/attrs/transform.h>
 #include <topi/elemwise.h>
 #include "../type_relations.h"
 #include "../op_common.h"
@@ -89,19 +90,8 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "copy")
 .add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::identity));
 
-
-// Clip
-struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
-  double a_min;
-  double a_max;
-
-  TVM_DECLARE_ATTRS(ClipAttrs, "relay.attrs.ClipAttrs") {
-  TVM_ATTR_FIELD(a_min)
-    .describe("The minimum clip value.");
-  TVM_ATTR_FIELD(a_max)
-    .describe("The maximum clip value.");
-  }
-};
+// relay.clip
+TVM_REGISTER_NODE_TYPE(ClipAttrs);
 
 TVM_REGISTER_API("relay.op._make.clip")
   .set_body_typed<Expr(Expr, double, double)>([](Expr a, double a_min, double a_max) {

From 8b23838dc167c94fc7893bdf9a8249fa753b8159 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 26 Nov 2018 10:44:49 -0800
Subject: [PATCH 412/529] [RELAY] Add multiref trigger to ForwardRewrite
 (#2168)

---
 include/tvm/relay/pass.h          |  5 ++++-
 python/tvm/relay/expr.py          | 17 ++++++++++++++++
 src/relay/ir/expr.cc              |  8 ++++++++
 src/relay/pass/forward_rewrite.cc | 33 ++++++++++++++++++++++++++-----
 4 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 4410ed0d0de1..58e160eb4ac9 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -164,11 +164,14 @@ Expr FuseOps(const Expr& expr, int fuse_opt_level);
  * \param rewrite_map_attr_name The Op's attr name which corresponds to the rewrite
  *                              rule function.
  * \param fcontext Additional callback to provide context argument for each call node.
+ * \param fmulti_ref_trigger Transformation function to be called when
+ *                           an Expr consumed by multiple callers.
  * \return The rewritten expression.
  */
 Expr ForwardRewrite(const Expr& expr,
                     const std::string& rewrite_map_attr_name,
-                    std::function<NodeRef(const Call&)> fcontext = nullptr);
+                    std::function<NodeRef(const Call&)> fcontext = nullptr,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
 
 /*! \brief A hashing structure in the style of std::hash. */
 struct StructuralHash {
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index d71db0036f20..89a8a58fffa9 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -320,6 +320,23 @@ def __init__(self, tuple_value, index):
             _make.TupleGetItem, tuple_value, index)
 
 
+class TempExpr(Expr):
+    """Baseclass of all TempExpr.
+
+    TempExprs are pass specific expression that can be
+    useful to define intermediate result in the
+    rewriting pass such as layout or type transformation.
+    """
+    def realize(self):
+        """Convert the expression to a normal(non-temp) Expr.
+
+        Returns
+        -------
+        The corresponding normal expression.
+        """
+        return _expr.TempExprRealize(self)
+
+
 class ExprFunctor(object):
     """
     An abstract visitor defined over Expr.
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 43fdc68a4efe..e8b93ec5210d 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -258,5 +258,13 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
   p->stream << "TupleGetItemNode(" << node->tuple << ", " << node->index << ")";
 });
 
+
+TVM_REGISTER_API("relay._expr.TempExprRealize")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    TempExpr temp = args[0];
+    *ret = temp->Realize();
+});
+
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/forward_rewrite.cc b/src/relay/pass/forward_rewrite.cc
index 9c1e35782e92..7873db80c6b0 100644
--- a/src/relay/pass/forward_rewrite.cc
+++ b/src/relay/pass/forward_rewrite.cc
@@ -7,6 +7,7 @@
 #include <tvm/relay/pass.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/op_attr_types.h>
+#include "pass_util.h"
 
 namespace tvm {
 namespace relay {
@@ -42,13 +43,18 @@ class TempRealizer : private ExprMutator {
 class ForwardRewriter : private ExprMutator {
  public:
   ForwardRewriter(const OpMap<FForwardRewrite>& rewrite_map,
-                  std::function<NodeRef(const Call&)> fcontext)
+                  std::function<NodeRef(const Call&)> fcontext,
+                  std::function<Expr(const Expr&)> fmulti_ref_trigger)
       : rewrite_map_(rewrite_map),
-        fcontext_(fcontext) {
+        fcontext_(fcontext),
+        fmulti_ref_trigger_(fmulti_ref_trigger) {
   }
 
   // Transform expression.
   Expr Rewrite(Expr expr) {
+    if (fmulti_ref_trigger_ != nullptr) {
+      ref_counter_ = GetExprRefCount(expr);
+    }
     return this->VisitExpr(expr);
   }
 
@@ -57,6 +63,10 @@ class ForwardRewriter : private ExprMutator {
   const OpMap<FForwardRewrite>& rewrite_map_;
   // The context.
   std::function<NodeRef(const Call&)> fcontext_{nullptr};
+  // The multiple reference trigger
+  std::function<Expr(const Expr&)> fmulti_ref_trigger_{nullptr};
+  // Internal ref counter
+  std::unordered_map<const Node*, size_t> ref_counter_;
   // internal realizer
   TempRealizer realizer_;
 
@@ -67,7 +77,17 @@ class ForwardRewriter : private ExprMutator {
 
   // Visit and allow non-realized version.
   Expr GetTempExpr(const Expr& expr)  {
-    return ExprMutator::VisitExpr(expr);
+    if (fmulti_ref_trigger_ != nullptr) {
+      Expr ret = ExprMutator::VisitExpr(expr);
+      auto it = ref_counter_.find(expr.get());
+      CHECK(it != ref_counter_.end());
+      if (it->second > 1) {
+        ret = fmulti_ref_trigger_(ret);
+      }
+      return ret;
+    } else {
+      return ExprMutator::VisitExpr(expr);
+    }
   }
 
   // Automatic fold TupleGetItem.
@@ -124,9 +144,12 @@ class ForwardRewriter : private ExprMutator {
 
 Expr ForwardRewrite(const Expr& expr,
                     const std::string& rewrite_map_name,
-                    std::function<NodeRef(const Call&)> fcontext) {
+                    std::function<NodeRef(const Call&)> fcontext,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger) {
   auto rewrite_map = Op::GetAttr<FForwardRewrite>(rewrite_map_name);
-  return ForwardRewriter(rewrite_map, fcontext).Rewrite(expr);
+  return ForwardRewriter(rewrite_map,
+                         fcontext,
+                         fmulti_ref_trigger).Rewrite(expr);
 }
 }  // namespace relay
 }  // namespace tvm

From deeb5d389231ad63fb042ec2efced3f57ae705b5 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 27 Nov 2018 03:45:08 +0900
Subject: [PATCH 413/529] [Relay] Register compute and schedule for upsampling,
 with miscellaneous fixes (#2171)

---
 python/tvm/relay/base.py             | 12 +++++----
 python/tvm/relay/build_module.py     |  4 +--
 python/tvm/relay/op/nn/_nn.py        | 11 ++++++++-
 src/relay/op/nn/upsampling.cc        | 25 ++++++++++++++++++-
 src/relay/op/tensor/transform.cc     |  2 +-
 tests/python/relay/test_op_level2.py | 37 ++++++++++++++++++++++++++++
 6 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index 0feffeb809c5..169b07b41abf 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -26,11 +26,8 @@ class RelayNode(NodeBase):
     def astext(self, show_meta_data=True, annotate=None):
         """Get the text format of the expression.
 
-        Returns
-        -------
-        text : str
-            The text format of the expression.
-
+        Parameters
+        ----------
         show_meta_data : bool
             Whether to include meta data section in the text
             if there is meta data.
@@ -44,6 +41,11 @@ def astext(self, show_meta_data=True, annotate=None):
         meta data section is necessary to fully parse the text format.
         However, it can contain dumps that are big(constat weights),
         so it can be helpful to skip printing the meta data section.
+
+        Returns
+        -------
+        text : str
+            The text format of the expression.
         """
         return _expr.RelayPrint(self, show_meta_data, annotate)
 
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index d67bc89702d3..fe6cb28c9c72 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -274,8 +274,8 @@ def create_executor(kind="debug",
     kind : str
         The type of executor
 
-    mod : relay.Mod
-        The mod
+    mod : tvm.relay.Module
+        The Relay module containing collection of functions
 
     ctx : tvm.TVMContext
         The context to execute the code.
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 9c988b86e8bc..ebfda0ab4c50 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -76,7 +76,7 @@ def compute_conv2d(attrs, inputs, out_type, target):
         out = topi.nn.depthwise_conv2d_nchw(
             inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
     elif layout == "NHWC" and \
-         kernel_layout == "HWOI" and\
+         weight_layout == "HWOI" and\
          get_const_int(inputs[1].shape[2]) == groups and \
          get_const_int(inputs[1].shape[3]) == 1:
         out = topi.nn.depthwise_conv2d_nhwc(
@@ -242,3 +242,12 @@ def schedule_l2_normalize(attrs, outs, target):
         return topi.generic.schedule_l2_normalize(outs)
 
 reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+@reg.register_schedule("nn.upsampling")
+def schedule_upsampling(_, outs, target):
+    """Schedule definition of upsampling"""
+    with target:
+        return topi.generic.schedule_injective(outs)
+
+reg.register_pattern("nn.upsampling", OpPattern.INJECTIVE)
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
index ed7b8449eace..6a98d2884621 100644
--- a/src/relay/op/nn/upsampling.cc
+++ b/src/relay/op/nn/upsampling.cc
@@ -5,6 +5,9 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/op_attr_types.h>
+#include <topi/elemwise.h>
+#include <topi/nn/upsampling.h>
 #include "../layout.h"
 
 namespace tvm {
@@ -82,7 +85,27 @@ RELAY_REGISTER_OP("nn.upsampling")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("UpSampling", UpSamplingRel);
+.add_type_rel("UpSampling", UpSamplingRel)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+          const Array<Tensor>& inputs,
+          const Type& out_type,
+          const Target& target) {
+  const auto* param = attrs.as<UpSamplingAttrs>();
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(param != nullptr);
+  CHECK(param->layout == "NCHW" || param->layout == "NHWC");
+  CHECK(out_ttype != nullptr);
+  Array<IndexExpr> oshape;
+  if (param->layout == "NCHW") {
+    oshape.push_back(out_ttype->shape[2]);
+    oshape.push_back(out_ttype->shape[3]);
+  } else if (param->layout == "NHWC") {
+    oshape.push_back(out_ttype->shape[1]);
+    oshape.push_back(out_ttype->shape[2]);
+  }
+  return Array<Tensor>{ topi::nn::upsampling(inputs[0], oshape, param->layout, param->method)};
+});
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 83a4c9067f43..305219f56404 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1212,7 +1212,7 @@ bool SplitRel(const Array<Type>& types,
     auto indices = param->indices_or_sections.as<ArrayNode>()->data;
     auto begin = IndexExpr(make_zero(Int(32)));
     std::vector<Type> fields;
-    for (uint i = 0; i < indices.size(); ++i) {
+    for (unsigned int i = 0; i < indices.size(); ++i) {
       CHECK(reporter->Assert(IndexExpr(indices[i]) > begin))
           << "indices_or_sections need to be a sorted ascending list";
       std::vector<IndexExpr>&& oshape = AsVector(data->shape);
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index cd9321c5a91f..2060b44017d3 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -412,6 +412,42 @@ def test_batch_flatten():
         np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 
+def _test_upsampling(layout, method):
+    n, c, h, w = tvm.var("n"), 16, 32, 32
+    scale = 2
+    dtype = "float32"
+    def get_shape():
+        if layout == "NCHW":
+            return (c, h, w), (c, h*scale, w*scale)
+        else:
+            return (h, w, c), (h*scale, w*scale, c)
+    ishape, oshape = get_shape()
+    x = relay.var("x", relay.TensorType((n,) + ishape, dtype))
+    y = relay.nn.upsampling(x, scale=scale, layout=layout, method=method)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n,) + oshape, dtype)
+    dshape = (1,) + ishape
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.upsampling(x, scale=scale, layout=layout, method=method)
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    if method == "NEAREST_NEIGHBOR":
+        ref = topi.testing.upsampling_python(data, scale, layout)
+    else:
+        ref = topi.testing.bilinear_resize_python(data, (h*scale, w*scale), layout)
+    for target, ctx in ctx_list():
+        executor = relay.create_executor("graph", ctx=ctx, target=target)
+        out = executor.evaluate(func)(data)
+        tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5)
+
+
+def test_upsampling():
+    _test_upsampling("NCHW", "NEAREST_NEIGHBOR")
+    _test_upsampling("NCHW", "BILINEAR")
+    _test_upsampling("NHWC", "NEAREST_NEIGHBOR")
+    _test_upsampling("NHWC", "BILINEAR")
+
+
 if __name__ == "__main__":
     test_pool2d()
     test_avg_pool2d_no_count_pad()
@@ -425,3 +461,4 @@ def test_batch_flatten():
     test_conv2d_transpose_run()
     test_conv2d_run()
     test_batch_flatten()
+    test_upsampling()

From e29021e1ee5f8e6c1bcc035dea8cde1ab65b0f28 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 27 Nov 2018 00:17:08 +0530
Subject: [PATCH 414/529] [RELAY]take and transpose comp and schd (#2135)

---
 nnvm/src/top/tensor/transform.cc     |  2 +-
 python/tvm/relay/op/_transform.py    |  2 ++
 src/relay/op/tensor/transform.cc     | 31 ++++++++++++++++--
 tests/python/relay/test_op_level3.py | 47 ++++++++++++++++++++++++++++
 topi/include/topi/transform.h        | 39 ++++++++++++-----------
 5 files changed, 100 insertions(+), 21 deletions(-)

diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 6d8b75118a77..9d259ae77d9b 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -874,7 +874,7 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed);
-    auto axes = ShapeToArray(param.axes);
+    auto axes = ShapeToIntArray(param.axes);
     return Array<Tensor>{ topi::transpose(inputs[0], axes) };
 })
 .set_attr<FGradient>(
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index d087526b7b88..8726db55f8c1 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -15,3 +15,5 @@
 _reg.register_schedule("strided_slice", schedule_injective)
 _reg.register_schedule("slice_like", schedule_injective)
 _reg.register_schedule("split", schedule_injective)
+_reg.register_schedule("take", schedule_injective)
+_reg.register_schedule("transpose", schedule_injective)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 305219f56404..689c9c9bb8d7 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -282,6 +282,15 @@ bool TransposeRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> TransposeCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  const auto* param = attrs.as<TransposeAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{ topi::transpose(inputs[0], param->axes) };
+}
+
 Expr MakeTranspose(Expr data,
                    Array<Integer> axes) {
   auto attrs = make_node<TransposeAttrs>();
@@ -307,7 +316,9 @@ RELAY_REGISTER_OP("transpose")
 .set_attrs_type_key("relay.attrs.TransposeAttrs")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(3)
-.add_type_rel("Transpose", TransposeRel);
+.add_type_rel("Transpose", TransposeRel)
+.set_attr<FTVMCompute>("FTVMCompute", TransposeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 /* relay.reshape */
 
@@ -575,6 +586,19 @@ bool TakeRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> TakeCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const auto* param = attrs.as<TakeAttrs>();
+  CHECK(param != nullptr);
+  if (!param->axis.defined()) {
+    return Array<Tensor>{ topi::take(inputs[0], inputs[1]) };
+  } else {
+    return Array<Tensor>{ topi::take(inputs[0], inputs[1], param->axis) };
+  }
+}
+
 Expr MakeTake(Expr data,
               Expr indices,
               Integer axis) {
@@ -617,7 +641,10 @@ Examples::
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("indices", "Tensor", "The indices tensor.")
 .set_support_level(2)
-.add_type_rel("Take", TakeRel);
+.add_type_rel("Take", TakeRel)
+.set_attr<FTVMCompute>("FTVMCompute", TakeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
 
 // Init ops
 TVM_REGISTER_NODE_TYPE(InitOpAttrs);
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 806b63b7c6f5..99d7b4f95de5 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -87,6 +87,22 @@ def test_transpose_infer_type():
     assert yy.checked_type == relay.TensorType(
         (t, n, 100), "float32")
 
+def test_transpose():
+    def verify_transpose(dshape, axes):
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.transpose(x, axes=axes)
+
+        func = relay.Function([x], z)
+        x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
+        ref_res = np.transpose(x_data, axes=axes)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_transpose((2, 3, 4), (0, 2, 1))
+
 
 def test_squeeze_infer_type():
     n, t, d = 1, 4, 1
@@ -202,6 +218,35 @@ def verify_take(dshape, indices_shape, oshape, axis=None):
     verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1)
     verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2)
 
+def test_take():
+    def verify_take(src_shape, indices_src, axis=None):
+        src_dtype = "float32"
+        indices_dtype = "int32"
+        indices_src = np.array(indices_src, dtype=indices_dtype)
+        x = relay.var("x", relay.TensorType(src_shape, src_dtype))
+        indices = relay.var("indices", relay.TensorType(indices_src.shape, indices_dtype))
+        z = relay.take(x, indices, axis=axis)
+
+        func = relay.Function([x, indices], z)
+        x_data = np.random.uniform(low=-1, high=1, size=src_shape).astype(src_dtype)
+        ref_res = np.take(x_data, indices=indices_src, axis=axis)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, indices_src)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+    verify_take((4,), [1])
+    verify_take((4,), [[0,1,2,3]])
+    verify_take((3,3,3), [[11,25]])
+    verify_take((4,), [[0,1],[2,3]])
+    verify_take((4,), [1], 0)
+    verify_take((2,2), [[[1,0],[0,1]]], 0)
+    verify_take((2,2), [[[1,0],[0,1]]], 1)
+    verify_take((4,3,5,6), [[2,1,0,0]], -2)
+
+
 def test_split_infer_type():
     def verify_split(dshape, indices_or_sections, ret_type, axis=None):
         x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
@@ -360,11 +405,13 @@ def test_infer_type_prelu():
     test_unary_identity()
     test_clip()
     test_transpose_infer_type()
+    test_transpose()
     test_reshape_infer_type()
     test_reshape()
     test_reshape_like_infer_type()
     test_reshape_like()
     test_take_infer_type()
+    test_take()
     test_full()
     test_full_like()
     test_infer_type_leaky_relu()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index ef7f1221a70c..835f4272b940 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -86,42 +86,45 @@ inline Tensor expand_dims(const Tensor& x,
 * \return A Tensor whose op member is the transpose operation
 */
 inline Tensor transpose(const Tensor& x,
-                        Array<Expr> axes,
+                        Array<Integer> axes,
                         std::string name = "tensor",
                         std::string tag = kInjective) {
-  if (axes.size() == 0) {
-    axes = Array<Expr>();
+  if (!axes.defined() || axes.size() == 0) {
+    axes = Array<Integer>();
     for (int i = static_cast<int>(x->shape.size()) - 1; i >= 0; --i) {
       axes.push_back(i);
     }
   }
 
-  auto axes_val = GetConstIntValues(axes, "axes");
-  for (size_t i = 0; i < axes_val.size(); ++i) {
-    int axis = axes_val[i];
-    if (axes_val[i] < 0) {
-      axes_val[i] = static_cast<int>(x->shape.size()) + axes_val[i];
+  Array<Expr> new_shape;
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int axis = static_cast<int>(axes[i]->value);
+    int new_axis = axis;
+    if (axis < 0) {
+      new_axis = static_cast<int>(x->shape.size()) + axis;
+      axes.Set(i, new_axis);
     }
-    CHECK((0 <= axes_val[i]) && (axes_val[i] < static_cast<int>(x->shape.size())))
+    CHECK((new_axis >= 0) && (new_axis < static_cast<int>(x->shape.size())))
       << "axis=" << axis << " is invalid for the "
       << static_cast<int>(x->shape.size()) << "-dimensional input tensor";
 
-    CHECK(1 == std::count(std::begin(axes_val), std::end(axes_val), axes_val[i]))
-      << "repeated axis in transpose";
+    for (size_t j = 0; j < axes.size(); ++j) {
+      if (i !=j) {
+        CHECK(new_axis != static_cast<int>(axes[j]->value)) << "repeated axis in transpose";
+      }
+    }
+    new_shape.push_back(x->shape[new_axis]);
   }
 
-  Array<Expr> new_shape;
-  for (size_t i = 0; i < axes_val.size(); ++i) {
-    new_shape.push_back(x->shape[axes_val[i]]);
-  }
   return compute(
     new_shape, [&](const Array<Var>& indices) {
       std::vector<Expr> idx;
-      for (size_t i = 0; i < axes_val.size(); ++i) {
+      for (size_t i = 0; i < axes.size(); ++i) {
         idx.push_back(1);
       }
-      for (size_t i = 0; i < axes_val.size(); ++i) {
-        idx[axes_val[i]] = indices[i];
+      for (size_t i = 0; i < axes.size(); ++i) {
+        int axis = static_cast<int>(axes[i]->value);
+        idx[axis] = indices[i];
       }
       return x(idx);
     }, name, tag);

From 5de489ddff70edf65fde2611b61dddd3ca51ce11 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 26 Nov 2018 17:11:19 -0500
Subject: [PATCH 415/529] [Relay] Densenet benchmark (#2154)

* Port densenet to Relay

* Invoke densenet test in __main()__

* Even the spacing in the IR text format tests

* Forgot to import densenet in init

* Correct reference to densenet in test
---
 python/tvm/relay/testing/__init__.py       |   1 +
 python/tvm/relay/testing/densenet.py       | 123 +++++++++++++++++++++
 tests/python/relay/test_ir_text_printer.py |   9 ++
 3 files changed, 133 insertions(+)
 create mode 100644 python/tvm/relay/testing/densenet.py

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 47a04b531922..f49013928748 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -10,4 +10,5 @@
 from . import inception_v3
 from . import squeezenet
 from . import vgg
+from . import densenet
 from .config import ctx_list
diff --git a/python/tvm/relay/testing/densenet.py b/python/tvm/relay/testing/densenet.py
new file mode 100644
index 000000000000..7abebc75ecee
--- /dev/null
+++ b/python/tvm/relay/testing/densenet.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=invalid-name, line-too-long
+"""
+Port of MxNet version of Densenet to Relay.
+https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/model_zoo/vision/densenet.py
+"""
+# pylint: enable=line-too-long
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def _make_dense_layer(data, growth_rate, bn_size, index):
+    """Single densenet layer."""
+    bn1 = layers.batch_norm_infer(data, name="batch_1_%s" % index)
+    relu1 = relay.nn.relu(bn1)
+    conv1 = layers.conv2d(relu1, channels=bn_size * growth_rate,
+                          kernel_size=(1, 1), name="conv2d_1_%s" % index)
+    bn2 = layers.batch_norm_infer(conv1, name="batch_2_" + index)
+    relu2 = relay.nn.relu(bn2)
+    conv2 = layers.conv2d(relu2, channels=growth_rate, kernel_size=(3, 3),
+                          padding=(1, 1), name="conv2d_2_%s" % index)
+    return conv2
+
+def _make_dense_block(data, num_layers, bn_size, growth_rate, index):
+    """Makes a block of dense layers of the specified size."""
+    layer_out = data
+    for i in range(num_layers):
+        layer_out = _make_dense_layer(layer_out, growth_rate, bn_size,
+                                      "(%s, %s)" % (index, i))
+    return layer_out
+
+def _make_transition(data, num_output_features, index):
+    """Transition between layers."""
+    bn = layers.batch_norm_infer(data, name="batch_t_%s" % index)
+    relu = relay.nn.relu(bn)
+    conv = layers.conv2d(relu, channels=num_output_features,
+                         kernel_size=(1, 1), name="conv_t_%s" % index)
+    return relay.nn.avg_pool2d(conv, pool_size=(2, 2), strides=(2, 2))
+
+def _make_dense_net(num_init_features, growth_rate, block_config,
+                    data_shape, data_dtype, bn_size=4, classes=1000):
+    """Builds up a densenet."""
+    data = relay.Var("data", relay.TensorType(data_shape, data_dtype)) # (bn_size, 3, 224, 224)))
+    conv1 = layers.conv2d(data, channels=num_init_features,
+                          kernel_size=(7, 7), strides=(2, 2), padding=(3, 3),
+                          name='conv1')
+    bn1 = layers.batch_norm_infer(conv1, name='batch1')
+    relu1 = relay.nn.relu(bn1)
+    mp = relay.nn.max_pool2d(relu1, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
+
+    num_features = num_init_features
+    layer_out = mp
+    for i, num_layers in enumerate(block_config):
+        layer_out = _make_dense_block(layer_out, num_layers, growth_rate, bn_size, i)
+        num_features = num_features + num_layers*growth_rate
+        if i != len(block_config) - 1:
+            layer_out = _make_transition(layer_out, num_features // 2, i)
+            num_features = num_features // 2
+    bn2 = layers.batch_norm_infer(layer_out, name='batch2')
+    relu2 = relay.nn.relu(bn2)
+    avg = relay.nn.avg_pool2d(relu2, pool_size=(7, 7))
+    flat = relay.nn.batch_flatten(avg)
+
+    ret = layers.dense_add_bias(flat, units=classes, name='dense')
+
+    return relay.Function(relay.ir_pass.free_vars(ret), ret)
+
+def get_workload(densenet_size=121, classes=1000, batch_size=4,
+                 image_shape=(3, 224, 224), dtype='float32'):
+    """Gets benchmark workload for densenet.
+
+    Parameters
+    ----------
+    densenet_size : int, optional (default 121)
+        Parameter for the network size. The supported sizes
+        are 121, 161, 169, and 201.
+
+    classes : int, optional (default 1000)
+        The number of classes.
+
+    batch_size : int, optional (detault 4)
+        The batch size for the network.
+
+    image_shape : shape, optional (default (3, 224, 224))
+        The shape of the input data.
+
+    dtype : data type, optional (default 'float32')
+        The data type of the input data.
+
+    Returns
+    -------
+    net: relay.Function
+        The computation graph representing densenet.
+
+    params : dict of str to NDArray
+        The benchmark paraeters.
+    """
+    specs = {121: (64, 32, [6, 12, 24, 16]),
+             161: (96, 48, [6, 12, 36, 24]),
+             169: (69, 32, [6, 12, 32, 32]),
+             201: (64, 32, [6, 12, 48, 32])}
+
+    num_init_features, growth_rate, block_config = specs[densenet_size]
+    data_shape = tuple([batch_size] + list(image_shape))
+    net = _make_dense_net(num_init_features, growth_rate, block_config,
+                          data_shape, dtype, batch_size, classes)
+    return create_workload(net)
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index f6a1236c89e6..624ef71ed870 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -107,14 +107,17 @@ def test_resnet():
     net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
     net.astext()
 
+
 def test_mobilenet():
     net, params = tvm.relay.testing.mobilenet.get_workload(batch_size=1)
     net.astext()
 
+
 def test_dqn():
     net, params = tvm.relay.testing.dqn.get_workload(batch_size=1)
     net.astext()
 
+
 def test_dcgan():
     net, params = tvm.relay.testing.dcgan.get_workload(batch_size=1)
     net.astext()
@@ -137,6 +140,11 @@ def test_vgg():
     net, params = tvm.relay.testing.vgg.get_workload(batch_size=1)
     net.astext()
 
+def test_densenet():
+    net, params = tvm.relay.testing.densenet.get_workload(batch_size=1)
+    net.astext()
+
+
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
@@ -147,6 +155,7 @@ def test_vgg():
     test_squeezenet()
     test_inception_v3()
     test_vgg()
+    test_densenet()
     test_func()
     test_env()
     test_meta_data()

From 56853d22f09dc132175bb69c20d5d03a7cd54de6 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 27 Nov 2018 08:26:55 +0800
Subject: [PATCH 416/529] [Relay][Pass] Fix CombineParallelConv2D (#2167)

---
 python/tvm/relay/build_module.py              |  2 +-
 src/relay/pass/combine_parallel_conv2d.cc     | 19 ++++++++++++-------
 .../test_pass_combine_parallel_conv2d.py      |  6 ++++--
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index fe6cb28c9c72..863ca063137f 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -13,9 +13,9 @@
 # List of optimization pass and level when switch on
 OPT_PASS_LEVEL = {
     "SimplifyInference": 0,
-    "CombineParallelConv2D": 4,
     "OpFusion": 1,
     "FoldConstant": 2,
+    "CombineParallelConv2D": 3,
     "FoldScaleAxis": 3,
 }
 
diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc
index 48d5d77990d6..e346aea518e9 100644
--- a/src/relay/pass/combine_parallel_conv2d.cc
+++ b/src/relay/pass/combine_parallel_conv2d.cc
@@ -37,7 +37,7 @@ using Group = std::vector<Branch>;
   Intermediate nodes have exactly one successor. It is possible that branches meet at a point,
   which should be handled in ParallelConv2DCombiner.
 
-          data
+         data
         /    \
     conv2d   conv2d
       |        |
@@ -47,17 +47,22 @@ using Group = std::vector<Branch>;
 class BranchGroupFinder : private ExprVisitor {
  public:
   std::vector<Group> Find(const Expr& expr) {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+
     this->VisitExpr(expr);
 
     std::vector<Group> groups;
     for (const auto& root : conv_roots_) {
-      const auto& convs = children_map_.at(root);
-      for (const CallNode* conv : convs) {
-        auto&& branch = CreateBranch(conv);
+      const auto& children = children_map_.at(root);
+      size_t ngroups = groups.size();
+      for (const CallNode* child : children) {
+        if (!child->op.same_as(conv2d)) continue;
+
+        auto&& branch = CreateBranch(child);
         // add the branch to a group, or create a new group
-        auto it = std::find_if(groups.begin(), groups.end(), [&](const Group& group) {
+        auto it = std::find_if(groups.begin() + ngroups, groups.end(), [&](const Group& group) {
           CHECK(!group.empty() && !group[0].empty());
-          return IsCompatibleConv2D(conv, group[0][0]);
+          return IsCompatibleConv2D(child, group[0][0]);
         });
         if (it != groups.end()) {
           it->push_back(branch);
@@ -108,7 +113,7 @@ class BranchGroupFinder : private ExprVisitor {
       const CallNode* call = it->second[0];
       auto pattern = fpattern[Downcast<Op>(call->op)];
       if (pattern <= kBroadcast) {
-        branch.push_back(it->second[0]);
+        branch.push_back(call);
         it = children_map_.find(GetRef<Expr>(branch.back()));
       } else {
         break;
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
index 31dfe095f682..6fea201d64c8 100644
--- a/tests/python/relay/test_pass_combine_parallel_conv2d.py
+++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py
@@ -11,7 +11,8 @@ def before(x, w1, w2, w3, w4):
         # y3 cannot be combined
         y3 = relay.nn.conv2d(x, w3)
         y4 = relay.nn.conv2d(x, w4)
-        y = relay.Tuple((y1, y2, y3, y4))
+        y5 = relay.nn.max_pool2d(x)
+        y = relay.Tuple((y1, y2, y3, y4, y5))
         return relay.Function(args, y)
 
     def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
@@ -24,7 +25,8 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
         y3 = relay.nn.conv2d(x, w3)
         y4 = relay.strided_slice(y, [0, channels1 + channels2],
                                  [None, channels1 + channels2 + channels4])
-        y = relay.Tuple((y1, y2, y3, y4))
+        y5 = relay.nn.max_pool2d(x)
+        y = relay.Tuple((y1, y2, y3, y4, y5))
         return relay.Function(args, y)
 
     def check(x_shape, channels1, channels2, channels3, channels4):

From e8f28d0fb2a99952cb23c9581e0369e9a9bad4c3 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 27 Nov 2018 11:40:31 +0530
Subject: [PATCH 417/529] [RELAY]full, full_like compute and schedule (#2170)

---
 python/tvm/relay/op/_transform.py    |  2 ++
 src/relay/op/tensor/transform.cc     | 23 ++++++++++++++--
 tests/python/relay/test_op_level3.py | 41 ++++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 8726db55f8c1..c1624028fe68 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -11,6 +11,8 @@
 _reg.register_schedule("expand_dims", schedule_broadcast)
 _reg.register_schedule("reshape", schedule_injective)
 _reg.register_schedule("reshape_like", schedule_injective)
+_reg.register_schedule("full", schedule_injective)
+_reg.register_schedule("full_like", schedule_injective)
 _reg.register_schedule("cast", schedule_broadcast)
 _reg.register_schedule("strided_slice", schedule_injective)
 _reg.register_schedule("slice_like", schedule_injective)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 689c9c9bb8d7..53741e666f38 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -673,6 +673,14 @@ bool FullRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> FullCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  return { topi::full(out_ttype->shape, out_ttype->dtype, inputs[0]()) };
+}
+
 Expr MakeFull(Expr fill_value,
               Array<IndexExpr> shape,
               DataType dtype) {
@@ -696,7 +704,9 @@ RELAY_REGISTER_OP("full")
 .set_num_inputs(1)
 .add_argument("fill_value", "double", "The value to fill.")
 .set_support_level(3)
-.add_type_rel("Full", FullRel);
+.add_type_rel("Full", FullRel)
+.set_attr<FTVMCompute>("FTVMCompute", FullCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
 
 bool InitOpRel(const Array<Type>& types,
                int num_inputs,
@@ -777,6 +787,13 @@ bool FullLikeRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> FullLikeCompute(const Attrs& attrs,
+                              const Array<Tensor>& inputs,
+                              const Type& out_type,
+                              const Target& target) {
+  return { topi::full_like(inputs[0], inputs[1]()) };
+}
+
 Expr MakeFullLike(Expr data,
                   Expr fill_value) {
   static const Op& op = Op::Get("full_like");
@@ -797,7 +814,9 @@ and type as the input array.
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("fill_value", "double", "Scalar value to fill.")
 .set_support_level(3)
-.add_type_rel("FullLike", FullLikeRel);
+.add_type_rel("FullLike", FullLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", FullLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
 
 // where operator
 bool WhereRel(const Array<Type>& types,
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 99d7b4f95de5..617b532a6a1f 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -293,7 +293,7 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None):
                      relay.ty.TensorType((d1, (d2-7), d3, d4), "float32")])),
                   axis=1)
 
-def test_full():
+def test_full_infer_type():
     # default settings: match input dtype
     x = relay.var("x", relay.TensorType((), "int8"))
     y = relay.full(x, ())
@@ -308,7 +308,22 @@ def test_full():
     assert yy.checked_type == relay.TensorType((1, 2), "int8")
 
 
-def test_full_like():
+def test_full():
+    def verify_full(fill_value, src_shape, dtype):
+        x = relay.var("x", relay.scalar_type(dtype))
+        z = relay.full(x, src_shape, dtype)
+        func = relay.Function([x], z)
+        ref_res = np.full(src_shape, fill_value)
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(fill_value)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_full(4, (1, 3, 4, 4), "int32")
+    verify_full(4.0, (1, 4), "float32")
+
+
+def test_full_like_infer_type():
     # concrete shape
     base = relay.var("base", relay.TensorType((1, 2, 3), "float32"))
     fill = relay.var("fill", relay.TensorType((), "float32"))
@@ -324,6 +339,26 @@ def test_full_like():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
+
+def test_full_like():
+    def verify_full_like(base, fill_value, dtype):
+        x_data = np.random.uniform(low=-1, high=1, size=base).astype(dtype)
+        x = relay.var("x", relay.TensorType(base, dtype))
+        y = relay.var("y", relay.scalar_type(dtype))
+        z = relay.full_like(x, y)
+
+        func = relay.Function([x, y], z)
+        ref_res = np.full_like(x_data, fill_value)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, fill_value)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_full_like((1, 3, 4, 4), 4, "int32")
+    verify_full_like((1, 1), 44.0, "float32")
+
+
 def test_infer_type_leaky_relu():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
@@ -412,7 +447,9 @@ def test_infer_type_prelu():
     test_reshape_like()
     test_take_infer_type()
     test_take()
+    test_full_infer_type()
     test_full()
+    test_full_like_infer_type()
     test_full_like()
     test_infer_type_leaky_relu()
     test_infer_type_prelu()

From a93369d937cbdf45e584f81a4caaf62b2673d471 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 26 Nov 2018 23:07:37 -0800
Subject: [PATCH 418/529] [RELAY][IR] Introduce IdNode to preserve var id
 across rewriting (#2178)

---
 include/tvm/relay/base.h              | 28 +++++++++++++++++++++++++++
 include/tvm/relay/expr.h              | 24 +++++++++++++++++------
 python/tvm/relay/base.py              |  7 +++++++
 python/tvm/relay/expr.py              |  6 ++++++
 src/relay/backend/compile_engine.cc   |  2 +-
 src/relay/ir/alpha_equal.cc           |  3 ++-
 src/relay/ir/base.cc                  |  4 ++--
 src/relay/ir/expr.cc                  | 15 ++++++++++----
 src/relay/ir/expr_functor.cc          |  2 +-
 src/relay/ir/hash.cc                  |  3 ++-
 src/relay/ir/text_printer.cc          |  2 +-
 tests/python/relay/test_type_infer.py |  1 +
 12 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
index 49e276b07c59..f72f557a9765 100644
--- a/include/tvm/relay/base.h
+++ b/include/tvm/relay/base.h
@@ -165,6 +165,34 @@ class RelayNode : public Node {
   TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
 };
 
+/*!
+ * \brief The unique identifier of variables.
+ *
+ * Id is like name to the variables,
+ * except that id is unique for each Var.
+ *
+ * \note Do not create Id directly, they are created in Var.
+ */
+class IdNode : public Node {
+ public:
+  /*!
+   * \brief The name of the variable,
+   *  this only acts as a hint to the user,
+   *  and is not used for equality.
+   */
+  std::string name_hint;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name_hint", &name_hint);
+  }
+
+  static constexpr const char* _type_key = "relay.Id";
+  TVM_DECLARE_NODE_TYPE_INFO(IdNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(Id, IdNode, NodeRef);
+
+
 struct Module;
 
 }  // namespace relay
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 887d28b0fa9f..469b73a1df10 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -124,18 +124,22 @@ RELAY_DEFINE_NODE_REF(Tuple, TupleNode, Expr);
  * Its semantics are similar to tvm.Var node used in TVM's low level
  * tensor expression language.
  *
- * \note Each Var is bind only once and is immutable/
+ * \note Each Var is bind only once and is immutable.
  */
 class Var;
 /*! \brief Container for Var */
 class VarNode : public ExprNode {
  public:
   /*!
-   * \brief The name of the variable,
-   *  this only acts as a hint to the user,
-   *  and is not used for equality.
+   * \brief The unique identifier of the Var.
+   *
+   * vid will be preserved for the same Var during type inference
+   * and other rewritings, while the VarNode might be recreated
+   * to attach additional information.
+   * This property can be used to keep track of parameter Var
+   * information across passes.
    */
-  std::string name_hint;
+  Id vid;
   /*!
    * \brief type annotaion of the variable.
    * This field records user provided type annotation of the Var.
@@ -143,8 +147,13 @@ class VarNode : public ExprNode {
    */
   Type type_annotation;
 
+  /*! \return The name hint of the variable */
+  const std::string& name_hint() const {
+    return vid->name_hint;
+  }
+
   void VisitAttrs(tvm::AttrVisitor* v) final {
-    v->Visit("name_hint", &name_hint);
+    v->Visit("vid", &vid);
     v->Visit("type_annotation", &type_annotation);
     v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
@@ -153,6 +162,9 @@ class VarNode : public ExprNode {
   TVM_DLL static Var make(std::string name_hint,
                           Type type_annotation);
 
+  TVM_DLL static Var make(Id vid,
+                          Type type_annotation);
+
   static constexpr const char* _type_key = "relay.Var";
   TVM_DECLARE_NODE_TYPE_INFO(VarNode, ExprNode);
 };
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index 169b07b41abf..83aa4ec2cdd0 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -54,3 +54,10 @@ def astext(self, show_meta_data=True, annotate=None):
 class Span(RelayNode):
     def __init__(self, source, lineno, col_offset):
         self.__init_handle_by_constructor__(_make.Span, source, lineno, col_offset)
+
+
+@register_relay_node
+class Id(NodeBase):
+    """Unique identifier(name) for Var across type checking."""
+    def __init__(self):
+        raise RuntimeError("Cannot directly construct Id")
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 89a8a58fffa9..4725c0a7a07d 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -166,6 +166,12 @@ def __init__(self, name_hint, type_annotation=None):
         self.__init_handle_by_constructor__(
             _make.Var, name_hint, type_annotation)
 
+    @property
+    def name_hint(self):
+        """Get name hint of the current var."""
+        name = self.vid.name_hint
+        return name
+
 
 @register_relay_node
 class GlobalVar(Expr):
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 8cb1279a1435..17a5b60b322e 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -99,7 +99,7 @@ class ScheduleGetter :
   }
 
   Array<Tensor> VisitExpr_(const VarNode* op) final {
-    LOG(FATAL) << "Free variable " << op->name_hint;
+    LOG(FATAL) << "Free variable " << op->name_hint();
     return {};
   }
 
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index 8409581b53bf..873210321bf3 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -240,8 +240,9 @@ class AlphaEqualHandler:
   }
 
   bool VisitExpr_(const VarNode* lhs, const Expr& other) final {
+    // This function will only be triggered if we are matching free variables.
     if (const VarNode* rhs = other.as<VarNode>()) {
-      if (lhs->name_hint != rhs->name_hint) return false;
+      if (lhs->name_hint() != rhs->name_hint()) return false;
       if (!TypeEqual(lhs->type_annotation, rhs->type_annotation)) return false;
       return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
     } else {
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
index 1f73f297f99a..06593b6420f5 100644
--- a/src/relay/ir/base.cc
+++ b/src/relay/ir/base.cc
@@ -64,7 +64,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
               << node->col_offset << ")";
   });
 
+TVM_REGISTER_NODE_TYPE(IdNode);
+
 }  // namespace relay
 }  // namespace tvm
-
-
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index e8b93ec5210d..6f1260b05b99 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -63,23 +63,30 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
     p->stream << "Tuple(" << node->fields << ")";
   });
 
-Var VarNode::make(std::string name_hint, Type type_annotation) {
+
+Var VarNode::make(Id vid, Type type_annotation) {
   NodePtr<VarNode> n = make_node<VarNode>();
-  n->name_hint = std::move(name_hint);
+  n->vid = std::move(vid);
   n->type_annotation = std::move(type_annotation);
   return Var(n);
 }
 
+Var VarNode::make(std::string name_hint, Type type_annotation) {
+  NodePtr<IdNode> n = make_node<IdNode>();
+  n->name_hint = std::move(name_hint);
+  return VarNode::make(Id(n), type_annotation);
+}
+
 TVM_REGISTER_NODE_TYPE(VarNode);
 
 TVM_REGISTER_API("relay._make.Var")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = VarNode::make(args[0], args[1]);
+    *ret = VarNode::make(args[0].operator std::string(), args[1]);
   });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<VarNode>([](const VarNode* node, tvm::IRPrinter* p) {
-    p->stream << "Var(" << node->name_hint;
+    p->stream << "Var(" << node->name_hint();
     if (node->type_annotation.defined()) {
       p->stream << ", ty=";
       p->print(node->type_annotation);
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index bacbfea7c063..c1719e81a6c6 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -30,7 +30,7 @@ Expr ExprMutator::VisitExpr_(const VarNode* op) {
   if (op->type_annotation.defined()) {
     auto type = this->VisitType(op->type_annotation);
     if (!op->type_annotation.same_as(type)) {
-      return VarNode::make(op->name_hint, type);
+      return VarNode::make(op->vid, type);
     }
   }
   // default case return self.
diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
index 4fd91256db9c..d7a8df98fa3f 100644
--- a/src/relay/ir/hash.cc
+++ b/src/relay/ir/hash.cc
@@ -202,7 +202,8 @@ class RelayHashHandler:
   }
 
   size_t VisitExpr_(const VarNode* var) final {
-    size_t name_hash = std::hash<std::string>()(var->name_hint);
+    // hash free variable
+    size_t name_hash = std::hash<const Node*>()(var->vid.get());
     return Combine(name_hash, TypeHash(var->type_annotation));
   }
 
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 5e97ce1010ad..2664c475608b 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -690,7 +690,7 @@ class TextPrinter :
    * \return The corresponding name.
    */
   TextValue AllocVarName(const Var& var) {
-    std::string name = var->name_hint;
+    std::string name = var->name_hint();
     // always make sure first name is alpha
     if (name.length() != 0 && !std::isalpha(name[0])) {
       name = "%v" + name;
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index c1f06ccc763a..06cb19639dcf 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -141,6 +141,7 @@ def test_free_expr():
     y = relay.add(x, x)
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.scalar_type("float32")
+    assert x.vid.same_as(yy.args[0].vid)
 
 
 def test_type_args():

From f7f54e0a12851babd365be586694ae096d9fd5ed Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 28 Nov 2018 07:19:02 +0530
Subject: [PATCH 419/529] [Relay]resize op compute and schedule (#2172)

---
 python/tvm/relay/op/image/__init__.py |  1 +
 python/tvm/relay/op/image/_image.py   |  7 ++++++
 src/relay/op/image/resize.cc          | 30 +++++++++++++++++++++++++-
 tests/python/relay/test_op_level5.py  | 31 +++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/relay/op/image/_image.py

diff --git a/python/tvm/relay/op/image/__init__.py b/python/tvm/relay/op/image/__init__.py
index 9d1415b1dca4..5fa5c01575e0 100644
--- a/python/tvm/relay/op/image/__init__.py
+++ b/python/tvm/relay/op/image/__init__.py
@@ -2,3 +2,4 @@
 """Image network related operators."""
 from __future__ import absolute_import as _abs
 from .image import *
+from ._image import *
diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py
new file mode 100644
index 000000000000..e44748372374
--- /dev/null
+++ b/python/tvm/relay/op/image/_image.py
@@ -0,0 +1,7 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+from ..op import  register_schedule, schedule_injective
+
+# resize
+register_schedule("image.resize", schedule_injective)
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
index bfa2ea4cdfa5..e6efcb8ce459 100644
--- a/src/relay/op/image/resize.cc
+++ b/src/relay/op/image/resize.cc
@@ -5,7 +5,10 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/image.h>
+#include <topi/elemwise.h>
+#include <topi/image/resize.h>
 #include "../layout.h"
+#include "../op_common.h"
 
 namespace tvm {
 namespace relay {
@@ -40,6 +43,29 @@ bool ResizeRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> ResizeCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  const auto* param = attrs.as<ResizeAttrs>();
+  CHECK(param != nullptr);
+  CHECK(param->layout == "NCHW" || param->layout == "NHWC");
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  Array<IndexExpr> oshape;
+  if (param->layout == "NCHW") {
+    oshape.push_back(out_ttype->shape[2]);
+    oshape.push_back(out_ttype->shape[3]);
+  } else if (param->layout == "NHWC") {
+    oshape.push_back(out_ttype->shape[1]);
+    oshape.push_back(out_ttype->shape[2]);
+  }
+  return Array<Tensor>{ topi::image::resize(inputs[0],
+                                            oshape,
+                                            param->layout,
+                                            param->align_corners,
+                                            param->method) };
+}
 
 // Positional relay function to create image operator
 // used by frontend FFI.
@@ -82,7 +108,9 @@ RELAY_REGISTER_OP("image.resize")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(5)
-.add_type_rel("Resize", ResizeRel);
+.add_type_rel("Resize", ResizeRel)
+.set_attr<FTVMCompute>("FTVMCompute", ResizeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 0bd7a4816a1b..77e3f005dade 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -1,7 +1,10 @@
 """ Support level5 operator test cases.
 """
+import numpy as np
 import tvm
 from tvm import relay
+from tvm.relay.testing import ctx_list
+import topi.testing
 
 def test_resize_infer_type():
     n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
@@ -17,6 +20,33 @@ def test_resize_infer_type():
     zz = relay.ir_pass.infer_type(z)
     assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
 
+def test_resize():
+    def verify_resize(dshape, scale, method, layout):
+        if layout == "NHWC":
+            size = (dshape[1] * scale, dshape[2] * scale)
+        else:
+            size = (dshape[2] * scale, dshape[3] * scale)
+
+        x_data = np.random.uniform(size=dshape).astype("float32")
+        if method == "BILINEAR":
+            ref_res = topi.testing.bilinear_resize_python(x_data, size, layout)
+        else:
+            ref_res = topi.testing.upsampling_python(x_data, scale, layout)
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.image.resize(x, size, layout, method, False)
+        assert "size=" in z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
+        func = relay.Function([x], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    for method in ["BILINEAR", "NEAREST_NEIGHBOR"]:
+        for layout in ["NHWC", "NCHW"]:
+            verify_resize((1, 4, 4, 4), 2, method, layout)
 
 def test_multibox_prior():
     sizes = (0.3, 1.5, 0.7)
@@ -74,5 +104,6 @@ def test_nms():
 
 if __name__ == "__main__":
     test_resize_infer_type()
+    test_resize()
     test_multibox_prior()
     test_nms()

From d329e2d3b011f28fa18e4fd9fc13836c68873f4a Mon Sep 17 00:00:00 2001
From: mohankumarSriram <thacoolsoul@gmail.com>
Date: Wed, 28 Nov 2018 23:02:10 +0530
Subject: [PATCH 420/529] fixing nnvm tutorial typo (#2188)

---
 tutorials/nnvm_quick_start.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py
index 0244cbe81e5e..7ff7f89cfe39 100644
--- a/tutorials/nnvm_quick_start.py
+++ b/tutorials/nnvm_quick_start.py
@@ -50,7 +50,7 @@
 out_shape = (batch_size, num_class)
 
 net, params = nnvm.testing.resnet.get_workload(
-    layers=18, batch_size=batch_size, image_shape=image_shape)
+    num_layers=18, batch_size=batch_size, image_shape=image_shape)
 print(net.debug_str())
 
 ######################################################################

From 0157a9ad8f8426f483d0f09159ac9513fd04001a Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 28 Nov 2018 23:27:40 +0530
Subject: [PATCH 421/529] [Relay]where compute and schedule (#2179)

---
 python/tvm/relay/op/_transform.py    |  1 +
 src/relay/op/tensor/transform.cc     | 11 ++++++++++-
 tests/python/relay/test_op_level4.py | 21 +++++++++++++++++----
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index c1624028fe68..b732c5292080 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -19,3 +19,4 @@
 _reg.register_schedule("split", schedule_injective)
 _reg.register_schedule("take", schedule_injective)
 _reg.register_schedule("transpose", schedule_injective)
+_reg.register_schedule("where", schedule_broadcast)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 53741e666f38..cc5b5c6a9fbc 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -857,6 +857,13 @@ Expr MakeWhere(const Expr& condition, const Expr& x, const Expr& y) {
   return CallNode::make(op, {condition, x, y});
 }
 
+Array<Tensor> WhereCompute(const Attrs& attrs,
+                           const Array<Tensor>& inputs,
+                           const Type& out_type,
+                           const Target& target) {
+  return { topi::where(inputs[0], inputs[1], inputs[2]) };
+}
+
 TVM_REGISTER_API("relay.op._make.where")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   runtime::detail::unpack_call<Expr, 3>(MakeWhere, args, rv);
@@ -896,7 +903,9 @@ Examples::
 .add_argument("y", "Tensor", "Second array to be selected")
 .set_num_inputs(3)
 .set_support_level(4)
-.add_type_rel("Where", WhereRel);
+.add_type_rel("Where", WhereRel)
+.set_attr<FTVMCompute>("FTVMCompute", WhereCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 
 // Squeeze
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index e5da48f107eb..075a58c31acf 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -98,12 +98,25 @@ def test_binary_int_broadcast():
 
 
 def test_where():
-    cond = relay.var("cond", relay.TensorType((3, 4), "float32"))
-    x = relay.var("x", relay.TensorType((3, 4), "float32"))
-    y = relay.var("y", relay.TensorType((3, 4), "float32"))
+    shape = (3, 4)
+    dtype = "float32"
+    cond = relay.var("cond", relay.TensorType(shape, dtype))
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    y = relay.var("y", relay.TensorType(shape, dtype))
     z = relay.where(cond, x, y)
     zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.TensorType((3, 4), "float32")
+    assert zz.checked_type == relay.TensorType(shape, dtype)
+
+    func = relay.Function([cond, x, y], z)
+    condition = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape).astype(dtype)
+    ref_res = np.where(condition, x, y)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(condition, x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32"):

From f90be3fbc1f5d79cb6e2fe6f62e31c471b99d068 Mon Sep 17 00:00:00 2001
From: Pratyush Patel <pratyushpatel.1995@gmail.com>
Date: Wed, 28 Nov 2018 10:32:31 -0800
Subject: [PATCH 422/529] [BACKEND][CODEGEN] C codegen with tests (#2161)

* Implement C code generation with tests

* Code cleanup

* Implement C code generation with tests

* Code cleanup

* tabs to spaces

* make lint compliant

* update export_library and reserve unique C keywords

* move ReserveKeywordsAsUnique to codegen_c

* some documentation and code cleanup

* use tvm.contrib.util for tempdir in testcases
---
 python/tvm/_ffi/libinfo.py                   |  60 +++++
 python/tvm/_ffi/runtime_ctypes.py            |   1 +
 python/tvm/contrib/cc.py                     |   2 +
 python/tvm/module.py                         |  12 +-
 src/codegen/codegen_c.cc                     |  44 +++-
 src/codegen/codegen_c.h                      |   2 +
 src/codegen/codegen_c_host.cc                | 252 +++++++++++++++++++
 src/codegen/codegen_c_host.h                 |  40 +++
 src/codegen/codegen_source_base.h            |   7 +
 src/codegen/source_module.cc                 |  46 ++++
 tests/python/unittest/test_codegen_c_host.py |  87 +++++++
 11 files changed, 544 insertions(+), 9 deletions(-)
 create mode 100644 src/codegen/codegen_c_host.cc
 create mode 100644 src/codegen/codegen_c_host.h
 create mode 100644 tests/python/unittest/test_codegen_c_host.py

diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index f911829d38b1..2fdf5aeb132a 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -99,6 +99,66 @@ def find_lib_path(name=None, search_path=None, optional=False):
     return lib_found
 
 
+def find_include_path(name=None, search_path=None, optional=False):
+    """Find header files for C compilation.
+
+    Parameters
+    ----------
+    name : list of str
+        List of directory names to be searched.
+
+    Returns
+    -------
+    include_path : list(string)
+        List of all found paths to header files.
+    """
+    ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    source_dir = os.path.join(ffi_dir, "..", "..", "..")
+    install_include_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
+    third_party_dir = os.path.join(source_dir, "3rdparty")
+
+    header_path = []
+
+    if os.environ.get('TVM_INCLUDE_PATH', None):
+        header_path.append(os.environ['TVM_INCLUDE_PATH'])
+
+    header_path.append(install_include_dir)
+    header_path.append(source_dir)
+    header_path.append(third_party_dir)
+
+    header_path = [os.path.abspath(x) for x in header_path]
+    if search_path is not None:
+        if search_path is list:
+            header_path = header_path + search_path
+        else:
+            header_path.append(search_path)
+    if name is not None:
+        if isinstance(name, list):
+            tvm_include_path = []
+            for n in name:
+                tvm_include_path += [os.path.join(p, n) for p in header_path]
+        else:
+            tvm_include_path = [os.path.join(p, name) for p in header_path]
+        dlpack_include_path = []
+    else:
+        tvm_include_path = [os.path.join(p, 'include') for p in header_path]
+        dlpack_include_path = [os.path.join(p, 'dlpack/include') for p in header_path]
+
+        # try to find include path
+        include_found = [p for p in tvm_include_path if os.path.exists(p) and os.path.isdir(p)]
+        include_found += [p for p in dlpack_include_path if os.path.exists(p) and os.path.isdir(p)]
+
+    if not include_found:
+        message = ('Cannot find the files.\n' +
+                   'List of candidates:\n' +
+                   str('\n'.join(tvm_include_path + dlpack_include_path)))
+        if not optional:
+            raise RuntimeError(message)
+        return None
+
+    return include_found
+
+
 # current version
 # We use the version of the incoming release for code
 # that is under development.
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index b17487559e50..ef5316b5e267 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -118,6 +118,7 @@ class TVMContext(ctypes.Structure):
         'llvm': 1,
         'stackvm': 1,
         'cpu': 1,
+        'c': 1,
         'gpu': 2,
         'cuda': 2,
         'nvptx': 2,
diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 0ffa6c420243..0361f594de6a 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -7,6 +7,7 @@
 
 from .._ffi.base import py_str
 from .util import tempdir
+from .._ffi.libinfo import find_include_path
 
 
 def create_shared(output,
@@ -49,6 +50,7 @@ def _linux_shared(output, objects, options, cc="g++"):
         cmd += objects
     if options:
         cmd += options
+    cmd += ["-I" + path for path in find_include_path()]
     proc = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
diff --git a/python/tvm/module.py b/python/tvm/module.py
index 1ca09740aff4..cd919722e681 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -97,17 +97,21 @@ def export_library(self,
             self.save(file_name)
             return
 
-        if self.type_key != "llvm":
-            raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key)
+        if not (self.type_key == "llvm" or self.type_key == "c"):
+            raise ValueError("Module[%s]: Only llvm and c support export shared" % self.type_key)
         temp = _util.tempdir()
         if fcompile is not None and hasattr(fcompile, "object_format"):
             object_format = fcompile.object_format
         else:
-            object_format = "o"
+            if self.type_key == "llvm":
+                object_format = "o"
+            else:
+                assert self.type_key == "c"
+                object_format = "cc"
         path_obj = temp.relpath("lib." + object_format)
         self.save(path_obj)
         files = [path_obj]
-        is_system_lib = self.get_function("__tvm_is_system_module")()
+        is_system_lib = self.type_key == "llvm" and self.get_function("__tvm_is_system_module")()
         if self.imported_modules:
             path_cc = temp.relpath("devc.cc")
             with open(path_cc, "w") as f:
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index d902437dd990..3624dc0403aa 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -22,12 +22,43 @@ void CodeGenC::InitFuncState(LoweredFunc f) {
   handle_data_type_.clear();
   CodeGenSourceBase::ClearFuncState();
 }
-void CodeGenC::AddFunction(LoweredFunc f) {
-  // clear previous generated state.
-  this->InitFuncState(f);
+
+void CodeGenC::ReserveKeywordsAsUnique() {
   // skip the first underscore, so SSA variable starts from _1
   GetUniqueName("_");
   GetUniqueName("extern");
+  GetUniqueName("void");
+  GetUniqueName("int");
+  GetUniqueName("float");
+  GetUniqueName("double");
+  GetUniqueName("char");
+  GetUniqueName("unsigned");
+  GetUniqueName("short");
+  GetUniqueName("long");
+  GetUniqueName("if");
+  GetUniqueName("else");
+  GetUniqueName("switch");
+  GetUniqueName("case");
+  GetUniqueName("default");
+  GetUniqueName("for");
+  GetUniqueName("do");
+  GetUniqueName("while");
+  GetUniqueName("goto");
+  GetUniqueName("register");
+  GetUniqueName("continue");
+  GetUniqueName("break");
+  GetUniqueName("typedef");
+  GetUniqueName("struct");
+  GetUniqueName("enum");
+  GetUniqueName("union");
+  GetUniqueName("return");
+}
+
+void CodeGenC::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords
+  ReserveKeywordsAsUnique();
   // add to alloc buffer type.
   for (const auto & kv : f->handle_data_type) {
     RegisterHandleType(kv.first.get(), kv.second.type());
@@ -187,6 +218,7 @@ std::string CodeGenC::GetStructRef(
       case intrinsic::kArrNDim: os << "ndim"; break;
       case intrinsic::kArrTypeCode: os << "dtype.code"; break;
       case intrinsic::kArrTypeBits: os << "dtype.bits"; break;
+      case intrinsic::kArrByteOffset: os << "byte_offset"; break;
       case intrinsic::kArrTypeLanes: os << "dtype.lanes"; break;
       case intrinsic::kArrDeviceId: os << "ctx.device_id"; break;
       case intrinsic::kArrDeviceType: os << "ctx.device_type"; break;
@@ -834,8 +866,10 @@ void CodeGenC::VisitStmt_(const Evaluate *op) {
     }
   }
   std::string vid = this->PrintExpr(op->value);
-  this->PrintIndent();
-  this->stream << "(void)" << vid << ";\n";
+  if (vid != "") {
+    this->PrintIndent();
+    this->stream << "(void)" << vid << ";\n";
+  }
 }
 
 void CodeGenC::VisitStmt_(const ProducerConsumer *op) {
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index b36e37da54fe..c9af24a04a3c 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -183,6 +183,8 @@ class CodeGenC :
   std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const Variable*, Type> handle_data_type_;
+  /*! \brief reserves common C keywords */
+  void ReserveKeywordsAsUnique();
 
  private:
   /*! \brief whether to print in SSA form */
diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc
new file mode 100644
index 000000000000..248354dbc339
--- /dev/null
+++ b/src/codegen/codegen_c_host.cc
@@ -0,0 +1,252 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.cc
+ */
+#include <tvm/packed_func_ext.h>
+#include <vector>
+#include <string>
+#include "codegen_c_host.h"
+#include "build_common.h"
+
+namespace tvm {
+namespace codegen {
+
+CodeGenCHost::CodeGenCHost() {
+  module_name = GetUniqueName("__tvm_module_ctx");
+}
+
+void CodeGenCHost::Init(bool output_ssa) {
+  decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
+  decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
+  decl_stream << "extern void* " << module_name << " = NULL;\n";
+  CodeGenC::Init(output_ssa);
+}
+
+void CodeGenCHost::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords
+  ReserveKeywordsAsUnique();
+  // add to alloc buffer type.
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+
+  this->stream << "#ifdef __cplusplus\n";
+  this->stream << "extern \"C\"\n";
+  this->stream << "#endif\n";
+  this->stream << "TVM_DLL int32_t " << f->name << "(";
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) stream << ", ";
+    if (v.type().is_handle()) {
+      auto it = alloc_storage_scope_.find(v.get());
+      if (it != alloc_storage_scope_.end()) {
+        PrintStorageScope(it->second, stream);
+      }
+      stream << ' ';
+
+      if (handle_data_type_.count(v.get())) {
+        PrintType(handle_data_type_.at(v.get()), stream);
+      } else {
+        stream << "void";
+      }
+      stream << "*";
+
+      if (f->is_restricted && restrict_keyword_.length() != 0) {
+        stream << ' ' << restrict_keyword_;
+      }
+    } else {
+      PrintType(v.type(), stream);
+    }
+    stream << ' ' << vid;
+  }
+  stream << ") {\n";
+  this->PreFunctionBody(f);
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->PrintIndent();
+  this->stream << "return 0;\n";
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+}
+
+std::string CodeGenCHost::Finish() {
+  return CodeGenC::Finish();
+}
+
+void CodeGenCHost::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
+  int lanes = t.lanes();
+  if (t.is_handle()) {
+    CHECK_EQ(lanes, 1)
+        << "does not support vector types";
+    os << "void*"; return;
+  }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
+  bool fail = false;
+  if (t.is_float()) {
+    switch (t.bits()) {
+      case 16:
+        os << "half";
+        break;
+      case 32: os << "float"; break;
+      case 64:
+        os << "double";
+        break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;
+    }
+  } else if (t.is_uint() || t.is_int()) {
+    if (t.is_uint()) {
+      os << 'u';
+    }
+    switch (t.bits()) {
+      case 8: os << "int8_t"; break;
+      case 16: os << "int16_t"; break;
+      case 32: os << "int32_t"; break;
+      case 64: os << "int64_t"; break;
+      case 1: os << "int32_t"; break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;
+    }
+  }
+  LOG(FATAL) << "Cannot convert type " << t << " to C type";
+}
+
+void CodeGenCHost::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  std::string v = PrintExpr(op->value);
+  os << "((";
+  PrintType(op->type, os);
+  os << ")(";
+  for (int i = 0; i < op->lanes; ++i) {
+    if (i != 0) os << ", ";
+    os << v;
+  }
+  os << "))";
+}
+
+void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name) {
+  this->PrintIndent();
+  this->stream << "if (" << packed_func_name << " == NULL) {\n";
+  int packed_func_if_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name
+              << ", \"" << func_name << "\""
+              << ", &" << packed_func_name << ") != 0) {\n";
+  int get_func_env_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(get_func_env_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+  this->EndScope(packed_func_if_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) {
+  this->PrintIndent();
+  std::string ret_val = GetUniqueName("ret_val");
+  std::string ret_type_code = GetUniqueName("ret_type_code");
+  this->stream << "TVMValue " << ret_val << ";\n";
+  this->PrintIndent();
+  this->stream << "int " << ret_type_code << ";\n";
+  this->PrintIndent();
+  this->stream << "if (TVMFuncCall(" << packed_func_name << ", "
+               << "(TVMValue*) stack_value" << ", " << "(int*) stack_tcode" << ", "
+               << num_args << ", " << "&" << ret_val << ", " << "&"
+               << ret_type_code << ") != 0) {\n";
+  int func_call_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(func_call_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*)
+  if (op->is_intrinsic(intrinsic::tvm_stack_alloca)) {
+    std::string stack_name = GetUniqueName("stack");
+    const std::string& type = op->args[0].as<StringImm>()->value;
+    const IntImm* num = op->args[1].as<IntImm>();
+    CHECK(num != nullptr);
+    static_assert(alignof(TVMValue) % alignof(TVMArray) == 0, "invariant");
+    size_t unit = sizeof(TVMValue);
+    size_t size = 0;
+    if (type == "shape") {
+      size = (num->value * sizeof(tvm_index_t) + unit - 1) / unit;
+    } else if (type == "arg_value") {
+      size = (num->value * sizeof(TVMValue) + unit - 1) / unit;
+    } else if (type == "arg_tcode") {
+      size = (num->value * sizeof(int) + unit - 1) / unit;
+    } else if (type == "array") {
+      size = (num->value * sizeof(TVMArray) + unit - 1) / unit;
+    } else {
+      LOG(FATAL) << "Unknown stack alloca type " << type;
+    }
+    this->PrintIndent();
+    this->stream << "TVMValue " << stack_name << "[" << size << "];\n";
+    os << stack_name;
+  } else if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) {
+    const StringImm* s = op->args[0].as<StringImm>();
+    CHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name";
+    int64_t begin = op->args[3].as<IntImm>()->value;
+    int64_t end = op->args[4].as<IntImm>()->value;
+    int64_t num_args = end - begin;
+    CHECK_GE(num_args, 0);
+    std::string func_name = s->value;
+    std::string packed_func_name = GetUniqueName(func_name + "_packed");
+    decl_stream << "static void* " << packed_func_name << " = NULL;\n";
+    this->PrintGetFuncFromBackend(func_name, packed_func_name);
+    this->PrintFuncCall(packed_func_name, num_args);
+  } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
+    this->PrintIndent();
+    this->stream << "return -1;\n";
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
+void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*)
+  std::string cond = PrintExpr(op->condition);
+  PrintIndent();
+  stream << "if (!(" << cond << ")) {\n";
+  int assert_if_scope = this->BeginScope();
+  PrintIndent();
+  stream << "TVMAPISetLastError(\"" << op->message.as<StringImm>()->value << "\");\n";
+  PrintIndent();
+  stream << "return -1;\n";
+  this->EndScope(assert_if_scope);
+  PrintIndent();
+  stream << "}\n";
+  this->PrintStmt(op->body);
+}
+
+runtime::Module BuildCHost(Array<LoweredFunc> funcs) {
+  using tvm::runtime::Registry;
+  bool output_ssa = false;
+  CodeGenCHost cg;
+  cg.Init(output_ssa);
+  for (LoweredFunc f : funcs) {
+    cg.AddFunction(f);
+  }
+  std::string code = cg.Finish();
+  return CSourceModuleCreate(code, "c");
+}
+
+TVM_REGISTER_API("codegen.build_c")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildCHost(args[0]);
+  });
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h
new file mode 100644
index 000000000000..eb47a7829e2c
--- /dev/null
+++ b/src/codegen/codegen_c_host.h
@@ -0,0 +1,40 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.h
+ * \brief Generate C host code.
+ */
+#ifndef TVM_CODEGEN_CODEGEN_C_HOST_H_
+#define TVM_CODEGEN_CODEGEN_C_HOST_H_
+
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <string>
+#include "codegen_c.h"
+
+namespace tvm {
+namespace codegen {
+
+class CodeGenCHost final : public CodeGenC {
+ public:
+  CodeGenCHost();
+  void Init(bool output_ssa);
+  void AddFunction(LoweredFunc f);
+  std::string Finish();
+
+  void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
+
+  // overload visitor functions
+  void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*)
+  void VisitExpr_(const Call *op, std::ostream& os) final; // NOLINT(*)
+  void VisitStmt_(const AssertStmt *op) final; // NOLINT(*)
+
+ private:
+  std::string module_name;
+  void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name);
+  void PrintFuncCall(std::string packed_func_name, int num_args);
+};
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_CODEGEN_CODEGEN_C_HOST_H_
diff --git a/src/codegen/codegen_source_base.h b/src/codegen/codegen_source_base.h
index d2f80a538a33..3fc46c35c7f7 100644
--- a/src/codegen/codegen_source_base.h
+++ b/src/codegen/codegen_source_base.h
@@ -112,6 +112,13 @@ class CodeGenSourceBase {
  */
 runtime::Module SourceModuleCreate(std::string code, std::string fmt);
 
+/*!
+ * \brief Create a C source module for viewing and compiling GCC code.
+ * \param code The code to be viewed.
+ * \param fmt The code. format.
+ */
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt);
+
 /*!
  * \brief Create a source module for viewing and limited saving for device.
  * \param data The code data to be viewed.
diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc
index c7100e18735e..56facea1567f 100644
--- a/src/codegen/source_module.cc
+++ b/src/codegen/source_module.cc
@@ -53,6 +53,52 @@ runtime::Module SourceModuleCreate(std::string code, std::string fmt) {
   return runtime::Module(n);
 }
 
+// Simulator function
+class CSourceModuleNode : public runtime::ModuleNode {
+ public:
+  CSourceModuleNode(std::string code,
+                   std::string fmt)
+      : code_(code), fmt_(fmt) {}
+  const char* type_key() const {
+    return "c";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    LOG(FATAL) << "C Source module cannot execute, to get executable module"
+               << " build TVM with \'" << fmt_ << "\' runtime support";
+    return PackedFunc();
+  }
+
+  std::string GetSource(const std::string& format) final {
+    return code_;
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string fmt = GetFileFormat(file_name, format);
+    std::string meta_file = GetMetaFilePath(file_name);
+    if (fmt == "cc") {
+      CHECK_NE(code_.length(), 0);
+      SaveBinaryToFile(file_name, code_);
+    } else {
+      CHECK_EQ(fmt, fmt_)
+          << "Can only save to format=" << fmt_;
+    }
+  }
+
+ protected:
+  std::string code_;
+  std::string fmt_;
+};
+
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt) {
+  std::shared_ptr<CSourceModuleNode> n =
+      std::make_shared<CSourceModuleNode>(code, fmt);
+  return runtime::Module(n);
+}
+
 // supports limited save without cross compile
 class DeviceSourceModuleNode final : public runtime::ModuleNode {
  public:
diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py
new file mode 100644
index 000000000000..00acbeb88fcf
--- /dev/null
+++ b/tests/python/unittest/test_codegen_c_host.py
@@ -0,0 +1,87 @@
+import tvm
+import numpy as np
+from tvm.contrib import util
+
+def test_add():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+
+    def check_c():
+        f1 = tvm.lower(s, [A, B, C], name="fadd")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m['fadd']
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+    check_c()
+
+def test_add_pipeline():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
+    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
+    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
+    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    xo1, xo2 = s[C].split(xo, factor=13)
+    s[C].parallel(xo2)
+    s[C].pragma(xo1, "parallel_launch_point")
+    s[C].pragma(xo2, "parallel_stride_pattern")
+    s[C].pragma(xo2, "parallel_barrier_when_finish")
+    s[C].vectorize(xi)
+
+    def check_c():
+        if not tvm.module.enabled("llvm"):
+            return
+        # Specifically allow offset to test codepath when offset is available
+        Ab = tvm.decl_buffer(
+            A.shape, A.dtype,
+            elem_offset=tvm.var('Aoffset'),
+            offset_factor=8,
+            name='A')
+        binds = {A : Ab}
+        # BUILD and invoke the kernel.
+        f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m["fadd_pipeline"]
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+
+    with tvm.build_config(offset_factor=4):
+        check_c()
+
+if __name__ == "__main__":
+    test_add()
+    test_add_pipeline()

From 69b1b634ef32cff5af125c3589acade3ae9d17ba Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 29 Nov 2018 06:52:17 +0530
Subject: [PATCH 423/529] [Tutorial]NLP Sequence to sequence model for
 translation (#1815)

* [Tutorial]NLP Sequence to sequence model for translation

* Review comments

* Review comments updated
---
 nnvm/python/nnvm/frontend/keras.py            |  45 +++-
 .../python/frontend/keras/test_forward.py     |  20 +-
 tutorials/nnvm/nlp/keras_s2s_translate.py     | 238 ++++++++++++++++++
 3 files changed, 286 insertions(+), 17 deletions(-)
 create mode 100644 tutorials/nnvm/nlp/keras_s2s_translate.py

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index a1e089b210c5..9dabebc14b90 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -131,6 +131,14 @@ def _convert_dense(insym, keras_layer, symtab):
     if keras_layer.use_bias:
         params['use_bias'] = True
         params['bias'] = symtab.new_const(weightList[1])
+    input_shape = keras_layer.input_shape
+    input_dim = len(input_shape)
+    # In case of RNN dense, input shape will be (1, 1, n)
+    if input_dim > 2:
+        input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0])
+        if input_dim != 3 or input_shape[0] != 1 or input_shape[1] != 1:
+            raise ValueError("Cannot flatten the inputs with shape.", input_shape, " for dense.")
+        insym = _sym.squeeze(insym, axis=0)
     out = _sym.dense(data=insym, **params)
     # defuse activation
     if sys.version_info.major < 3:
@@ -139,6 +147,8 @@ def _convert_dense(insym, keras_layer, symtab):
         act_type = keras_layer.activation.__name__
     if act_type != 'linear':
         out = _convert_activation(out, act_type, symtab)
+    if input_dim > 2:
+        out = _sym.expand_dims(out, axis=0)
     return out
 
 
@@ -408,10 +418,11 @@ def _convert_lstm(insym, keras_layer, symtab):
         insym = [insym, h_sym, c_sym]
 
     in_data = insym[0]
-    in_state_h = insym[1]
-    in_state_c = insym[2]
+    next_h = insym[1]
+    next_c = insym[2]
 
     weightList = keras_layer.get_weights()
+    inp_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.input_shape)[0])
 
     kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
     recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
@@ -419,16 +430,20 @@ def _convert_lstm(insym, keras_layer, symtab):
 
     units = list(weightList[0].shape)[1]
 
-    in_data = _sym.flatten(in_data)
-    ixh1 = _sym.dense(in_data, kernel_wt, use_bias=False, units=units)
-    ixh2 = _sym.dense(in_state_h, recurrent_wt, in_bias, use_bias=True, units=units)
-    gate = ixh1 + ixh2
-    gates = _sym.split(gate, indices_or_sections=4, axis=1)
-    in_gate = _convert_recurrent_activation(gates[0], keras_layer)
-    in_transform = _convert_recurrent_activation(gates[1], keras_layer)
-    next_c = in_transform * in_state_c + in_gate * _convert_activation(gates[2], keras_layer, None)
-    out_gate = _convert_recurrent_activation(gates[3], keras_layer)
-    next_h = out_gate * _convert_activation(next_c, keras_layer, None)
+    time_steps = inp_shape[1]
+    in_data = _sym.squeeze(in_data, axis=0)
+    in_data = _sym.split(in_data, indices_or_sections=time_steps, axis=0)
+    #loop for the number of time_steps
+    for data in in_data:
+        ixh1 = _sym.dense(data, kernel_wt, use_bias=False, units=units)
+        ixh2 = _sym.dense(next_h, recurrent_wt, in_bias, use_bias=True, units=units)
+        gate = ixh1 + ixh2
+        gates = _sym.split(gate, indices_or_sections=4, axis=1)
+        in_gate = _convert_recurrent_activation(gates[0], keras_layer)
+        in_transform = _convert_recurrent_activation(gates[1], keras_layer)
+        next_c = in_transform * next_c + in_gate * _convert_activation(gates[2], keras_layer, None)
+        out_gate = _convert_recurrent_activation(gates[3], keras_layer)
+        next_h = out_gate * _convert_activation(next_c, keras_layer, None)
 
     out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
     out = _sym.reshape(next_h, shape=out_shape)
@@ -656,6 +671,12 @@ def from_keras(model):
                 raise TypeError("Unknown layer type or unsupported Keras version : {}"
                                 .format(keras_layer))
             for node_idx, node in enumerate(inbound_nodes):
+                # If some nodes in imported model is not relevant to the current model,
+                # skip such layers. model._network_nodes contains keys of all nodes relevant
+                # to the current model.
+                if not model._node_key(keras_layer, node_idx) in model._network_nodes:
+                    continue
+
                 insym = []
 
                 # Since Keras allows creating multiple layers from the same name instance,
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 96c51a94ff69..618af3b2e417 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -74,7 +74,7 @@ def test_forward_elemwise_add():
     verify_keras_frontend(keras_model)
 
 
-def test_forward_dense():
+def _test_forward_dense():
     data = keras.layers.Input(shape=(32,32,1))
     x = keras.layers.Flatten()(data)
     x = keras.layers.Dropout(0.5)(x)
@@ -82,6 +82,15 @@ def test_forward_dense():
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
+def _test_forward_dense_with_3d_inp():
+    data = keras.layers.Input(shape=(1, 20))
+    x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_dense():
+    _test_forward_dense()
+    _test_forward_dense_with_3d_inp()
 
 def test_forward_pool():
     data = keras.layers.Input(shape=(32,32,1))
@@ -226,8 +235,8 @@ def test_forward_reuse_layers():
     keras_model = keras.models.Model(data, z)
     verify_keras_frontend(keras_model)
 
-def _test_LSTM(inputs, hidden, return_state=True):
-    data = keras.layers.Input(shape=(1, inputs))
+def _test_LSTM(time_steps, inputs, hidden, return_state=True):
+    data = keras.layers.Input(shape=(time_steps, inputs))
     lstm_out = keras.layers.LSTM(hidden,
                                  return_state=return_state,
                                  recurrent_activation='sigmoid',
@@ -250,8 +259,9 @@ def _test_LSTM_MultiLayer(inputs, hidden):
 
 
 def test_forward_LSTM():
-    _test_LSTM(8, 8, return_state=True)
-    _test_LSTM(4, 4, return_state=False)
+    _test_LSTM(1, 8, 8, return_state=True)
+    _test_LSTM(1, 4, 4, return_state=False)
+    _test_LSTM(20, 16, 256, return_state=False)
     _test_LSTM_MultiLayer(4, 4)
 
 def _test_RNN(inputs, units):
diff --git a/tutorials/nnvm/nlp/keras_s2s_translate.py b/tutorials/nnvm/nlp/keras_s2s_translate.py
new file mode 100644
index 000000000000..77c7f23902f4
--- /dev/null
+++ b/tutorials/nnvm/nlp/keras_s2s_translate.py
@@ -0,0 +1,238 @@
+"""
+Keras LSTM Sequence to Sequence Model for Translation
+=================================
+**Author**: `Siju Samuel <https://siju-samuel.github.io/>`_
+
+This script demonstrates how to implement a basic character-level sequence-to-sequence model.
+We apply it to translating short English sentences into short French sentences,
+character-by-character.
+
+# Summary of the algorithm
+
+- We start with input sequences from a domain (e.g. English sentences)
+    and corresponding target sequences from another domain
+    (e.g. French sentences).
+- An encoder LSTM turns input sequences to 2 state vectors
+    (we keep the last LSTM state and discard the outputs).
+- A decoder LSTM is trained to turn the target sequences into
+    the same sequence but offset by one timestep in the future,
+    a training process called "teacher forcing" in this context.
+    Is uses as initial state the state vectors from the encoder.
+    Effectively, the decoder learns to generate `targets[t+1...]`
+    given `targets[...t]`, conditioned on the input sequence.
+
+This script loads the s2s.h5 model saved in repository
+https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/lstm_seq2seq.py
+and generates sequences from it.  It assumes that no changes have been made (for example:
+latent_dim is unchanged, and the input data and model architecture are unchanged).
+
+# References
+
+- Sequence to Sequence Learning with Neural Networks
+    https://arxiv.org/abs/1409.3215
+- Learning Phrase Representations using
+    RNN Encoder-Decoder for Statistical Machine Translation
+    https://arxiv.org/abs/1406.1078
+
+See lstm_seq2seq.py for more details on the model architecture and how it is trained.
+"""
+
+from keras.models import Model, load_model
+from keras.layers import Input
+import random
+import os
+import numpy as np
+import keras
+import tvm
+import nnvm
+
+######################################################################
+# Download required files
+# -----------------------
+# Download files listed below from dmlc web-data repo.
+model_file = "s2s_translate.h5"
+data_file = "fra-eng.txt"
+
+# Base location for model related files.
+repo_base = 'https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/'
+model_url = os.path.join(repo_base, model_file)
+data_url = os.path.join(repo_base, data_file)
+
+# Download files listed below.
+from mxnet.gluon.utils import download
+download(model_url, model_file)
+download(data_url, model_file)
+
+latent_dim = 256  # Latent dimensionality of the encoding space.
+test_samples = 10000  # Number of samples used for testing.
+
+######################################################################
+# Process the data file
+# ---------------------
+# Vectorize the data.  We use the same approach as the training script.
+# NOTE: the data must be identical, in order for the character -> integer
+# mappings to be consistent.
+input_texts = []
+target_texts = []
+input_characters = set()
+target_characters = set()
+with open(data_file, 'r', encoding='utf-8') as f:
+    lines = f.read().split('\n')
+test_samples = min(test_samples, len(lines))
+max_encoder_seq_length = 0
+max_decoder_seq_length = 0
+for line in lines[:test_samples]:
+    input_text, target_text = line.split('\t')
+    # We use "tab" as the "start sequence" character
+    # for the targets, and "\n" as "end sequence" character.
+    target_text = '\t' + target_text + '\n'
+    max_encoder_seq_length = max(max_encoder_seq_length, len(input_text))
+    max_decoder_seq_length = max(max_decoder_seq_length, len(target_text))
+    for char in input_text:
+        if char not in input_characters:
+            input_characters.add(char)
+    for char in target_text:
+        if char not in target_characters:
+            target_characters.add(char)
+
+input_characters = sorted(list(input_characters))
+target_characters = sorted(list(target_characters))
+num_encoder_tokens = len(input_characters)
+num_decoder_tokens = len(target_characters)
+input_token_index = dict(
+    [(char, i) for i, char in enumerate(input_characters)])
+target_token_index = dict(
+    [(char, i) for i, char in enumerate(target_characters)])
+
+# Reverse-lookup token index to decode sequences back to something readable.
+reverse_target_char_index = dict(
+    (i, char) for char, i in target_token_index.items())
+
+######################################################################
+# Load Keras Model
+# ----------------
+# Restore the model and construct the encoder and decoder.
+model = load_model(model_file)
+encoder_inputs = model.input[0]   # input_1
+
+encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output   # lstm_1
+encoder_states = [state_h_enc, state_c_enc]
+encoder_model = Model(encoder_inputs, encoder_states)
+
+decoder_inputs = model.input[1]   # input_2
+decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
+decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')
+decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+decoder_lstm = model.layers[3]
+decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
+    decoder_inputs, initial_state=decoder_states_inputs)
+decoder_states = [state_h_dec, state_c_dec]
+decoder_dense = model.layers[4]
+decoder_outputs = decoder_dense(decoder_outputs)
+decoder_model = Model(
+    [decoder_inputs] + decoder_states_inputs,
+    [decoder_outputs] + decoder_states)
+
+######################################################################
+# Compile both encoder and decoder model on NNVM
+# ----------------------------------------------
+# Creates NNVM graph definition from keras model file.
+from tvm.contrib import graph_runtime
+target = 'llvm'
+ctx = tvm.cpu(0)
+
+# Parse Encoder model
+sym, params = nnvm.frontend.from_keras(encoder_model)
+inp_enc_shape = (1, max_encoder_seq_length, num_encoder_tokens)
+shape_dict = {'input_1': inp_enc_shape}
+
+# Build Encoder model
+with nnvm.compiler.build_config(opt_level=2):
+    enc_graph, enc_lib, enc_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Encoder build ok.")
+
+# Create graph runtime for encoder model
+tvm_enc = graph_runtime.create(enc_graph, enc_lib, ctx)
+tvm_enc.set_input(**enc_params)
+
+# Parse Decoder model
+inp_dec_shape = (1, 1, num_decoder_tokens)
+shape_dict = {'input_2': inp_dec_shape,
+              'input_3': (1, latent_dim),
+              'input_4': (1, latent_dim)}
+
+# Build Decoder model
+sym, params = nnvm.frontend.from_keras(decoder_model)
+with nnvm.compiler.build_config(opt_level=2):
+    dec_graph, dec_lib, dec_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Decoder build ok.")
+
+# Create graph runtime for decoder model
+tvm_dec = graph_runtime.create(dec_graph, dec_lib, ctx)
+tvm_dec.set_input(**dec_params)
+
+# Decodes an input sequence.
+def decode_sequence(input_seq):
+    # Set the input for encoder model.
+    tvm_enc.set_input('input_1', input_seq)
+
+    # Run encoder model
+    tvm_enc.run()
+
+    # Get states from encoder network
+    h = tvm_enc.get_output(0).asnumpy()
+    c = tvm_enc.get_output(1).asnumpy()
+
+    # Populate the first character of target sequence with the start character.
+    sampled_token_index = target_token_index['\t']
+
+    # Sampling loop for a batch of sequences
+    decoded_sentence = ''
+    while True:
+        # Generate empty target sequence of length 1.
+        target_seq = np.zeros((1, 1, num_decoder_tokens), dtype='float32')
+        # Update the target sequence (of length 1).
+        target_seq[0, 0, sampled_token_index] = 1.
+
+        # Set the input and states for decoder model.
+        tvm_dec.set_input('input_2', target_seq)
+        tvm_dec.set_input('input_3', h)
+        tvm_dec.set_input('input_4', c)
+        # Run decoder model
+        tvm_dec.run()
+
+        output_tokens = tvm_dec.get_output(0).asnumpy()
+        h = tvm_dec.get_output(1).asnumpy()
+        c = tvm_dec.get_output(2).asnumpy()
+
+        # Sample a token
+        sampled_token_index = np.argmax(output_tokens[0, -1, :])
+        sampled_char = reverse_target_char_index[sampled_token_index]
+
+        # Exit condition: either hit max length or find stop character.
+        if sampled_char == '\n':
+            break
+
+        # Update the sentence
+        decoded_sentence += sampled_char
+        if len(decoded_sentence) > max_decoder_seq_length:
+            break
+    return decoded_sentence
+
+def generate_input_seq(input_text):
+    input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
+    for t, char in enumerate(input_text):
+        input_seq[0, t, input_token_index[char]] = 1.
+    return input_seq
+
+######################################################################
+# Run the model
+# -------------
+# Randonly take some text from test samples and translate
+for seq_index in range(100):
+    # Take one sentence randomly and try to decode.
+    index = random.randint(1, test_samples)
+    input_text, _ = lines[index].split('\t')
+    input_seq = generate_input_seq(input_text)
+    decoded_sentence = decode_sequence(input_seq)
+    print((seq_index + 1), ": ", input_text,  "==>", decoded_sentence)

From 7a69969f6239e8b87b959ebb6f36b01d017e59e2 Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Thu, 29 Nov 2018 10:32:27 +0800
Subject: [PATCH 424/529] [FRONTEND][TENSORFLOW] Support AttrValue that has
 different types of value in a list (#2177)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index b0b546a32b3d..26e59dc7e830 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -1280,9 +1280,9 @@ def _get_attr(self, buf):
             for f in fields:
                 if getattr(x.list, f):
                     if f == "type":
-                        ret = [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+                        ret += [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
                     else:
-                        ret = list(getattr(x.list, f))
+                        ret += list(getattr(x.list, f))
         else:
             for f in fields:
                 if x.HasField(f):

From 6de05720b676745d1a48fff3edcc488bedf00f80 Mon Sep 17 00:00:00 2001
From: Dmitrii Murygin <dmitrij.murygin@bk.ru>
Date: Thu, 29 Nov 2018 08:12:28 +0300
Subject: [PATCH 425/529] [TOPI] Add tensor multiplication. (#2106)

---
 topi/include/topi/transform.h         | 115 ++++++++++++++++++++++++++
 topi/python/topi/transform.py         |  20 +++++
 topi/src/topi.cc                      |  12 +++
 topi/tests/python/test_topi_matmul.py |  17 ++++
 4 files changed, 164 insertions(+)

diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 835f4272b940..157c19e55249 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -753,6 +753,121 @@ inline tvm::Tensor matmul(const tvm::Tensor& A,
   return tvm::compute(output_shape, l, name, tag);
 }
 
+/*!
+ * \brief A generalization of matrix multiplication to tensors.
+ *
+ * \param A The tensor A
+ * \param B The tensor B
+ * \param axes The number of the dimensions to reduce over
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor computing the result
+ */
+inline Tensor tensordot(const Tensor& A,
+                        const tvm::Tensor& B,
+                        int axes = 2,
+                        std::string name = "tensor",
+                        std::string tag = kMatMul) {
+  CHECK_GE(A->shape.size(), axes);
+  CHECK_GE(B->shape.size(), axes);
+
+  Array<Expr> output_shape(A->shape.begin(), A->shape.end() + (-axes));
+  for (auto it = B->shape.begin() + axes; it != B->shape.end(); ++it)
+    output_shape.push_back(*it);
+
+  Array<IterVar> iter_vars;
+  for (int i = 0; i < axes; ++i)
+    iter_vars.push_back(reduce_axis(Range(0, B->shape[i]), "k" + std::to_string(i)));
+
+  auto func =
+    [&A, &B, &iter_vars, axes]
+    (const Array<Var>& input_indices) {
+      Array<Expr> A_indices(
+          input_indices.begin(),
+          input_indices.begin() + (A->shape.size() - axes));
+      for (auto& v : iter_vars)
+        A_indices.push_back(v);
+
+      Array<Expr> B_indices;
+      for (auto& v : iter_vars)
+        B_indices.push_back(v);
+
+      auto it = input_indices.begin() + (A->shape.size() - axes);
+      for (; it != input_indices.end(); ++it)
+        B_indices.push_back(*it);
+
+      // Some passes don't like reductions with empty axis, so avoid it here
+      if (iter_vars.empty())
+        return A(A_indices) * B(B_indices);
+      else
+        return sum(A(A_indices) * B(B_indices), iter_vars);
+    };
+
+  return compute(output_shape, func, name, tag);
+}
+
+/*!
+ * \brief A generalization of matrix multiplication to tensors.
+ *
+ * \param A The tensor A
+ * \param B The tensor B
+ * \param A_axes The indices of the dimensions of tensor A to reduce over
+ * \param B_axes The indices of the dimensions of tensor B to reduce over
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor computing the result
+ */
+inline Tensor tensordot(const Tensor& A,
+                        const tvm::Tensor& B,
+                        Array<Expr> A_axes,
+                        Array<Expr> B_axes,
+                        std::string name = "tensor",
+                        std::string tag = kMatMul) {
+  CHECK_EQ(A_axes.size(), B_axes.size());
+
+  auto A_axes_val = GetConstIntValues(A_axes, "A_axes");
+  auto B_axes_val = GetConstIntValues(B_axes, "B_axes");
+
+  Array<Expr> output_shape;
+  for (unsigned i = 0; i < A->shape.size(); ++i)
+    if (std::find(A_axes_val.begin(), A_axes_val.end(), i) == A_axes_val.end())
+      output_shape.push_back(A->shape[i]);
+  for (unsigned i = 0; i < B->shape.size(); ++i)
+    if (std::find(B_axes_val.begin(), B_axes_val.end(), i) == B_axes_val.end())
+      output_shape.push_back(B->shape[i]);
+
+  Array<IterVar> iter_vars;
+    for (unsigned i = 0; i < B_axes_val.size(); ++i)
+      iter_vars.push_back(reduce_axis(Range(0, B->shape[B_axes_val[i]]), "k" + std::to_string(i)));
+
+  auto func =
+    [&A, &B, &iter_vars, A_axes_val, B_axes_val]
+    (const Array<Var>& input_indices) {
+      int idx_input = 0;
+      Array<Expr> A_indices;
+      for (unsigned i = 0; i < A->shape.size(); ++i) {
+        auto axes_pos = std::find(A_axes_val.begin(), A_axes_val.end(), i);
+        if (axes_pos == A_axes_val.end())
+          A_indices.push_back(input_indices[idx_input++]);
+        else
+          A_indices.push_back(iter_vars[axes_pos - A_axes_val.begin()]);
+      }
+
+      Array<Expr> B_indices;
+      for (unsigned i = 0; i < B->shape.size(); ++i) {
+        auto axes_pos = std::find(B_axes_val.begin(), B_axes_val.end(), i);
+        if (axes_pos == B_axes_val.end())
+          B_indices.push_back(input_indices[idx_input++]);
+        else
+          B_indices.push_back(iter_vars[axes_pos - B_axes_val.begin()]);
+      }
+      return sum(A(A_indices) * B(B_indices), iter_vars);
+    };
+  return compute(output_shape, func, name, tag);
+}
+
 
 }  // namespace topi
 #endif  // TOPI_TRANSFORM_H_
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index 4cc6cb7e9f34..b9a7bd4f2992 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -269,3 +269,23 @@ def matmul(a, b, transp_a=False, transp_b=False):
     A Tensor whose op member is the matmul operation
     """
     return cpp.matmul(a, b, transp_a, transp_b)
+
+
+def tensordot(a, b, axes):
+    """A generalization of matrix multiplication to tensor.
+
+    Parameters
+    ----------
+    a : The tensor A
+    b : The tensor B
+    axes : The number of dimensions to reduce over
+
+    Returns
+    -------
+    A Tensor computing the result
+    """
+    if isinstance(axes, int):
+        return cpp.tensordot(a, b, axes)
+    if isinstance(axes[0], int):
+        return cpp.tensordot(a, b, (axes[0],), (axes[1],))
+    return cpp.tensordot(a, b, axes[0], axes[1])
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 13a5ccad654c..fe2af0561ea7 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -305,6 +305,18 @@ TVM_REGISTER_GLOBAL("topi.matmul")
     default: CHECK(0) << "topi.matmul expects 2, 3 or 4 arguments";
   }});
 
+TVM_REGISTER_GLOBAL("topi.tensordot")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  if (args.size() == 2) {
+    *rv = tensordot(args[0], args[1]);
+  } else if (args.size() == 3) {
+    *rv = tensordot(args[0], args[1], args[2]);
+  } else {
+    Array<Expr> axes = args[3];
+    *rv = tensordot(args[0], args[1], args[2], axes);
+  }
+  });
+
 TVM_REGISTER_GLOBAL("topi.strided_slice")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = strided_slice(args[0], args[1], args[2], args[3]);
diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py
index bd79bc4cba41..2a8eaeb608dd 100644
--- a/topi/tests/python/test_topi_matmul.py
+++ b/topi/tests/python/test_topi_matmul.py
@@ -39,6 +39,23 @@ def test_matmul():
     verify_matmul((3,5),(3,2),True,False)
     verify_matmul((3,5),(2,3),True,True)
 
+def verify_tensordot(sa, sb, axes):
+    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
+    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
+    c1 = np.tensordot(a, b, axes)
+    c2 = with_tvm(lambda A, B: topi.tensordot(A, B, axes), a, b)
+    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
+
+def test_tensordot():
+    verify_tensordot((3), (3), 0)
+    verify_tensordot((2, 3), (3, 5), 1)
+    verify_tensordot((2, 2, 3), (2, 3, 5), 2)
+    verify_tensordot((2, 2, 3, 4), (2, 3, 4, 5), 3)
+    verify_tensordot((3, 2, 2), (2, 3, 5), (1, 0))
+    verify_tensordot((3, 2, 2), (2, 3, 5), ((1, 0), (0, 1)))
+    verify_tensordot((4, 3, 2, 2), (2, 4, 3, 5), ((1, 2, 0), (2, 0, 1)))
+
 if __name__ == "__main__":
     test_matmul()
+    test_tensordot()
 

From 04a5ee7858074d63ee732d823dfcdf152df0eb52 Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Fri, 30 Nov 2018 02:01:02 +0800
Subject: [PATCH 426/529] Update comments for the API tvm.lower (#2193)

tvm.Schedule ==> tvm.schedule.Schedule
---
 python/tvm/build_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index d65642340bad..79debc9a2c45 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -299,7 +299,7 @@ def lower(sch,
 
     Parameters
     ----------
-    sch : tvm.Schedule
+    sch : tvm.schedule.Schedule
         The schedule to be builded
 
     args : list of Buffer or Tensor or Var

From 85c7c788e13bcaef4c28f1f89049c172319e0281 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 29 Nov 2018 10:01:49 -0800
Subject: [PATCH 427/529] [COMMUNITY] @grwlf -> Reviewer (#2190)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 29b1c222455c..185bf329f6d7 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -38,6 +38,7 @@ We do encourage everyone to work anything they are interested in.
 - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
 - [Wuwei Lin](https://github.com/vinx13): @vinx13
 - [Masahiro Masuda](https://github.com/masahi): @masahi
+- [Sergey Mironov](https://github.com/grwlf): @grwlf
 - [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89
 - [Kazutaka Morita](https://github.com/kazum): @kazum
 - [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t

From ce9a07e9a1b0608b60b9c01c31d71807654a4cdb Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Thu, 29 Nov 2018 10:01:58 -0800
Subject: [PATCH 428/529] dockerfile cpu changes (#2191)

---
 docker/Dockerfile.ci_cpu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index e6e2dd7a37b0..11a77adbfdde 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -40,3 +40,10 @@ COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh
 RUN bash /install/ubuntu_install_nnpack.sh
 
 ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
+
+# ANTLR deps
+COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
+RUN bash /install/ubuntu_install_java.sh
+
+COPY install/ubuntu_install_antlr.sh /install/ubuntu_install_antlr.sh
+RUN bash /install/ubuntu_install_antlr.sh

From 5cf3265ecb010abc39f55b50a8f56a1de0aff7ae Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 29 Nov 2018 10:02:15 -0800
Subject: [PATCH 429/529] [DOCS] Introduction to Relay IR. (#2185)

---
 docs/dev/index.rst       |   3 +-
 docs/dev/relay_intro.rst | 188 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 docs/dev/relay_intro.rst

diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index c7a52c6de13b..2734a816dc68 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -12,4 +12,5 @@ In this part of documentation, we share the rationale for the specific choices m
    nnvm_json_spec
    nnvm_overview
    hybrid_script
-   relay_add_op
+   relay_intro
+   relay_add_op
\ No newline at end of file
diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst
new file mode 100644
index 000000000000..d3c83590cbb8
--- /dev/null
+++ b/docs/dev/relay_intro.rst
@@ -0,0 +1,188 @@
+Introduction to Relay IR
+========================
+This article introduces Relay IR -- the second generation of NNVM.
+We expect readers from two kinds of background -- those who have a programming language background and deep learning
+framework developers who are familiar with the computational graph representation.
+
+We briefly summarize the design goal here, and will touch upon these points in the later part of the article.
+
+- Support traditional data flow style programming and transformations.
+- Support functional style scoping, let-binding and making it fully featured differentiable language.
+- Being able to allow the user to mix the two programming styles.
+
+Build Computational Graph with Relay
+------------------------------------
+Traditional deep learning frameworks use computational graphs as their intermediate representation.
+A computational graph (or data-flow graph), is a directed acyclic graph (DAG) that represents the computation.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow.png
+    :align: center
+    :scale: 70%
+
+
+You can use Relay to build a computational(dataflow) graph. Specifically, the above code shows how to
+construct a simple two-node graph. You can find that the syntax of the example is not that different from existing
+computational graph IR like NNVMv1, with the only difference in terms of terminology:
+
+- Existing frameworks usually use graph and subgraph
+- Relay uses function e.g. --  ``fn (%x)``, to indicate the graph
+
+Each data-flow node is a CallNode in Relay. The relay python DSL allows you to construct a data-flow quickly.
+One thing we want to highlight in the above code -- is that we explicitly constructed an Add node with
+both input point to ``%1``.  When a deep learning framework evaluates the above program, it will compute
+the nodes in topological order, and ``%1`` will only be computed once.
+While this fact is very natural to deep learning framework builders, it is something that might
+surprise a PL folk in the first place.  If we implement a simple visitor to print out the result and
+treat the result as nested Call expression, it becomes ``log(%x) + log(%x)``.
+
+Such ambiguity is caused by different interpretation of program semantics when there is a shared node in the DAG.
+In a normal functional programming IR, nested expressions are treated as expression trees, without considering the
+fact that the ``%1`` is actually reused twice in ``%2``.
+
+Relay IR choose to be mindful of this difference. Usually, deep learning framework users build the computational
+graph in this fashion, where a DAG node reuse often occur. As a result, when we print out the Relay program in
+the text format, we print one CallNode per line and assign a temporary id ``(%1, %2)`` to each CallNode so each common
+node can be referenced in later parts of the program.
+
+Module: Support Multiple Functions(Graphs)
+------------------------------------------
+So far we have introduced how can we build a data flow graph as a function. One might naturally ask -- can we support multiple
+functions and enable them to call each other. Relay allows grouping multiple functions together in a module, the code below
+shows an example of a function calling another function.
+
+.. code::
+
+   def @muladd(%x, %y, %z) {
+     %1 = mul(%x, %y)
+     %2 = add(%x, %z)
+     %2
+   }
+   def @myfunc(%x) {
+     %1 = @muladd(%x, 1, 2)
+     %2 = @muladd(%1, 2, 3)
+     %2
+   }
+
+The Module can be viewed as a ``Map<GlobalVar, Function>``. Here GlobalVar is just an id that is used to represent the functions
+in the module. ``@muladd`` and ``@myfunc`` are GlobalVars in the above example. When a CallNode is used to call another function,
+the corresponding GlobalVar is stored in the op field of the CallNode. It contains a level of indirection -- we need to look up
+body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly
+store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that
+GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function.
+
+.. code ::
+
+  @def myfunc(%x) {
+    %1 = equal(%x, 1)
+     if (%1) {
+        %x
+     } else {
+       %2 = sub(%x, 1)
+       %3 = @myfunc(%2)
+        %4 = add(%3, %3)
+        %4
+    }
+  }
+
+In the above example, ``@myfunc`` recursively calls itself. Using GlobalVar ``@myfunc`` to represent the function avoids
+the cyclic dependency in the data structure.
+At this point, we have introduced the basic concepts in Relay. Notably, Relay has the following improvements over NNVMv1:
+
+- Succinct text format that eases debugging of writing passes.
+- First-class support for subgraphs-functions, in a joint module, this enables further chance of joint optimizations such as inlining and calling convention specification.
+- Naive front-end language interop, for example, all the data structure can be visited in python, which allows quick prototyping of optimizations in python and mixing them with c++ code.
+
+
+Let Binding and Scopes
+----------------------
+
+So far, we have introduced how to build a computational graph in the good old way used in deep learning frameworks.
+This section will talk about a new important construct introduced by Relay -- let bindings.
+
+Let binding is used in every high-level programming languages. In Relay, it is a data structure with three
+fields ``Let(var, value, body)``. When we evaluate a let expression, we first evaluate the value part, assign
+it to the var, then return the evaluated result in the body expression.
+
+You can use a sequence of let bindings to construct a logically equivalent program to a data-flow program.
+The code example below shows one program with two forms side by side.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow_vs_func.png
+    :align: center
+    :scale: 70%
+
+
+The nested let-binding is called A-normal form, and it is commonly used as IRs in functional programming languages.
+Now, please take a close look at the AST structure. While the two programs are semantically identical
+(so are their textual representations, except that A-normal form has let prefix), their AST structures are different from each other.
+
+Since program optimizations take these AST data structures and transform them, the two different structure will
+affect the compiler code we are going to write. For example, if we want to detect a pattern ``add(log(x), y)``:
+
+- In the data-flow form, we can first access the add node, then directly look at its first argument to see if it is a log
+- In the A-normal form, we cannot directly do the check anymore, because the first input to add is ``%v1`` -- we will need to keep a map from variable to its bound values and lookup that map, in order to know that ``%v1`` is a log.
+
+Different data structures will impact how you might write transformations, and we need to keep that in mind.
+So now, as a deep learning framework developer, you might ask, why do we need let-binding.
+Your PL friends will always tell you that let is important -- as PL is a quite established field,
+there must be some wisdom behind that.
+
+
+Why We Might Need Let Binding
+-----------------------------
+One key usage of let binding is that it specifies the scope of computation. Let us take look at the following example,
+which does not use let binding.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/let_scope.png
+    :align: center
+    :scale: 70%
+
+The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems
+to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so.
+Actually, a dataflow graph never defines its scope of the evaluation. This introduces some ambiguity in the semantics.
+
+This ambiguity becomes more interesting when we have closures. Consider the following program, which returns a closure.
+We don’t know where should we compute ``%1``. It can either be outside the closure, or inside the closure.
+
+.. code::
+
+  fn (%x) {
+    %1 = log(%x)
+    %2 = fn(%y) {
+      add(%y, %1)
+    }
+    %2
+  }
+
+Let binding solves this problem, as the computation of the value happens at the let node. In both programs,
+if we change ``%1 = log(%x)`` to ``let %v1 = log(%x)``, we clearly specify the computation location to
+be outside of the if scope and closure. As you can see let-binding gives a more precise specification of the computation site
+and could be useful when we generate backend code(as such specification is in the IR).
+
+On the other hand, the data-flow form, which does not specify the scope of computation, does have its own advantages
+-- we don’t need to worry about where to put the let when we generate the code. The dataflow form also gives more freedom
+to the later passes to decide where to put the evaluation point. As a result, it might not be a bad idea to use data flow
+form of the program in the initial phases of optimizations when you find it is convenient.
+Many optimizations in Relay today are written to optimize dataflow programs.
+
+However, when we lower the IR to actual runtime program, we need to be precise about the scope of computation.
+In particular, we want to explicitly specify where the scope of computation should happen when we are using
+sub-functions and closures. Let-binding can be used to solve this problem in later stage execution specific optimizations.
+
+
+Implication on IR Transformations
+---------------------------------
+
+Hopefully, by now you are familiar with the two kinds of representations.
+Most functional programming languages do their analysis in A-normal form,
+where the analyzer does not need to be mindful that the expressions are DAGs.
+
+Relay choose to support both the data-flow form and let binding. We believe that it is important to let the
+framework developer choose the representation they are familiar with.
+This does, however, have some implications on how we write passes:
+
+- If you come from a data-flow background and want to handle let, keep a map of var to the expressions so you can perform lookup when encountering a var. This likely means a minimum change as we already need a map from expr -> transformed expression anyway. Note that this will effectively remove all the let in the program.
+- If you come from a PL background and like A-normal form, we will provide a dataflow -> A-normal form pass.
+- For PL folks, when you are implementing something (like dataflow->ANF transformation), be mindful that the expression can be DAG, and this usually means that we should visit expressions with a ``Map<Expr, Result>`` and only compute the transformed result once, so the result expression keeps the common structure.
+
+There are additional advanced concepts such as symbolic shape inference, polymorphic functions
+that are not covered by this material, you are more than welcomed to look at other materials.

From 8c7e40986fca0de5730c04136343865cd0d0bc5b Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 29 Nov 2018 23:36:29 +0530
Subject: [PATCH 430/529] [Relay]collapse_sum and broadcast_to compute &
 schedule (#2180)

---
 python/tvm/relay/op/_transform.py     |  6 +++--
 src/relay/op/tensor/transform.cc      | 28 +++++++++++++++++--
 tests/python/relay/test_op_level10.py | 39 ++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index b732c5292080..3093032f9e40 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -2,13 +2,15 @@
 # pylint: disable=invalid-name
 from __future__ import absolute_import
 from . import op as _reg
+from ._reduce import _schedule_reduce
 
 schedule_injective = _reg.schedule_injective
 schedule_broadcast = _reg.schedule_injective
 
-
-_reg.register_schedule("squeeze", schedule_injective)
+_reg.register_schedule("collapse_sum_like", _schedule_reduce)
+_reg.register_schedule("broadcast_to_like", schedule_broadcast)
 _reg.register_schedule("expand_dims", schedule_broadcast)
+_reg.register_schedule("squeeze", schedule_injective)
 _reg.register_schedule("reshape", schedule_injective)
 _reg.register_schedule("reshape_like", schedule_injective)
 _reg.register_schedule("full", schedule_injective)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index cc5b5c6a9fbc..4a052881d7bf 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -9,6 +9,8 @@
 #include <tvm/ir.h>
 #include <topi/transform.h>
 #include <topi/elemwise.h>
+#include <topi/broadcast.h>
+#include <topi/reduction.h>
 #include <vector>
 #include "../op_common.h"
 #include "../../../arithmetic/compute_expr.h"
@@ -1017,6 +1019,15 @@ Expr MakeCollapseSumLike(Expr data,
   return CallNode::make(op, {data, collapse_type}, Attrs(), {});
 }
 
+Array<Tensor> CollapseSumLikeCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::collapse_sum(inputs[0], out_ttype->shape) };
+}
+
 TVM_REGISTER_API("relay.op._make.collapse_sum_like")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
     runtime::detail::unpack_call<Expr, 2>(MakeCollapseSumLike, args, rv);
@@ -1029,7 +1040,9 @@ RELAY_REGISTER_OP("collapse_sum_like")
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("collapse_type", "Tensor", "Provide the type to collapse to.")
 .set_support_level(10)
-.add_type_rel("CollapseSumLike", CollapseSumLikeRel);
+.add_type_rel("CollapseSumLike", CollapseSumLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", CollapseSumLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
 
 // BroadCastToLike: <A, B> -> B where BroadCast(A, B) = B
 bool BroadCastToLikeRel(const Array<Type>& types,
@@ -1047,6 +1060,15 @@ Expr MakeBroadCastToLike(Expr data,
   return CallNode::make(op, {data, broadcast_type}, Attrs(), {});
 }
 
+Array<Tensor> BroadCastToLikeCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::broadcast_to(inputs[0], out_ttype->shape) };
+}
+
 TVM_REGISTER_API("relay.op._make.broadcast_to_like")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
     runtime::detail::unpack_call<Expr, 2>(MakeBroadCastToLike, args, rv);
@@ -1059,7 +1081,9 @@ RELAY_REGISTER_OP("broadcast_to_like")
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("broadcast_type", "Tensor", "Provide the type to broadcast to.")
 .set_support_level(10)
-.add_type_rel("BroadCastToLike", BroadCastToLikeRel);
+.add_type_rel("BroadCastToLike", BroadCastToLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", BroadCastToLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 
 // strided_slice
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index ef1c57d263fa..5d65691a2ad5 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -6,19 +6,44 @@
 from tvm.relay.testing import ctx_list
 
 def test_collapse_sum_like():
-    x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8"))
-    y = relay.Var("y", relay.ty.TensorType((4, 1, 6), "int8"))
+    shape = (3, 4, 5, 6)
+    shape_like = (4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
     z = relay.collapse_sum_like(x, y)
     zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType((4, 1, 6), "int8")
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
 
+    func = relay.Function([x, y], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape_like).astype(dtype)
+    ref_res = np.sum(x, 0)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 def test_broadcast_to_like():
-    x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8"))
-    y = relay.Var("y", relay.ty.TensorType((4, 1, 6), "int8"))
-    z = relay.broadcast_to_like(y, x)
+    shape = (4, 1, 6)
+    shape_like = (3, 4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
+    z = relay.broadcast_to_like(x, y)
     zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType((3, 4, 5, 6), "int8")
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
+
+    func = relay.Function([x, y], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape_like).astype(dtype)
+    ref_res = np.broadcast_to(x, shape_like)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def np_slice_like(np_data, np_shape_like, axis=None):

From 618f6b183cd6e56b658486039f48778e9d0dc623 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 29 Nov 2018 23:37:19 +0530
Subject: [PATCH 431/529] [RELAY]missing schedules updated (#2196)

---
 python/tvm/relay/op/_tensor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 774e091baefc..75ea3da8af80 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -20,6 +20,7 @@
 register_schedule("abs", schedule_broadcast)
 register_schedule("tanh", schedule_broadcast)
 register_schedule("negative", schedule_broadcast)
+register_schedule("copy", schedule_broadcast)
 
 register_schedule("add", schedule_broadcast)
 register_schedule("subtract", schedule_broadcast)
@@ -33,7 +34,7 @@
 register_schedule("less_equal", schedule_broadcast)
 register_schedule("greater", schedule_broadcast)
 register_schedule("greater_equal", schedule_broadcast)
-register_schedule("maximum_compute", schedule_injective)
+register_schedule("maximum", schedule_injective)
 register_schedule("minimum", schedule_injective)
 register_schedule("right_shift", schedule_injective)
 register_schedule("left_shift", schedule_injective)

From c7495072bacb87c19b5128f4b0332492ba81506c Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Thu, 29 Nov 2018 21:07:53 +0300
Subject: [PATCH 432/529] [TVM] Fix segfault for CanonicalSimplify(x % -1)
 (#2194)

---
 src/arithmetic/canonical.cc                  | 10 +++++++++-
 tests/python/unittest/test_arith_simplify.py | 10 ++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index 0fa7b846cf7e..8f913ccd4350 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -515,7 +515,15 @@ class Canonical::Internal : public IRMutator {
         n->elem.push_back(e);
       }
       Expr ret = Sum2Expr(ComExpr(n), v.type()) % v;
-      return Binary(ret.as<Mod>(), ret);
+      if (const Mod* mod = ret.as<Mod>()) {
+        return Binary(mod, ret);
+      } else {
+        // Sometimes the result is a constant, this may happen when value is -1
+        CHECK(is_const(ret)) << "CanonicalSimplify: "
+          << Sum2Expr(ComExpr(n), v.type()) << " % " << v << " is " << ret
+          << " which is neither Mod, nor a constant";
+        return ret;
+      }
     }
     ret_entry_.sum = pair[1];
     ret_entry_.max_level = stack_.back().max_level;
diff --git a/tests/python/unittest/test_arith_simplify.py b/tests/python/unittest/test_arith_simplify.py
index e6689dddf9d0..e9315eda3257 100644
--- a/tests/python/unittest/test_arith_simplify.py
+++ b/tests/python/unittest/test_arith_simplify.py
@@ -20,6 +20,16 @@ def test_simplify():
     zz = zz.a
     assert zz.a == x and zz.b.value == 4
 
+    n = tvm.var('n')
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % (-1)), tvm.const(0))
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % 1), tvm.const(0))
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n / 1), n)
+    tvm.ir_pass.CanonicalSimplify(n / (-1))
+    # This is not true in the current implementation
+    #  assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n / (-1)),
+    #                           tvm.ir_pass.CanonicalSimplify(-n))
+
+
 def test_simplify_mod():
     """Not yet working, mock design"""
     ib = tvm.ir_builder.create()

From e7307d615202d769cdde3c74968ee1e3e281a840 Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Fri, 30 Nov 2018 02:19:25 +0800
Subject: [PATCH 433/529] Remove redundant item from langref/relay_op.rst
 (#2192)

Remove redundant item `tvm.relay.sigmoid` from langref/relay_op.rst
---
 docs/langref/relay_op.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index e7fda319cb9c..f4a65023ee53 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -36,7 +36,6 @@ This level enables fully connected multi-layer perceptron.
    tvm.relay.divide
    tvm.relay.mod
    tvm.relay.tanh
-   tvm.relay.sigmoid
    tvm.relay.nn.relu
    tvm.relay.nn.dropout
    tvm.relay.nn.batch_norm
@@ -159,7 +158,6 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.divide
 .. autofunction:: tvm.relay.mod
 .. autofunction:: tvm.relay.tanh
-.. autofunction:: tvm.relay.sigmoid
 .. autofunction:: tvm.relay.concatenate
 .. autofunction:: tvm.relay.nn.softmax
 .. autofunction:: tvm.relay.nn.log_softmax

From c754a79bf404545cf2144a50ed30a8a9ab29ab64 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Fri, 30 Nov 2018 02:32:36 +0800
Subject: [PATCH 434/529] Fix logging in autotvm record (#2195)

---
 python/tvm/autotvm/record.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 3135e5c58f3d..5adfae465ce3 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -271,7 +271,7 @@ def pick_best(in_file, out_file):
     parser.add_argument("--code", action='store_true')
 
     args = parser.parse_args()
-    logging.basicConfig(level=logger.INFO)
+    logging.basicConfig(level=logging.INFO)
 
     if args.mode == 'pick':
         args.o = args.o or args.i + ".best.log"

From 60018945e316e6893114e1360e7c95a550dd64f9 Mon Sep 17 00:00:00 2001
From: Pratyush Patel <pratyushpatel.1995@gmail.com>
Date: Thu, 29 Nov 2018 13:17:32 -0800
Subject: [PATCH 435/529] fix llvm dependency bug (#2198)

---
 python/tvm/contrib/cc.py | 2 --
 python/tvm/module.py     | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 0361f594de6a..0ffa6c420243 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -7,7 +7,6 @@
 
 from .._ffi.base import py_str
 from .util import tempdir
-from .._ffi.libinfo import find_include_path
 
 
 def create_shared(output,
@@ -50,7 +49,6 @@ def _linux_shared(output, objects, options, cc="g++"):
         cmd += objects
     if options:
         cmd += options
-    cmd += ["-I" + path for path in find_include_path()]
     proc = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
diff --git a/python/tvm/module.py b/python/tvm/module.py
index cd919722e681..79a1fab45570 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -6,6 +6,7 @@
 
 from ._ffi.function import ModuleBase, _set_class_module
 from ._ffi.function import _init_api
+from ._ffi.libinfo import find_include_path
 from .contrib import cc as _cc, tar as _tar, util as _util
 
 ProfileResult = namedtuple("ProfileResult", ["mean", "results"])
@@ -122,6 +123,8 @@ def export_library(self,
                 fcompile = _tar.tar
             else:
                 fcompile = _cc.create_shared
+        if self.type_key == "c":
+            kwargs.update({'options': ["-I" + path for path in find_include_path()]})
         fcompile(file_name, files, **kwargs)
 
     def time_evaluator(self, func_name, ctx, number, repeat=1):

From ae2c7fda3b169dba756c2f6ab4cdd25a4dfa0b9d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 29 Nov 2018 17:11:15 -0800
Subject: [PATCH 436/529] [Relay] Alter Op Layout (#2150)

* [RELAY] Finish alter op pass

* [RELAY] AlterOpLayout Pass

* fix broadcast operators

* fix broadcast operators

* fix broadcast operators

* Support concatenate

* address comments

* address comments

* add comments

* rebase
---
 3rdparty/HalideIR                             |   2 +-
 include/tvm/relay/attrs/nn.h                  |   5 +
 include/tvm/relay/attrs/transform.h           |  13 +
 include/tvm/relay/expr.h                      |   2 +-
 include/tvm/relay/op_attr_types.h             |  15 +
 include/tvm/relay/pass.h                      |  16 +
 python/tvm/__init__.py                        |   1 +
 python/tvm/attrs.py                           |  40 +++
 python/tvm/relay/base.py                      |  14 +
 python/tvm/relay/build_module.py              |   8 +
 python/tvm/relay/ir_pass.py                   |  36 ++
 python/tvm/relay/op/__init__.py               |   4 +-
 python/tvm/relay/op/_tensor.py                |   9 -
 python/tvm/relay/op/_transform.py             |  18 +-
 python/tvm/relay/op/op.py                     |  22 +-
 python/tvm/relay/op/op_attrs.py               |  14 +
 python/tvm/relay/op/transform.py              |  22 ++
 src/lang/attrs.cc                             |   6 +
 src/relay/op/layout.h                         |  23 +-
 src/relay/op/nn/convolution.cc                |  31 +-
 src/relay/op/nn/nn.cc                         |  19 +-
 src/relay/op/nn/pad.cc                        | 175 +++++-----
 src/relay/op/nn/pooling.cc                    |  39 ++-
 src/relay/op/op_common.h                      |  57 +++-
 src/relay/op/tensor/binary.cc                 |  40 +--
 src/relay/op/tensor/transform.cc              | 146 +++++++-
 src/relay/op/tensor/unary.cc                  |  83 ++---
 src/relay/pass/alter_op_layout.cc             | 312 +++++++++++++++++
 src/relay/pass/alter_op_layout.h              | 119 +++++++
 src/relay/pass/canonicalize_ops.cc            |  46 +++
 src/relay/pass/fold_scale_axis.cc             |   4 +-
 src/relay/pass/forward_rewrite.cc             |  55 ++-
 src/relay/pass/pattern_util.h                 |   2 +-
 .../python/relay/test_pass_alter_op_layout.py | 316 ++++++++++++++++++
 topi/include/topi/nn.h                        |   1 +
 35 files changed, 1498 insertions(+), 217 deletions(-)
 create mode 100644 python/tvm/attrs.py
 create mode 100644 python/tvm/relay/op/op_attrs.py
 create mode 100644 src/relay/pass/alter_op_layout.cc
 create mode 100644 src/relay/pass/alter_op_layout.h
 create mode 100644 src/relay/pass/canonicalize_ops.cc
 create mode 100644 tests/python/relay/test_pass_alter_op_layout.py

diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index e4a4c02764d3..a08e26e5a97f 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit e4a4c02764d37c9c3db0d64c4996651a3ef9513c
+Subproject commit a08e26e5a97f4ef4d566a42f6c78704b3f9c7b8a
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 817ee04bd844..724749368aa9 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -105,6 +105,7 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
   int groups;
   std::string data_layout;
   std::string weight_layout;
+  std::string out_layout;
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Conv2DTransposeAttrs, "relay.attrs.Conv2DTransposeAttrs") {
@@ -139,6 +140,10 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
       .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
                 "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                 "dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("")
+        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                      "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                      "dimensions respectively. Default to be same as input layout.");
     TVM_ATTR_FIELD(out_dtype)
         .set_default(NullValue<DataType>())
         .describe("Output data type, set to explicit type under mixed precision setting");
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 3e56106df0c2..7e614a8cafd4 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -164,6 +164,19 @@ struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
   }
 };
 
+
+struct LayoutTransformAttrs : public tvm::AttrsNode<LayoutTransformAttrs> {
+  std::string src_layout;
+  std::string dst_layout;
+
+  TVM_DECLARE_ATTRS(LayoutTransformAttrs, "relay.attrs.LayoutTransformAttrs") {
+    TVM_ATTR_FIELD(src_layout)
+        .describe("The source layout of the tensor. (e.g. NCHW)");
+    TVM_ATTR_FIELD(dst_layout)
+        .describe("The destination layout of the tensor. (e.g. NCHW16c)");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 469b73a1df10..37c91ffe4ed2 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -459,7 +459,7 @@ inline const TTypeNode* ExprNode::type_as() const {
   static_assert(std::is_base_of<TypeNode, TTypeNode>::value,
                 "TType must be a special case of type");
   CHECK(checked_type_.defined())
-      << "Type inference for this Expr has not completed";
+      << "Type inference for this Expr has not completed. Try to call infer_type pass.";
   const TTypeNode* node = checked_type_.as<TTypeNode>();
   CHECK(node != nullptr)
       << "Expected type to be " << TTypeNode::_type_key
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index 3d9fa56855c3..1f37e9947bb8 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -86,6 +86,21 @@ using FTVMSchedule = runtime::TypedPackedFunc<
            const Array<Tensor>& outs,
            const Target& target)>;
 
+/*!
+ * \brief Alternate the layout of operators or replace the
+ *  operator with other expressions. This function will be invoked
+ *  in AlterOpLayout pass.
+ * \param attrs The attribute of the original node.
+ * \param inputs The input symbols of the original node.
+ * \param tinfos An array of placeholders, use for getting the inferred shape
+ *               and dtype of the inputs.
+ * \return new_expr The modified expression.
+ */
+using FTVMAlterOpLayout = runtime::TypedPackedFunc<
+  Expr(const Attrs& attrs,
+       const Array<Expr>& args,
+       const Array<Tensor>& tinfos)>;
+
 /*!
  * \brief Forward rewriting rule for a specific op.
  *
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 58e160eb4ac9..8fff7016a827 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -8,6 +8,7 @@
 
 #include <tvm/relay/module.h>
 #include <tvm/relay/expr.h>
+#include <tvm/relay/op_attr_types.h>
 #include <string>
 
 namespace tvm {
@@ -173,6 +174,21 @@ Expr ForwardRewrite(const Expr& expr,
                     std::function<NodeRef(const Call&)> fcontext = nullptr,
                     std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
 
+/*!
+ * \brief Apply rewrite rules to rewrite the expr in post DFS order.
+ * \param expr The expression.
+ * \param rewrite_func The rewrite func that will apply to all operators.
+ * \param fcontext Additional callback to provide context argument for each call node.
+ * \param fmulti_ref_trigger Transformation function to be called when
+ *                           an Expr consumed by multiple callers.
+ * \return The rewritten expression.
+ */
+Expr ForwardRewrite(const Expr& expr,
+                    const FForwardRewrite& rewrite_func,
+                    std::function<NodeRef(const Call&)> fcontext = nullptr,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
+
+
 /*! \brief A hashing structure in the style of std::hash. */
 struct StructuralHash {
   /*! \brief Hash a Relay type.
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index e202c5adb967..67dd54d1db4d 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -13,6 +13,7 @@
 from . import schedule
 from . import module
 from . import node
+from . import attrs
 from . import ir_builder
 from . import target
 from . import generic
diff --git a/python/tvm/attrs.py b/python/tvm/attrs.py
new file mode 100644
index 000000000000..529dbcc14c13
--- /dev/null
+++ b/python/tvm/attrs.py
@@ -0,0 +1,40 @@
+""" TVM Attribute module, which is mainly used for defining attributes of operators"""
+from ._ffi.node import NodeBase, register_node as _register_tvm_node
+from ._ffi.function import _init_api
+from . import _api_internal
+
+
+@_register_tvm_node
+class Attrs(NodeBase):
+    """Attribute node, which is mainly use for defining attributes of relay operators.
+
+    Used by function registered in python side, such as compute, schedule and alter_layout.
+    Attrs is passed as the first argument to these functions.
+    """
+    def list_field_info(self):
+        """ Get fields information
+
+        Returns
+        -------
+        infos: list of AttrFieldInfo
+            List of field information
+        """
+        return _api_internal._AttrsListFieldInfo(self)
+
+    def keys(self):
+        """Get list of names in the attribute.
+
+        Returns
+        -------
+        keys : list of str
+            List of keys
+        """
+        fields = self.list_field_info()
+        for field in fields:
+            yield field.name
+
+    def __getitem__(self, item):
+        return self.__getattr__(item)
+
+
+_init_api("tvm.attrs")
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index 83aa4ec2cdd0..f1105fe4f0d9 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -21,6 +21,20 @@ def register_relay_node(type_key=None):
     return _register_tvm_node(type_key)
 
 
+def register_relay_attr_node(type_key=None):
+    """register relay attribute node
+
+    Parameters
+    ----------
+    type_key : str or cls
+        The type key of the node
+    """
+    if not isinstance(type_key, str):
+        return _register_tvm_node(
+            "relay.attrs." + type_key.__name__)(type_key)
+    return _register_tvm_node(type_key)
+
+
 class RelayNode(NodeBase):
     """Base class of all relay node."""
     def astext(self, show_meta_data=True, annotate=None):
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 863ca063137f..2a2cd9f82ecb 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -17,6 +17,7 @@
     "FoldConstant": 2,
     "CombineParallelConv2D": 3,
     "FoldScaleAxis": 3,
+    "AlterOpLayout": 3,
 }
 
 class BuildConfig(object):
@@ -157,6 +158,13 @@ def optimize(func, params=None):
 
     if cfg.pass_enabled("FoldConstant"):
         func = ir_pass.fold_constant(func)
+
+    if cfg.pass_enabled("AlterOpLayout"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.canonicalize_ops(func)
+        func = ir_pass.infer_type(func)
+        func = ir_pass.alter_op_layout(func)
+
     return func
 
 
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 6297e366070f..53fa59cd053d 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -191,6 +191,23 @@ def simplify_inference(expr):
     return _ir_pass.simplify_inference(expr)
 
 
+def canonicalize_ops(expr):
+    """ Canonicalize special operators to basic operators.
+    This can simplify latter analysis. (e.g. Expand bias_add to expand_dims and broadcast_add.)
+
+    Parameters
+    ----------
+    e: tvm.relay.Expr
+        The input Expression
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        An expression without bias_add
+    """
+    return _ir_pass.canonicalize_ops(expr)
+
+
 def dead_code_elimination(expr):
     """ Remove expressions which does not effect the program result (dead code).
 
@@ -321,3 +338,22 @@ def combine_parallel_conv2d(expr):
         Transformed expression
     """
     return _ir_pass.CombineParallelConv2D(expr)
+
+
+def alter_op_layout(expr):
+    """Alternate the layouts of operators or replace primitive operators with
+    other expressions.
+    This pass can be used for computing convolution in custom layouts or
+    other general weight pre-transformation.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression with alternated layout.
+    """
+    return _ir_pass.AlterOpLayout(expr)
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index b32db4c23f3e..4a6dfd9f7335 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -1,7 +1,8 @@
 #pylint: disable=wildcard-import, redefined-builtin
 """Relay core operators."""
 # operator defs
-from .op import get, register, register_schedule, register_compute, Op
+from .op import get, register, register_schedule, register_compute, register_alter_op_layout, \
+    Op
 
 # Operators
 from .reduce import *
@@ -10,6 +11,7 @@
 from . import nn
 from . import image
 from . import vision
+from . import op_attrs
 
 # operator registry
 from . import _tensor
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 75ea3da8af80..d1035ee047e5 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -80,12 +80,3 @@ def clip_compute(attrs, inputs, output_type, target):
     return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)]
 
 register_schedule("clip", schedule_elemwise)
-register_pattern("clip", OpPattern.ELEMWISE)
-
-# concatenate
-@register_compute("concatenate")
-def concatenate_compute(attrs, inputs, output_type, target):
-    return [topi.concatenate(inputs, axis=attrs.axis)]
-
-register_schedule("concatenate", schedule_injective)
-register_pattern("concatenate", OpPattern.INJECTIVE)
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 3093032f9e40..1aaf376a7dc8 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -1,8 +1,10 @@
 """Backend compiler related feature registration"""
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name,unused-argument
 from __future__ import absolute_import
+import topi
 from . import op as _reg
 from ._reduce import _schedule_reduce
+from .op import schedule_injective, OpPattern
 
 schedule_injective = _reg.schedule_injective
 schedule_broadcast = _reg.schedule_injective
@@ -15,10 +17,22 @@
 _reg.register_schedule("reshape_like", schedule_injective)
 _reg.register_schedule("full", schedule_injective)
 _reg.register_schedule("full_like", schedule_injective)
-_reg.register_schedule("cast", schedule_broadcast)
+_reg.register_schedule("cast", schedule_injective)
 _reg.register_schedule("strided_slice", schedule_injective)
 _reg.register_schedule("slice_like", schedule_injective)
 _reg.register_schedule("split", schedule_injective)
 _reg.register_schedule("take", schedule_injective)
 _reg.register_schedule("transpose", schedule_injective)
 _reg.register_schedule("where", schedule_broadcast)
+
+# layout_transform
+_reg.register_schedule("layout_transform", schedule_injective)
+_reg.register_pattern("layout_transform", OpPattern.INJECTIVE)
+
+# concatenate
+@_reg.register_compute("concatenate")
+def concatenate_compute(attrs, inputs, output_type, target):
+    return [topi.concatenate(inputs, axis=attrs.axis)]
+
+_reg.register_schedule("concatenate", schedule_injective)
+_reg.register_pattern("concatenate", OpPattern.INJECTIVE)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index c777a82462c8..dd3af9c44e42 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -107,7 +107,7 @@ def register_schedule(op_name, schedule=None, level=10):
     op_name : str
         The name of the op.
 
-    schedule : function
+    schedule : function (attrs: Attrs, outs: List[Tensor], target: Target) -> sch: Schedule
         The schedule function.
 
     level : int
@@ -124,7 +124,8 @@ def register_compute(op_name, compute=None, level=10):
     op_name : str
         The name of the op.
 
-    compute : function
+    compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, target:Target)
+                       -> List[Tensor]
         The compute function.
 
     level : int
@@ -133,6 +134,23 @@ def register_compute(op_name, compute=None, level=10):
     return register(op_name, "FTVMCompute", compute, level)
 
 
+def register_alter_op_layout(op_name, alter_layout=None, level=10):
+    """Register alter op layout function for an op
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the operator
+
+    alter_layout: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
+        The function for changing the layout or replacing the operator
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMAlterOpLayout", alter_layout, level)
+
+
 def register_pattern(op_name, pattern, level=10):
     """Register operator pattern for an op.
 
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
new file mode 100644
index 000000000000..682d56fb9efc
--- /dev/null
+++ b/python/tvm/relay/op/op_attrs.py
@@ -0,0 +1,14 @@
+"""The attributes node used for Relay operators"""
+
+from ...attrs import Attrs
+from ..base import register_relay_attr_node
+
+@register_relay_attr_node
+class Conv2DAttrs(Attrs):
+    """Attribute of a Convolution Operator"""
+    pass
+
+@register_relay_attr_node
+class GlobalPool2DAttrs(Attrs):
+    """Attribute of a Global 2D Pooling Operator"""
+    pass
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index c5fedab054d2..17caad4bb304 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -387,3 +387,25 @@ def slice_like(data, shape_like, axes=None):
         The computed result.
     """
     return _make.slice_like(data, shape_like, axes)
+
+
+def layout_transform(data, src_layout, dst_layout):
+    """Transform the layout of a tensor
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source tensor to be transformed
+
+    src_layout: str
+        The source layout.  (e.g NCHW)
+
+    dst_layout: str
+        The destination layout.  (e.g. NCHW16c)
+
+    Returns
+    -------
+    ret : relay.Expr
+        The transformed tensor.
+    """
+    return _make.layout_transform(data, src_layout, dst_layout)
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 3b273f4939ef..1daf1e792553 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -3,6 +3,7 @@
  * \file attrs.cc
  */
 #include <tvm/attrs.h>
+#include <tvm/api_registry.h>
 #include "attr_functor.h"
 
 namespace tvm {
@@ -321,4 +322,9 @@ bool DictAttrsNode::ContentEqual(const Node* other, AttrsEqual equal) const {
   return equal(this->dict, static_cast<const DictAttrsNode*>(other)->dict);
 }
 
+TVM_REGISTER_API("_AttrsListFieldInfo")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = args[0].operator Attrs()->ListFieldInfo();
+});
+
 }  // namespace tvm
diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h
index 97160f3cbb9e..90c920bf3aa1 100644
--- a/src/relay/op/layout.h
+++ b/src/relay/op/layout.h
@@ -185,7 +185,7 @@ class Layout : public NodeRef {
         CHECK_GT(block_size, 0);
         new_layout << block_size;
       }
-      new_layout << layout_simplified[i]->value;
+      new_layout << static_cast<char>(layout_simplified[i]->value);
     }
     return Layout(new_layout.str());
   }
@@ -241,6 +241,16 @@ class Layout : public NodeRef {
     return operator->()->layout_simplified.size();
   }
 
+  /*! \return number of super dimensions */
+  size_t ndim_super() const {
+    size_t ct = 0;
+    for (auto x : operator->()->layout_simplified) {
+      if (IsSuperdim(x))
+        ct++;
+    }
+    return ct;
+  }
+
   /*!
    * \brief The description of the \p i-th dimension.
    *        If it is a sub-dimension, the size will be returned as well,
@@ -327,6 +337,17 @@ class Layout : public NodeRef {
     return operator->()->name == rhs->name;
   }
 
+  /*!
+ * \brief allow output string of layout to ostream
+ * \param os the output stream
+ * \param l the layout
+ * \return the ostream
+ */
+  friend std::ostream& operator<<(std::ostream& os, const Layout& l) {
+    os << l.name();
+    return os;
+  }
+
   using ContainerType = LayoutNode;
 
  private:
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index cb648166f7bb..170b6b6d13c5 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -7,11 +7,13 @@
 #include <tvm/relay/attrs/nn.h>
 #include <vector>
 
+#include "../../pass/alter_op_layout.h"
 #include "../layout.h"
 
 namespace tvm {
 namespace relay {
 
+// relay.nn.conv2d
 TVM_REGISTER_NODE_TYPE(Conv2DAttrs);
 
 bool Conv2DRel(const Array<Type>& types,
@@ -101,6 +103,20 @@ bool Conv2DRel(const Array<Type>& types,
   return true;
 }
 
+template<typename T>
+Array<Array<Layout> > Conv2DInferCorrectLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  const T* params = attrs.as<T>();
+  Layout out_layout(params->out_layout);
+
+  // We always make other operators to fit the layouts of convolution layers
+  // So this inference ignores all inputs
+  return Array<Array<Layout> >{{params->data_layout, params->weight_layout},
+                               {out_layout.defined() ? out_layout : params->data_layout}};
+}
 
 // Positional relay function to create conv2d operator
 // used by frontend FFI.
@@ -156,10 +172,11 @@ with the layer input to produce a tensor of outputs.
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("weight", "Tensor", "The weight tensor.")
 .set_support_level(2)
-.add_type_rel("Conv2D", Conv2DRel);
+.add_type_rel("Conv2D", Conv2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Conv2DInferCorrectLayout<Conv2DAttrs>);
 
 
-// Conv2DTranspose
+// relay.nn.conv2d_transpose
 TVM_REGISTER_NODE_TYPE(Conv2DTransposeAttrs);
 
 bool Conv2DTransposeRel(const Array<Type>& types,
@@ -185,6 +202,12 @@ bool Conv2DTransposeRel(const Array<Type>& types,
     << "Conv only support kernel layouts that are convertible from OIHW."
     << " But got "<< kernel_layout;
 
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) out_layout = in_layout;
+  CHECK(out_layout.Convertible(kNCHW))
+    << "Conv only support output layouts that are convertible from NCHW."
+    << " But got " << out_layout;
+
   IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
 
   auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW);
@@ -241,7 +264,7 @@ bool Conv2DTransposeRel(const Array<Type>& types,
   if (out_dtype.bits() == 0) {
     out_dtype = data->dtype;
   }
-  oshape = ConvertLayout(oshape, kNCHW, in_layout);
+  oshape = ConvertLayout(oshape, kNCHW, out_layout);
   reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
   return true;
 }
@@ -307,6 +330,8 @@ v            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
 .add_argument("data", "Tensor", "The input tensor.")
 .add_argument("weight", "Tensor", "The weight tensor.")
 .set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Conv2DInferCorrectLayout<Conv2DTransposeAttrs>)
 .add_type_rel("Conv2DTranspose", Conv2DTransposeRel);
 
 }  // namespace relay
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index d3b454f35ede..7ed43d0df019 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -12,12 +12,14 @@
 #include <topi/nn/flatten.h>
 #include <vector>
 #include "../type_relations.h"
+#include "../../pass/alter_op_layout.h"
 #include "../op_common.h"
 #include "../layout.h"
 
 namespace tvm {
 namespace relay {
 
+// relay.nn.bias_add
 TVM_REGISTER_NODE_TYPE(BiasAddAttrs);
 
 bool BiasAddRel(const Array<Type>& types,
@@ -74,6 +76,7 @@ RELAY_REGISTER_OP("nn.bias_add")
 .add_type_rel("BiasAdd", BiasAddRel);
 
 
+// relay.nn.dense
 TVM_REGISTER_NODE_TYPE(DenseAttrs);
 
 
@@ -143,6 +146,8 @@ RELAY_REGISTER_OP("nn.dense")
 .set_support_level(1)
 .add_type_rel("Dense", DenseRel);
 
+// relay.leaky_relu
+TVM_REGISTER_NODE_TYPE(LeakyReluAttrs);
 
 // Positional relay function to create leaky relu operator used by frontend FFI.
 Expr MakeLeakyRelu(Expr data,
@@ -171,6 +176,7 @@ RELAY_REGISTER_OP("nn.leaky_relu")
 .add_argument("data", "Tensor", "Input data.")
 .set_support_level(3)
 .add_type_rel("Identity", IdentityRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .set_attr<FTVMCompute>(
   "FTVMCompute", [](const Attrs& attrs,
                     const Array<Tensor>& inputs,
@@ -181,6 +187,7 @@ RELAY_REGISTER_OP("nn.leaky_relu")
 });
 
 
+// relay.prelu
 TVM_REGISTER_NODE_TYPE(PReluAttrs);
 
 bool PReluRel(const Array<Type>& types,
@@ -235,6 +242,7 @@ where :math:`*` is an channelwise multiplication for each sample in the batch.
 .add_argument("alpha", "Tensor", "Input channelwise alpha.")
 .set_support_level(3)
 .add_type_rel("PRelu", PReluRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .set_attr<FTVMCompute>(
   "FTVMCompute", [](const Attrs& attrs,
                     const Array<Tensor>& inputs,
@@ -245,6 +253,9 @@ where :math:`*` is an channelwise multiplication for each sample in the batch.
 });
 
 
+// relay.softmax
+TVM_REGISTER_NODE_TYPE(SoftmaxAttrs);
+
 TVM_REGISTER_API("relay.op.nn._make.softmax")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   auto make_func = [](Expr data, int axis) {
@@ -282,6 +293,7 @@ RELAY_REGISTER_OP("nn.softmax")
 });
 
 
+// relay.nn.log_softmax
 TVM_REGISTER_API("relay.op.nn._make.log_softmax")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
   auto make_func = [](Expr data, int axis) {
@@ -321,8 +333,7 @@ RELAY_REGISTER_OP("nn.log_softmax")
 });
 
 
-
-// BatchFlatten
+// relay.nn.batch_flatten
 bool BatchFlattenRel(const Array<Type>& types,
                      int num_inputs,
                      const Attrs& attrs,
@@ -410,6 +421,7 @@ RELAY_REGISTER_OP("nn.relu")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(1)
 .add_type_rel("Identity", IdentityRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
                                          const Array<Tensor>& inputs,
                                          const Type& out_type,
@@ -460,6 +472,7 @@ centered at that value (zero padding is added where necessary).
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .add_type_rel("Identity", IdentityRel);
 
 
@@ -495,6 +508,7 @@ Normalizes along dimension axis using an L2 norm
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .add_type_rel("Identity", IdentityRel);
 
 // Dropout
@@ -538,6 +552,7 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "Input to which dropout will be applied.")
 .set_support_level(1)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .add_type_rel("Dropout", DropoutRel);
 
 // batch_norm
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index 6e02d74e6ea8..5403d0620e50 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -1,87 +1,88 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file pad.cc
- * \brief Implementation of operator pad
- */
-#include <tvm/ir_operator.h>
-#include <tvm/relay/op.h>
-#include <tvm/relay/attrs/nn.h>
-#include <vector>
-#include "../layout.h"
-
-namespace tvm {
-namespace relay {
-
-TVM_REGISTER_NODE_TYPE(PadAttrs);
-
-bool PadRel(const Array<Type>& types,
-            int num_inputs,
-            const Attrs& attrs,
-            const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
-  const auto* data = types[0].as<TensorTypeNode>();
-  if (data == nullptr) return false;
-
-  const PadAttrs* param = attrs.as<PadAttrs>();
-  CHECK(param != nullptr);
-
-  // check that pad widths match lengths
-  CHECK(data->shape.size() == param->pad_width.size())
-    << "There should be as many pad width pairs as shape dimensions "
-    << "but the shape has " << data->shape.size() << " dimensions "
-    << "and there are " << param->pad_width.size() << " pad width pairs.";
-
-  // each pad width element should be a pair of positive integers
-  std::vector<IndexExpr> oshape;
-  for (size_t i = 0; i < param->pad_width.size(); i++) {
-    CHECK(param->pad_width[i].size() == 2)
-      << "Each pad width element should be a pair but at index " << i
-      << " there are " << param->pad_width[i].size() << " elements.";
-
-    auto width1 = as_const_int(param->pad_width[i][0]);
-    auto width2 = as_const_int(param->pad_width[i][1]);
-    CHECK(width1 != nullptr);
-    CHECK(width2 != nullptr);
-
-    CHECK(*width1 >= 0)
-      << "Param width elements should be positive but first pad width at "
-      << "index " << i << " is " << *width1 << ".";
-    CHECK(*width2 >= 0)
-      << "Param width elements should be positive but first pad width at "
-      << "index " << i << " is " << *width2 << ".";
-
-    auto padding = make_const(data->shape[i].type(), *width1 + *width2);
-    oshape.push_back(data->shape[i] + padding);
-  }
-
-  reporter->Assign(types[1], TensorTypeNode::make(Array<IndexExpr>(oshape),
-                                                  data->dtype));
-  return true;
-}
-
-// Handler to create a call to the padding op used by front-end FFI
-Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
-  auto attrs = make_node<PadAttrs>();
-  attrs->pad_value = pad_value;
-  attrs->pad_width = std::move(pad_width);
-  static const Op& op = Op::Get("nn.pad");
-  return CallNode::make(op, {data}, Attrs(attrs), {});
-}
-
-TVM_REGISTER_API("relay.op.nn._make.pad")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 3>(MakePad, args, rv);
-  });
-
-RELAY_REGISTER_OP("nn.pad")
-.describe(R"code(Pad for n-D tensor.
-
-)code" TVM_ADD_FILELINE)
-.set_attrs_type_key("relay.attrs.PadAttrs")
-.set_num_inputs(1)
-.add_argument("data", "Tensor", "The input tensor.")
-.set_support_level(2)
-.add_type_rel("Pad", PadRel);
-
-}  // namespace relay
-}  // namespace tvm
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pad.cc
+ * \brief Implementation of operator pad
+ */
+#include <tvm/ir_operator.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <vector>
+#include "../layout.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.pad
+TVM_REGISTER_NODE_TYPE(PadAttrs);
+
+bool PadRel(const Array<Type>& types,
+            int num_inputs,
+            const Attrs& attrs,
+            const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const PadAttrs* param = attrs.as<PadAttrs>();
+  CHECK(param != nullptr);
+
+  // check that pad widths match lengths
+  CHECK(data->shape.size() == param->pad_width.size())
+    << "There should be as many pad width pairs as shape dimensions "
+    << "but the shape has " << data->shape.size() << " dimensions "
+    << "and there are " << param->pad_width.size() << " pad width pairs.";
+
+  // each pad width element should be a pair of positive integers
+  std::vector<IndexExpr> oshape;
+  for (size_t i = 0; i < param->pad_width.size(); i++) {
+    CHECK(param->pad_width[i].size() == 2)
+      << "Each pad width element should be a pair but at index " << i
+      << " there are " << param->pad_width[i].size() << " elements.";
+
+    auto width1 = as_const_int(param->pad_width[i][0]);
+    auto width2 = as_const_int(param->pad_width[i][1]);
+    CHECK(width1 != nullptr);
+    CHECK(width2 != nullptr);
+
+    CHECK(*width1 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width1 << ".";
+    CHECK(*width2 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width2 << ".";
+
+    auto padding = make_const(data->shape[i].type(), *width1 + *width2);
+    oshape.push_back(data->shape[i] + padding);
+  }
+
+  reporter->Assign(types[1], TensorTypeNode::make(Array<IndexExpr>(oshape),
+                                                  data->dtype));
+  return true;
+}
+
+// Handler to create a call to the padding op used by front-end FFI
+Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
+  auto attrs = make_node<PadAttrs>();
+  attrs->pad_value = pad_value;
+  attrs->pad_width = std::move(pad_width);
+  static const Op& op = Op::Get("nn.pad");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.pad")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakePad, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.pad")
+.describe(R"code(Pad for n-D tensor.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.PadAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("Pad", PadRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 0af0bbf63633..6233e6d51776 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -9,13 +9,39 @@
 #include <topi/nn/pooling.h>
 #include <vector>
 #include "../layout.h"
+#include "../../pass/alter_op_layout.h"
 
 namespace tvm {
 namespace relay {
 
+// relay.nn.max_pool2d & relay.nn.avg_pool2d
 TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs);
 TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs);
 
+template <typename T>
+Array<Array<Layout> > Pool2DInferCorrectLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  // NOTE: Discard "const" qualifier here.
+  T *params = const_cast<T*>(attrs.as<T>());
+
+  if (new_in_layouts.defined()) {
+    CHECK_EQ(new_in_layouts.size(), 1);
+
+    Layout raw_layout(params->layout);
+    Layout input = new_in_layouts[0];
+    if (input.Indexof('W') == raw_layout.Indexof('W') &&
+        input.Indexof('H') == raw_layout.Indexof('H') &&
+        !input.Contains('w') && !input.Contains('h')) {
+      params->layout = input.name();  // modify self to follow the input layout
+    }
+  }
+
+  return Array<Array<Layout> >{{params->layout}, {params->layout}};
+}
+
 template <typename AttrType>
 bool Pool2DRel(const Array<Type>& types,
                int num_inputs,
@@ -163,6 +189,7 @@ RELAY_REGISTER_OP("nn.max_pool2d")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
 .add_type_rel("MaxPool2D", Pool2DRel<MaxPool2DAttrs>)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Pool2DInferCorrectLayout<MaxPool2DAttrs>)
 .set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<MaxPool2DAttrs, topi::nn::kMaxPool>);
 
 
@@ -219,9 +246,10 @@ Average pooling operation for one dimensional data.
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
 .add_type_rel("AvgPool2D", Pool2DRel<AvgPool2DAttrs>)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Pool2DInferCorrectLayout<AvgPool2DAttrs>)
 .set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<AvgPool2DAttrs, topi::nn::kAvgPool>);
 
-// Global Pool
+// relay.nn.global_pool_2d & relay.nn.max_pool_2d
 TVM_REGISTER_NODE_TYPE(GlobalPool2DAttrs);
 
 bool GlobalPool2DRel(const Array<Type>& types,
@@ -247,8 +275,9 @@ bool GlobalPool2DRel(const Array<Type>& types,
 
   const auto hidx = layout.Indexof('H');
   const auto widx = layout.Indexof('W');
-  std::vector<IndexExpr> oshape({dshape[0], dshape[1], dshape[2], dshape[3]});
-  oshape[hidx] = oshape[widx] = 1;
+  Array<IndexExpr> oshape(dshape);
+  oshape.Set(hidx, 1);
+  oshape.Set(widx, 1);
 
   // assign output type
   reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
@@ -307,6 +336,8 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
 .add_type_rel("GlobalAvgPool2D", GlobalPool2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Pool2DInferCorrectLayout<GlobalPool2DAttrs>)
 .set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kAvgPool>);
 
 // GlobalMaxPool
@@ -338,6 +369,8 @@ RELAY_REGISTER_OP("nn.global_max_pool2d")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
 .add_type_rel("GlobalMaxPool2D", GlobalPool2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Pool2DInferCorrectLayout<GlobalPool2DAttrs>)
 .set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kMaxPool>);
 
 }  // namespace relay
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
index 5bb2f24cae81..36cd04931903 100644
--- a/src/relay/op/op_common.h
+++ b/src/relay/op/op_common.h
@@ -11,6 +11,7 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/op_attr_types.h>
 #include <vector>
+#include "../pass/alter_op_layout.h"
 
 namespace tvm {
 namespace relay {
@@ -32,21 +33,24 @@ inline std::vector<T> AsVector(const Array<T> &array) {
  * We make the decision to always only expose positional argument.
  * We will do rewrapping in the frontend to support language
  * sugars such as keyword arguments and default value.
- *
- * \param Prefix the prefix of the registry, for example, "relay.op._make.".
- *
+
  * \param OpName the name of registry.
  */
-#define RELAY_REGISTER_UNARY_OP(Prefix, OpName)           \
-  TVM_REGISTER_API(Prefix OpName)                         \
-    .set_body_typed<Expr(Expr)>([](Expr data) {           \
-        static const Op& op = Op::Get(OpName);            \
-        return CallNode::make(op, {data}, Attrs(), {});   \
-      });                                                 \
-  RELAY_REGISTER_OP(OpName)                               \
-    .set_num_inputs(1)                                    \
-    .add_argument("data", "Tensor", "The input tensor.")  \
-    .set_attr<TOpPattern>("TOpPattern", kElemWise)
+#define RELAY_REGISTER_UNARY_OP(OpName)                     \
+  TVM_REGISTER_API("relay.op._make." OpName)                \
+    .set_body_typed<Expr(Expr)>([](Expr data) {             \
+        static const Op& op = Op::Get(OpName);              \
+        return CallNode::make(op, {data}, Attrs(), {});     \
+      });                                                   \
+  RELAY_REGISTER_OP(OpName)                                 \
+    .set_num_inputs(1)                                      \
+    .add_argument("data", "Tensor", "The input tensor.")    \
+    .add_type_rel("Identity", IdentityRel)                  \
+    .set_attr<TOpPattern>("TOpPattern", kElemWise)          \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)        \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",   \
+                                   ElemwiseArbitraryLayout) \
+
 
 /*! Quick helper macro
  * - Expose a positional make function to construct the node.
@@ -56,12 +60,10 @@ inline std::vector<T> AsVector(const Array<T> &array) {
  * We will do rewrapping in the frontend to support language
  * sugars such as keyword arguments and default value.
  *
- * \param Prefix the prefix of the registry, for example, "relay.op._make.".
- *
  * \param OpName the name of registry.
  */
-#define RELAY_REGISTER_BINARY_OP(Prefix, OpName)                  \
-  TVM_REGISTER_API(Prefix OpName)                                 \
+#define RELAY_REGISTER_BINARY_OP(OpName)                          \
+  TVM_REGISTER_API("relay.op._make." OpName)                      \
     .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {    \
         static const Op& op = Op::Get(OpName);                    \
         return CallNode::make(op, {lhs, rhs}, Attrs(), {});       \
@@ -72,7 +74,26 @@ inline std::vector<T> AsVector(const Array<T> &array) {
     .add_argument("rhs", "Tensor", "The right hand side tensor.") \
     .add_type_rel("Broadcast", BroadcastRel)                      \
     .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
-    .set_attr<TOpIsStateful>("TOpIsStateful", false)
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)              \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",         \
+                                   BinaryBroadcastLayout)
+
+// Comparisons
+#define RELAY_REGISTER_CMP_OP(OpName)                             \
+  TVM_REGISTER_API("relay.op._make." OpName)                      \
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {      \
+    static const Op& op = Op::Get(OpName);                        \
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});           \
+  });                                                             \
+  RELAY_REGISTER_OP(OpName)                                       \
+    .set_num_inputs(2)                                            \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.") \
+    .add_type_rel("BroadcastComp", BroadcastCompRel)              \
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)              \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",         \
+                                   BinaryBroadcastLayout)
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
index 3f28bd52cd4b..da9b1af87578 100644
--- a/src/relay/op/tensor/binary.cc
+++ b/src/relay/op/tensor/binary.cc
@@ -23,71 +23,65 @@ namespace relay {
 
 
 // Addition
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "add")
+RELAY_REGISTER_BINARY_OP("add")
 .describe("Elementwise add with with broadcasting")
 .set_support_level(1)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::add));
 
 // Subtraction
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "subtract")
+RELAY_REGISTER_BINARY_OP("subtract")
 .describe("Elementwise substract with broadcasting")
 .set_support_level(1)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::subtract));
 
 // Right shift
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "right_shift")
+RELAY_REGISTER_BINARY_OP("right_shift")
 .describe("Elementwise right shift with broadcasting")
 .set_support_level(4)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::right_shift));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "left_shift")
+
+RELAY_REGISTER_BINARY_OP("left_shift")
 .describe("Elementwise left shift with broadcasting")
 .set_support_level(4)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::left_shift));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "maximum")
+
+RELAY_REGISTER_BINARY_OP("maximum")
 .describe("Elementwise maximum of two tensors with broadcasting")
 .set_support_level(4)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::maximum));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "minimum")
+
+RELAY_REGISTER_BINARY_OP("minimum")
 .describe("Elementwise minimum of two tensors with broadcasting")
 .set_support_level(4)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::minimum));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "divide")
+
+RELAY_REGISTER_BINARY_OP("divide")
 .describe("Elementwise divide with broadcasting")
 .set_support_level(1)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::divide));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "multiply")
+
+RELAY_REGISTER_BINARY_OP("multiply")
 .describe("Elementwise multiply with broadcasting")
 .set_support_level(1)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::multiply));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "power")
+
+RELAY_REGISTER_BINARY_OP("power")
 .describe("Elementwise power with broadcasting")
 .set_support_level(4)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::power));
 
-RELAY_REGISTER_BINARY_OP("relay.op._make.", "mod")
+
+RELAY_REGISTER_BINARY_OP("mod")
 .describe("Elementwise mod with broadcasting")
 .set_support_level(1)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::mod));
 
-// Comparisons
-#define RELAY_REGISTER_CMP_OP(OpName)                               \
-  TVM_REGISTER_API("relay.op._make." OpName)                        \
-  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {        \
-    static const Op& op = Op::Get(OpName);                          \
-    return CallNode::make(op, {lhs, rhs}, Attrs(), {});             \
-  });                                                               \
-  RELAY_REGISTER_OP(OpName)                                         \
-    .set_num_inputs(2)                                              \
-    .add_argument("lhs", "Tensor", "The left hand side tensor.")    \
-    .add_argument("rhs", "Tensor", "The right hand side tensor.")   \
-    .add_type_rel("BroadcastComp", BroadcastCompRel)                \
-    .set_attr<TOpPattern>("TOpPattern", kBroadcast)
 
 RELAY_REGISTER_CMP_OP("equal")
 .describe("Elementwise equal compare with broadcasting")
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 4a052881d7bf..fcf7f6fe3299 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -11,9 +11,12 @@
 #include <topi/elemwise.h>
 #include <topi/broadcast.h>
 #include <topi/reduction.h>
+#include <topi/nn.h>
 #include <vector>
 #include "../op_common.h"
 #include "../../../arithmetic/compute_expr.h"
+#include "../../pass/alter_op_layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -156,6 +159,7 @@ RELAY_REGISTER_OP("expand_dims")
 .set_attr<FTVMCompute>("FTVMCompute", ExpandDimsCompute)
 .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
+// relay.concatenate
 TVM_REGISTER_NODE_TYPE(ConcatenateAttrs);
 
 bool ConcatenateRel(const Array<Type>& types,
@@ -201,6 +205,42 @@ bool ConcatenateRel(const Array<Type>& types,
   return true;
 }
 
+Array<Array<Layout>> ConcatenateLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  const ConcatenateAttrs* param = attrs.as<ConcatenateAttrs>();
+
+  size_t axis = param->axis < 0 ? param->axis + old_in_shapes[0].size() :
+                static_cast<size_t>(param->axis);
+
+  Layout ret;
+  if (new_in_layouts.defined()) {  // this function is called after some operators are alternated.
+    Layout::LayoutDim concate_dim = old_in_layouts[0][axis];
+    for (size_t i = 0; i < new_in_layouts.size(); ++i) {
+      if (new_in_layouts[i].ndim() > axis &&
+          new_in_layouts[i][axis] == concate_dim) {
+        ret = new_in_layouts[i];
+        break;
+      }
+    }
+  } else {  // this function is called on the original correct relay ir
+    for (size_t i = 0; i < old_in_layouts.size(); ++i) {
+      if (old_in_layouts[i].defined()) {
+        ret = old_in_layouts[i];
+        break;
+      }
+    }
+
+    if (ret.ndim() <= axis || Layout::IsSubdim(ret[axis])) {
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+  }
+
+  return Array<Array<Layout> > {Array<Layout>(old_in_layouts.size(), ret), {ret}};
+}
+
 Expr MakeConcatenate(Expr data,
                      int axis) {
   auto attrs = make_node<ConcatenateAttrs>();
@@ -226,7 +266,8 @@ RELAY_REGISTER_OP("concatenate")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input list of tensors.")
 .set_support_level(1)
-.add_type_rel("Concatenate", ConcatenateRel);
+.add_type_rel("Concatenate", ConcatenateRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConcatenateLayout);
 
 /* relay.transpose */
 TVM_REGISTER_NODE_TYPE(TransposeAttrs);
@@ -323,7 +364,6 @@ RELAY_REGISTER_OP("transpose")
 .set_attr<TOpPattern>("TOpPattern", kInjective);
 
 /* relay.reshape */
-
 TVM_REGISTER_NODE_TYPE(ReshapeAttrs);
 
 bool ReshapeRel(const Array<Type>& types,
@@ -1252,7 +1292,7 @@ Examples::
 .set_attr<TOpPattern>("TOpPattern", kInjective);
 
 
-// Split
+// relay.split
 TVM_REGISTER_NODE_TYPE(SplitAttrs);
 
 bool SplitRel(const Array<Type>& types,
@@ -1367,6 +1407,7 @@ the entries indicate where along axis the array is split.
 .set_attr<TOpPattern>("TOpPattern", kInjective);
 
 
+// relay.slice_like
 TVM_REGISTER_NODE_TYPE(SliceLikeAttrs);
 
 /*!
@@ -1513,5 +1554,104 @@ RELAY_REGISTER_OP("slice_like")
 .set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute)
 .set_attr<TOpPattern>("TOpPattern", kInjective);
 
+
+// relay.layout_transform
+Array<Tensor> LayoutTransformCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const LayoutTransformAttrs *param = attrs.as<LayoutTransformAttrs>();
+  CHECK(param != nullptr);
+
+  Layout src_layout(param->src_layout);
+  Layout dst_layout(param->dst_layout);
+
+  if (src_layout.Equals(dst_layout)) {
+    return Array<Tensor>{ inputs[0] };
+  }
+
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "cannot convert from/to undefined layout";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from " << param->src_layout << " to " << param->dst_layout;
+
+  const auto& out_shape = ConvertLayout(inputs[0]->shape, src_layout, dst_layout);
+  return Array<Tensor> {
+      topi::layout_transform(inputs[0], out_shape, [&](const Array<tvm::Var>& dst_indices) {
+        std::vector<tvm::Expr> dst_to_src_indices;
+        for (size_t i = 0; i < src_layout.ndim(); ++i) {
+          Layout::LayoutDim src_axis = src_layout[i];
+          int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_axis));
+          int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_axis));
+          int32_t src_factor = static_cast<int32_t>(src_layout.Subsizeof(src_axis));
+          int32_t dst_factor = static_cast<int32_t>(dst_layout.Subsizeof(src_axis));
+
+          tvm::Expr src_index(dst_indices[dst_major_pos]);
+          if (dst_minor_pos >= 0) {
+            CHECK_GT(dst_factor, 0);
+            src_index = src_index * dst_factor + dst_indices[dst_minor_pos];
+          }
+          if (Layout::IsSuperdim(src_axis) && src_factor > 0) {
+            src_index = src_index / src_factor;
+          } else if (Layout::IsSubdim(src_axis) && src_factor > 0) {
+            src_index = src_index % src_factor;
+          }
+          dst_to_src_indices.push_back(src_index);
+        }
+        return Array<tvm::Expr>(dst_to_src_indices);
+      })
+  };
+}
+
+bool LayoutTransformRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  const LayoutTransformAttrs* params = attrs.as<LayoutTransformAttrs>();
+
+  Layout src_layout(params->src_layout);
+  Layout dst_layout(params->dst_layout);
+
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "cannot convert from/to undefined layout";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from " << params->src_layout << " to " << params->dst_layout;
+
+  const auto& out_shape = ConvertLayout(data->shape, src_layout, dst_layout);
+  reporter->Assign(types[1], TensorTypeNode::make(out_shape, data->dtype));
+  return true;
+}
+
+Expr MakeLayoutTransform(Expr data,
+                         std::string src_layout,
+                         std::string dst_layout) {
+  auto attrs = make_node<LayoutTransformAttrs>();
+  attrs->src_layout = std::move(src_layout);
+  attrs->dst_layout = std::move(dst_layout);
+  static const Op& op = Op::Get("layout_transform");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.layout_transform")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 3>(MakeLayoutTransform, args, rv);
+});
+
+RELAY_REGISTER_OP("layout_transform")
+.describe(R"code(Transform the input data layout.
+
+For transforming from NCHW to N16cHWC, the `__layout_transform__` operator reshapes
+the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LayoutTransformAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_type_rel("layout_transform", LayoutTransformRel)
+.set_support_level(5)
+.set_attr<FTVMCompute>("FTVMCompute", LayoutTransformCompute);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index fef0302a0507..b83fdacda1ee 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -22,7 +22,7 @@ namespace relay {
   }                                                     \
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "log")
+RELAY_REGISTER_UNARY_OP("log")
 .describe(R"code(Returns the log input array, computed element-wise.
 
 .. math::
@@ -30,11 +30,10 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "log")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::log));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "exp")
+RELAY_REGISTER_UNARY_OP("exp")
 .describe(R"code(Returns the exp input array, computed element-wise.
 
 .. math::
@@ -42,36 +41,30 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "exp")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::exp));
 
-
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "sqrt")
-.describe(R"code(Returns the sqrt input array, computed element-wise.
+RELAY_REGISTER_UNARY_OP("sqrt")
+.describe(R"code(Returns the rsqrt input array, computed element-wise.
 
 .. math::
    sqrt(x)
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sqrt));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "zeros_like")
+RELAY_REGISTER_UNARY_OP("zeros_like")
 .describe(R"code(Returns an array of zeros, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
-.set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.set_support_level(4);
 
-
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "ones_like")
+RELAY_REGISTER_UNARY_OP("ones_like")
 .describe(R"code(Returns an array of ones, with same type and shape as the input.
 )code" TVM_ADD_FILELINE)
-.set_support_level(1)
-.add_type_rel("Identity", IdentityRel);
+.set_support_level(4);
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "sigmoid")
+RELAY_REGISTER_UNARY_OP("sigmoid")
 .describe(R"code(Returns the sigmoid input array, computed element-wise.
 
 .. math::
@@ -79,48 +72,47 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "sigmoid")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sigmoid));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "copy")
+RELAY_REGISTER_UNARY_OP("copy")
 .describe(R"code(Copy a tensor.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::identity));
 
 // relay.clip
 TVM_REGISTER_NODE_TYPE(ClipAttrs);
 
 TVM_REGISTER_API("relay.op._make.clip")
-  .set_body_typed<Expr(Expr, double, double)>([](Expr a, double a_min, double a_max) {
-      auto attrs = make_node<ClipAttrs>();
-      attrs->a_min = a_min;
-      attrs->a_max = a_max;
-      static const Op& op = Op::Get("clip");
-    return CallNode::make(op, {a}, Attrs(attrs), {});
-  });
+.set_body_typed<Expr(Expr, double, double)>([](Expr a, double a_min, double a_max) {
+    auto attrs = make_node<ClipAttrs>();
+    attrs->a_min = a_min;
+    attrs->a_max = a_max;
+    static const Op& op = Op::Get("clip");
+  return CallNode::make(op, {a}, Attrs(attrs), {});
+});
 
 RELAY_REGISTER_OP("clip")
-  .describe(R"code(Clip tensor values.
-  This function takes a tensor, a minimum value `a_min`, and a maximum value `a_max`, and returns a clipped tensor where all values below `a_min` are set to `a_min` and all values above `a_max` are set to `a_max`. `a_min` and `a_max` are cast to the tensor's dtype.
-  )code" TVM_ADD_FILELINE)
-  .set_num_inputs(1)
-  .add_argument("tensor", "Tensor", "The input tensor.")
-  .set_support_level(3)
-  .add_type_rel("Clip", IdentityRel);
-
+.describe(R"code(Clip tensor values.
+This function takes a tensor, a minimum value `a_min`, and a maximum value `a_max`, and returns a clipped tensor where all values below `a_min` are set to `a_min` and all values above `a_max` are set to `a_max`. `a_min` and `a_max` are cast to the tensor's dtype.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_type_rel("Identity", IdentityRel)
+.set_attr<TOpPattern>("TOpPattern", kElemWise)
+.set_attr<TOpIsStateful>("TOpIsStateful", false)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.set_support_level(3);
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "floor")
+RELAY_REGISTER_UNARY_OP("floor")
 .describe(R"code(Returns the floor of input array, computed element-wise.
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::floor));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "ceil")
+RELAY_REGISTER_UNARY_OP("ceil")
 .describe(R"code(Returns the ceil of input array, computed element-wise.
 
 .. math::
@@ -128,11 +120,10 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "ceil")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::ceil));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "trunc")
+RELAY_REGISTER_UNARY_OP("trunc")
 .describe(R"code(Returns the trunc of input array, computed element-wise.
 
 .. math::
@@ -140,11 +131,9 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "trunc")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::trunc));
 
-
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "round")
+RELAY_REGISTER_UNARY_OP("round")
 .describe(R"code(Returns the round of input array, computed element-wise.
 
 .. math::
@@ -152,11 +141,10 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "round")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::round));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "abs")
+RELAY_REGISTER_UNARY_OP("abs")
 .describe(R"code(Returns the abs of input array, computed element-wise.
 
 .. math::
@@ -164,11 +152,10 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "abs")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::abs));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "tanh")
+RELAY_REGISTER_UNARY_OP("tanh")
 .describe(R"code(Returns the tanh of input array, computed element-wise.
 
 .. math::
@@ -176,11 +163,10 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "tanh")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::tanh));
 
 
-RELAY_REGISTER_UNARY_OP("relay.op._make.", "negative")
+RELAY_REGISTER_UNARY_OP("negative")
 .describe(R"code(Returns the numeric negative of input array, computed element-wise.
 
 .. math::
@@ -188,7 +174,6 @@ RELAY_REGISTER_UNARY_OP("relay.op._make.", "negative")
 
 )code" TVM_ADD_FILELINE)
 .set_support_level(3)
-.add_type_rel("Identity", IdentityRel)
 .set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::negative));
 
 }  // namespace relay
diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc
new file mode 100644
index 000000000000..5c4475259086
--- /dev/null
+++ b/src/relay/pass/alter_op_layout.cc
@@ -0,0 +1,312 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file alter_op_layout.cc
+ * \brief Alternate the layouts of operators or replace primitive operators with
+          other expressions. This pass can be used for computing convolution in
+          custom layouts or other general weight pre-transformation.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/tvm.h>
+#include <tuple>
+#include <vector>
+#include <functional>
+#include <string>
+
+#include "alter_op_layout.h"
+
+namespace tvm {
+namespace relay {
+
+namespace alter_op_layout {
+
+// Make a transform CallNode
+Expr TransformLayout(Expr raw, Layout src_layout, Layout dst_layout) {
+  if (src_layout.Equals(dst_layout)) { return raw; }
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "Cannot insert layout transform because there are undefined layouts";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "Cannot insert layout transform because there are inconvertible layouts: "
+    << src_layout << " v.s. " << dst_layout;
+  static auto &transform_op = Op::Get("layout_transform");
+  NodePtr<LayoutTransformAttrs> attrs = make_node<LayoutTransformAttrs>();
+  attrs->src_layout = src_layout.name();
+  attrs->dst_layout = dst_layout.name();
+  Call transform = CallNode::make(transform_op, {raw}, Attrs{attrs});
+  return transform;
+}
+
+// Memorize layout transform so we can reuse internal transformed nodes
+class TransformMemorizerNode : public Node {
+ public:
+  // map from (Expr, src_layout, dst_layout) to transformed Expr
+  using TransformKey = std::tuple<const Node*, std::string, std::string>;
+  struct key_hash : public std::unary_function<TransformKey , std::size_t> {
+    std::size_t operator()(const TransformKey& k) const {
+      return dmlc::HashCombine<std::string>(dmlc::HashCombine<std::string>(
+              std::hash<const Node*>()(std::get<0>(k)), std::get<1>(k)), (std::get<2>(k)));
+    }
+  };
+
+  std::unordered_map<TransformKey, Expr, key_hash> memo;
+  static constexpr const char *_type_key = "relay.alter_op_layout.TransformMemorizerNode";
+  TVM_DECLARE_NODE_TYPE_INFO(TransformMemorizerNode, Node);
+};
+
+class TransformMemorizer : public NodeRef {
+ public:
+  TransformMemorizer() {}
+  explicit TransformMemorizer(NodePtr<Node> n) : NodeRef(n) {}
+
+  TransformMemorizerNode* operator->() {
+    return static_cast<TransformMemorizerNode*>(node_.get());
+  }
+
+  // Transform layout with memorizer
+  Expr Transform(Expr raw, const Layout& src_layout, const Layout& dst_layout) {
+    if (src_layout.Equals(dst_layout)) { return raw; }
+
+    std::tuple<const Node*, std::string, std::string> key =
+        std::make_tuple<>(raw.get(), src_layout.name(), dst_layout.name());
+    auto& memo = operator->()->memo;
+
+    auto iter = memo.find(key);
+    if (iter != memo.end()) {
+      return iter->second;
+    } else {
+      Expr transform = TransformLayout(raw, src_layout, dst_layout);
+      memo[key] = transform;
+      return transform;
+    }
+  }
+
+  using ContainerType = TransformMemorizerNode;
+};
+
+
+// TempExprNode during layout transform
+// Instance of this expr will be Realized to normal expr ultimately
+class LayoutAlternatedExprNode : public TempExprNode {
+ public:
+  Expr value;
+  Layout old_layout;
+  Layout new_layout;
+  TransformMemorizer memorizer;
+
+  Expr Realize() const final {
+    // NOTE: use a copy to discard the "const" qualifier
+    TransformMemorizer tmp_memorizer = memorizer;
+    // fallback to old layout
+    return tmp_memorizer.Transform(value, new_layout, old_layout);
+  }
+
+  void VisitAttrs(AttrVisitor *v) final {
+    v->Visit("value", &value);
+    v->Visit("old_layout", &old_layout);
+    v->Visit("new_layout", &new_layout);
+  }
+
+  static constexpr const char *_type_key = "relay.alter_op_layout.LayoutAlternatedExprNode";
+  TVM_DECLARE_NODE_TYPE_INFO(LayoutAlternatedExprNode, TempExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(LayoutAlternatedExpr, LayoutAlternatedExprNode, TempExpr);
+
+// Call registered FInferCorrectLayout of an op.
+// Parameters are the same as the parameters for FInferCorrectLayout
+// Returns inferred_input_layout, inferred_output_layout, success
+std::tuple<Array<Layout>, Array<Layout>, bool> CallInfer(
+    const Call& call,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr> > &old_in_shapes) {
+  static auto finfer_layout = Op::GetAttr<FInferCorrectLayout>("FInferCorrectLayout");
+
+  Op op = Downcast<Op>(call->op);
+  if (finfer_layout.count(op)) {
+    Array<Array<Layout> > inferred_layouts;
+    inferred_layouts = finfer_layout[op](call->attrs, new_in_layouts,
+                                         old_in_layouts, old_in_shapes);
+    CHECK_EQ(inferred_layouts.size(), 2)
+      << "FInferCorrectLayout should return an array with size of 2";
+    for (auto x : inferred_layouts) {
+      for (auto y : x) {
+        if (!y.defined()) {  // inference fails
+          return std::make_tuple<>(Array<Layout>(nullptr), Array<Layout>(nullptr), false);
+        }
+      }
+    }
+    return std::make_tuple<>(inferred_layouts[0], inferred_layouts[1], true);
+  } else {
+    return std::make_tuple<>(Array<Layout>(nullptr), Array<Layout>(nullptr), false);
+  }
+}
+
+// Call registered FTVMAlterOpLayout of an op
+// Returns the altered expression
+Call CallAlter(const Call& ref_call,
+               const std::vector<Expr>& new_args) {
+  static auto falter_layout = Op::GetAttr<FTVMAlterOpLayout>("FTVMAlterOpLayout");
+  Op op = Downcast<Op>(ref_call->op);
+
+  Expr new_e;
+  bool modified = false;
+  if (falter_layout.count(op)) {
+    tvm::Array<tvm::Tensor> tinfos;
+    for (auto expr : ref_call->args) {
+      auto ttype = expr->type_as<TensorTypeNode>();
+      tinfos.push_back(tvm::placeholder(ttype->shape, ttype->dtype));
+    }
+    Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos);
+    if (altered_value.defined()) {
+      new_e = altered_value;
+      modified = true;
+    }
+  }
+  if (!modified) {
+    new_e = CallNode::make(ref_call->op, new_args,
+                           ref_call->attrs, ref_call->type_args);
+  }
+
+  const CallNode *new_call = new_e.as<CallNode>();
+  CHECK(new_call) << "Can only replace the original operator with another call node";
+  return GetRef<Call>(new_call);
+}
+
+Expr AlterOpLayoutRewrite(const Call &ref_call,
+                          const Array<Expr> &new_args,
+                          const NodeRef& ctx) {
+  std::vector<LayoutAlternatedExpr> inputs;
+  std::vector<Expr> normal_new_args;
+  Array<Array<IndexExpr> > input_shapes;
+
+  // NOTE: discard the "const" qualifier
+  TransformMemorizer memorizer = Downcast<TransformMemorizer>(ctx);
+
+  // fill incomplete state and expand tuple
+  for (auto new_arg : new_args) {
+    auto push_back_one_arg = [&](Expr arg) {
+      // We always expect LayoutAlternatedExpr.
+      // This is used to convert the normal Expr to LayoutAlternatedExpr.
+      if (const LayoutAlternatedExprNode *inp = arg.as<LayoutAlternatedExprNode>()) {
+        inputs.push_back(GetRef<LayoutAlternatedExpr>(inp));
+        normal_new_args.push_back(inp->value);
+      } else {
+        auto inode = make_node<LayoutAlternatedExprNode>();
+        inode->value = arg;
+        inode->memorizer = memorizer;
+        inputs.push_back(LayoutAlternatedExpr(inode));
+        normal_new_args.push_back(arg);
+      }
+    };
+
+    if (new_arg->is_type<TupleNode>()) {
+      Tuple tuple_new_arg = Downcast<Tuple>(new_arg);
+      for (auto x : tuple_new_arg->fields) {
+        push_back_one_arg(x);
+      }
+    } else {
+      push_back_one_arg(new_arg);
+    }
+  }
+
+  // old_in, new_in = state[inputs]
+  Array<Layout> old_in, old_out, new_in, new_out, new_in2;
+  for (auto inp : inputs) {
+    old_in.push_back(inp->old_layout);
+    new_in.push_back(inp->new_layout);
+  }
+
+  for (auto arg : ref_call->args) {
+    if (arg->is_type<TupleNode>()) {  // expand tuple
+      Tuple tuple_arg = Downcast<Tuple>(arg);
+      for (auto x : tuple_arg->fields) {
+        input_shapes.push_back(x->type_as<TensorTypeNode>()->shape);
+      }
+    } else {
+      input_shapes.push_back(arg->type_as<TensorTypeNode>()->shape);
+    }
+  }
+
+  // old_in, old_out = op.infer(old_in)
+  bool success = false;
+  std::tie(old_in, old_out, success) = CallInfer(ref_call,
+                                                 Array<Layout>(nullptr),
+                                                 old_in, input_shapes);
+  if (!success) { return Expr(nullptr); }
+  CHECK_EQ(old_in.size(), new_in.size());
+
+  // if new_in == 'undef':  new_in = old_in
+  for (size_t i = 0; i < new_in.size(); ++i) {
+    if (!new_in[i].defined()) {
+      new_in.Set(i, old_in[i]);
+    }
+  }
+
+  // new_op = alter(op)
+  Call new_call = CallAlter(ref_call, normal_new_args);
+
+  // new_in2, new_out = op.infer(new_in)
+  if (new_call->op->is_type<OpNode>()) {
+    success = false;
+    std::tie(new_in2, new_out, success) = CallInfer(new_call, new_in, old_in, input_shapes);
+    if (!success) { return Expr(nullptr); }
+  } else {
+    return Expr(nullptr);
+  }
+
+  CHECK_EQ(new_out.size(), old_out.size())
+    << "The number of output nodes should keep the same during alter_op_layout";
+  CHECK_EQ(new_in.size(), new_in2.size())
+    << "The number of input nodes should keep the same during alter_op_layout";
+
+  // if (new_in != new_in2): insert transform (new_in -> new_in2)
+  Array<Expr> transformed_args;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    transformed_args.push_back(memorizer.Transform(new_call->args[i], new_in[i], new_in2[i]));
+  }
+
+  // state[node] = (old_out, new_out)
+  CHECK(ref_call->checked_type_.defined())
+    << "Call infer_type pass before alter_op_layout pass";
+
+  if (ref_call->checked_type()->is_type<TupleTypeNode>()) {
+    Expr tuple_output = CallNode::make(new_call->op, transformed_args,
+                                       new_call->attrs, new_call->type_args);
+    Array<Expr> fields;
+    for (size_t i = 0; i < new_out.size(); ++i) {
+      auto rnode = make_node<LayoutAlternatedExprNode>();
+      rnode->value = TupleGetItemNode::make(tuple_output, i);
+      rnode->old_layout = old_out[i];
+      rnode->new_layout = new_out[i];
+      rnode->memorizer = memorizer;
+      fields.push_back(Expr(rnode));
+    }
+    return TupleNode::make(fields);
+  } else {
+    auto rnode = make_node<LayoutAlternatedExprNode>();
+    CHECK_EQ(new_out.size(), 1);
+    rnode->value = CallNode::make(new_call->op, transformed_args,
+                                  new_call->attrs, new_call->type_args);
+    rnode->old_layout = old_out[0];
+    rnode->new_layout = new_out[0];
+    rnode->memorizer = memorizer;
+    return Expr(rnode);
+  }
+}
+
+TVM_REGISTER_API("relay._ir_pass.AlterOpLayout")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  TransformMemorizer transformMemorizer(make_node<TransformMemorizerNode>());
+  auto fcontext = [&](const Call& call) -> NodeRef{
+    return transformMemorizer;
+  };
+
+  *ret = ForwardRewrite(args[0], AlterOpLayoutRewrite, fcontext);
+});
+
+}  // namespace alter_op_layout
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/alter_op_layout.h b/src/relay/pass/alter_op_layout.h
new file mode 100644
index 000000000000..fcb7b379a0ec
--- /dev/null
+++ b/src/relay/pass/alter_op_layout.h
@@ -0,0 +1,119 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file alter_op_layout.h
+ * \brief Alternate the layouts of operators or replace primitive operators with
+          other expressions. This pass can be used for computing convolution in
+          custom layouts or other general weight pre-transformation.
+ */
+
+#ifndef TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
+#define TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
+
+#include <tvm/relay/expr.h>
+
+#include "../op/layout.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Infer & correct function of node layout. See \p Layout for layout convention
+ * \param attrs The attribute of the node.
+ * \param new_in_layouts The layouts of input arguments after alter_op_layout.
+ *                       This can be undefined, which means we call this function before alternating
+ *                       any operators.
+ * \param old_in_layouts The layouts of input arguments before alter_op_layout.
+ * \param old_in_shapes The shapes of old input arguments.
+ * \return infered_layout An array of two elements that are inferred input layouts and
+ *                        inferred output layouts.
+ */
+using FInferCorrectLayout = runtime::TypedPackedFunc<
+    Array<Array<Layout>>(const Attrs& attrs,
+                         const Array<Layout>& new_in_layouts,
+                         const Array<Layout>& old_in_layouts,
+                         const Array<Array<IndexExpr>> &old_in_shapes)>;
+
+/*! \brief take arbitrary input layout and copy to output */
+inline Array<Array<Layout> > ElemwiseArbitraryLayout(const Attrs& attrs,
+                                                     const Array<Layout>& new_in_layouts,
+                                                     const Array<Layout>& old_in_layouts,
+                                                     const Array<Array<IndexExpr>> &old_in_shapes) {
+  Layout ret;
+
+  if (new_in_layouts.defined()) {
+    CHECK_GE(new_in_layouts.size(), 1);
+    ret = new_in_layouts[0];
+  } else {
+    for (size_t i = 0; i < old_in_layouts.size(); ++i) {
+      if (old_in_layouts[i].defined()) {
+        ret = old_in_layouts[i];
+        break;
+      }
+    }
+  }
+
+  return Array<Array<Layout> >{Array<Layout>(old_in_layouts.size(), ret), {ret}};
+}
+
+/*! \brief Infer layout for binary broadcast operators */
+inline Array<Array<Layout> > BinaryBroadcastLayout(const Attrs& attrs,
+                                                   const Array<Layout>& new_in_layouts,
+                                                   const Array<Layout>& old_in_layouts,
+                                                   const Array<Array<IndexExpr>> &old_in_shapes) {
+  Array<Layout> layouts;
+
+  if (new_in_layouts.defined()) {
+    layouts.assign(new_in_layouts.begin(), new_in_layouts.end());
+  } else {
+    layouts.assign(old_in_layouts.begin(), old_in_layouts.end());
+  }
+
+  if (!layouts[0].defined() && !layouts[1].defined()) {
+    // both undefined, infer fails
+    return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+  } else if (!layouts[0].defined() || !layouts[1].defined()) {
+    // only one is defined, use shape information to help infer
+    int defined_idx = layouts[0].defined() ? 0 : 1;
+    int undef_idx = 1 - defined_idx;
+
+    if (old_in_shapes[defined_idx].size() >= old_in_shapes[undef_idx].size()) {
+      layouts.Set(undef_idx,
+                  layouts[defined_idx].Sublayout(
+                      old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(),
+                      old_in_shapes[undef_idx].size()));
+      return Array<Array<Layout> > {layouts, {layouts[defined_idx]}};
+    } else {
+      // only know the tensor with smaller dimensions,
+      // so we cannot infer the final broadcasted output.
+      // fails in this case.
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+  } else {
+    // try to broadcast the tensors to the larger dimension
+    int large_idx = layouts[0].ndim_super() >= layouts[1].ndim_super() ? 0 : 1;
+    int small_idx = 1 - large_idx;
+    Layout ret = layouts[large_idx];
+
+    // extract common part
+    size_t i = layouts[large_idx].ndim();
+    for (; i != 0; --i) {
+      auto dim = layouts[large_idx][i-1];
+      if (!layouts[small_idx].Contains(Layout::ToSuperdim(dim))) {
+        break;
+      }
+    }
+
+    Layout common_part = layouts[large_idx].Sublayout(i, layouts[large_idx].ndim() - i);
+    if (!layouts[small_idx].Convertible(common_part)) {  // fail
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+
+    layouts.Set(small_idx, common_part);
+    return Array<Array<Layout> > {layouts, {ret}};
+  }
+}
+
+}  //  namespace relay
+}  //  namespace tvm
+
+#endif  // TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
diff --git a/src/relay/pass/canonicalize_ops.cc b/src/relay/pass/canonicalize_ops.cc
new file mode 100644
index 000000000000..77cd59e2afd8
--- /dev/null
+++ b/src/relay/pass/canonicalize_ops.cc
@@ -0,0 +1,46 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file canonicalize_ops.cc
+ * \brief Canonicalize special operators to basic operators.
+    This can simplify latter analysis. (e.g. Expand bias_add to expand_dims and broadcast_add.)
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include "pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+class BiasAddSimplifier : public ExprMutator {
+ public:
+  Expr VisitExpr_(const CallNode* n) {
+    static const Op& bias_add = Op::Get("nn.bias_add");
+    auto new_n = ExprMutator::VisitExpr_(n);
+    if (n->op.same_as(bias_add)) {
+      Call call = Downcast<Call>(new_n);
+      CHECK_EQ(call->args.size(), 2);
+      const BiasAddAttrs* param = call->attrs.as<BiasAddAttrs>();
+
+      auto ttype = call->args[0]->type_as<TensorTypeNode>();
+      size_t n_dim = ttype->shape.size();
+      Expr expanded_bias = ExpandBiasToMatchAxis(call->args[1], n_dim, {param->axis});
+      Expr ret = Add(call->args[0], expanded_bias);
+      ret->checked_type_ = n->checked_type_;
+      return ret;
+    }
+    return new_n;
+  }
+};
+
+Expr CanonicalizeOps(const Expr& e) {
+  return BiasAddSimplifier().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.canonicalize_ops")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+*ret = CanonicalizeOps(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index bcb91e7e5737..c56ee98a3969 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -29,11 +29,11 @@ using runtime::TypedPackedFunc;
 // FoldScaleAxis algorithm:
 //
 // The general idea is to transform Expr to tuple of
-// (value, axes, scale), where the final result satiesfies:
+// (value, axes, scale), where the final result satisfies:
 //
 // result = value
 // for i, k in enumerate(axes):
-//    k-ith dimension of result *= i-th dimension of scale
+//    k-th dimension of result *= i-th dimension of scale
 //
 // Then we can propagate this signal along and fold the scale if necessary.
 // However, it is possible that certain scale may never be consumed
diff --git a/src/relay/pass/forward_rewrite.cc b/src/relay/pass/forward_rewrite.cc
index 7873db80c6b0..4f33d4a053b7 100644
--- a/src/relay/pass/forward_rewrite.cc
+++ b/src/relay/pass/forward_rewrite.cc
@@ -42,13 +42,20 @@ class TempRealizer : private ExprMutator {
 
 class ForwardRewriter : private ExprMutator {
  public:
-  ForwardRewriter(const OpMap<FForwardRewrite>& rewrite_map,
+  ForwardRewriter(const OpMap<FForwardRewrite>* rewrite_map,
                   std::function<NodeRef(const Call&)> fcontext,
                   std::function<Expr(const Expr&)> fmulti_ref_trigger)
       : rewrite_map_(rewrite_map),
         fcontext_(fcontext),
-        fmulti_ref_trigger_(fmulti_ref_trigger) {
-  }
+        fmulti_ref_trigger_(fmulti_ref_trigger) {}
+
+  ForwardRewriter(const FForwardRewrite* rewrite_func,
+                  std::function<NodeRef(const Call&)> fcontext,
+                  std::function<Expr(const Expr&)> fmulti_ref_trigger)
+      : rewrite_func_(rewrite_func),
+        fcontext_(fcontext),
+        fmulti_ref_trigger_(fmulti_ref_trigger) {}
+
 
   // Transform expression.
   Expr Rewrite(Expr expr) {
@@ -60,8 +67,9 @@ class ForwardRewriter : private ExprMutator {
 
  private:
   // The rewrite rule.
-  const OpMap<FForwardRewrite>& rewrite_map_;
-  // The context.
+  const OpMap<FForwardRewrite>* rewrite_map_{nullptr};
+  const FForwardRewrite* rewrite_func_{nullptr};
+  // The context.const
   std::function<NodeRef(const Call&)> fcontext_{nullptr};
   // The multiple reference trigger
   std::function<Expr(const Expr&)> fmulti_ref_trigger_{nullptr};
@@ -104,9 +112,31 @@ class ForwardRewriter : private ExprMutator {
     }
   }
 
+  Expr VisitExpr_(const TupleNode* op) final {
+    tvm::Array<Expr> fields;
+    bool all_fields_unchanged = true;
+    for (auto field : op->fields) {
+      auto new_field = this->GetTempExpr(field);
+      fields.push_back(new_field);
+      all_fields_unchanged &= new_field.same_as(field);
+    }
+
+    if (all_fields_unchanged) {
+      return GetRef<Expr>(op);
+    } else {
+      return TupleNode::make(fields);
+    }
+  }
+
   Expr VisitExpr_(const CallNode* call_node) final {
     const Call& ref_call = GetRef<Call>(call_node);
-    PackedFunc frewrite = rewrite_map_.get(call_node->op, nullptr);
+    PackedFunc frewrite;
+    if (rewrite_func_) {
+      frewrite = *rewrite_func_;
+    } else {
+      CHECK(rewrite_map_);
+      frewrite = rewrite_map_->get(call_node->op, nullptr);
+    }
 
     auto new_op = this->Mutate(call_node->op);
     bool unchanged = call_node->op.same_as(new_op);
@@ -147,9 +177,16 @@ Expr ForwardRewrite(const Expr& expr,
                     std::function<NodeRef(const Call&)> fcontext,
                     std::function<Expr(const Expr&)> fmulti_ref_trigger) {
   auto rewrite_map = Op::GetAttr<FForwardRewrite>(rewrite_map_name);
-  return ForwardRewriter(rewrite_map,
-                         fcontext,
-                         fmulti_ref_trigger).Rewrite(expr);
+  return ForwardRewriter(&rewrite_map, fcontext, fmulti_ref_trigger).Rewrite(expr);
+}
+
+Expr ForwardRewrite(const Expr& expr,
+                    const FForwardRewrite& rewrite_func,
+                    std::function<NodeRef(const Call&)> fcontext,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger) {
+  return ForwardRewriter(&rewrite_func, fcontext, fmulti_ref_trigger).Rewrite(expr);
 }
+
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 38ae923c5274..e6e8415bd620 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -73,7 +73,7 @@ inline bool MatchBroadcastToLeftAxes(const TensorTypeNode* tlhs,
  * the target Tensor on the specified axis via broadcasting rule.
  *
  * \param bias The bias.
- * \param target_ndim target dimension.
+ * \param target_ndim Target dimension.
  * \param axes The axis on the output we want to match on.
  */
 inline Expr ExpandBiasToMatchAxis(Expr bias,
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
new file mode 100644
index 000000000000..6a8be7ea847e
--- /dev/null
+++ b/tests/python/relay/test_pass_alter_op_layout.py
@@ -0,0 +1,316 @@
+"""Test alter op layout pass"""
+
+from tvm import relay
+from tvm.relay.op import register_alter_op_layout
+from tvm.relay.ir_pass import *
+
+def test_alter_op():
+    """Test directly replacing an operator with a new one"""
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight = relay.var('weight', shape=(64, 64, 3, 3))
+        y = relay.nn.conv2d(x, weight,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.Function([x, weight], y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=100)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        weight = relay.multiply(weight, relay.const(2.0))
+        return relay.nn.conv2d(data, weight, **attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight = relay.var('weight', shape=(64, 64, 3, 3))
+        y = relay.nn.conv2d(x, relay.multiply(weight, relay.const(2.0)),
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.Function([x, weight], y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_return_none():
+    """Test doing nothing by returning 'None' """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        y = relay.nn.global_max_pool2d(x)
+        y = relay.Function([x], y)
+        return y
+
+    called = [False]
+
+    @register_alter_op_layout("nn.global_max_pool2d", level=101)
+    def alter_conv2d(attrs, inputs, tinfos):
+        called[0] = True
+        return None
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+
+    b = before()
+    b = infer_type(b)
+    assert(alpha_equal(a, b))
+    assert(called[0])
+
+
+def test_alter_layout():
+    """Test alternating the layout of a conv2d.
+    The layout of broadcast operators and the weight should be changed accordingly.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias")
+        weight = relay.var("weight")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
+        y = relay.nn.bias_add(y, bias)
+        # a useless tuple, which will be eliminated
+        y = relay.Tuple([y])[0]
+        y = relay.nn.relu(y)
+        y = relay.nn.batch_flatten(y)
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=102)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        new_attrs['weight_layout'] = 'OIHW16i'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        weight = relay.var("weight", shape=(64, 64, 3, 3))
+
+        y = relay.layout_transform(x, "NCHW", "NCHW16c")
+        w = relay.layout_transform(weight, "OIHW", "OIHW16i")
+        y = relay.nn.conv2d(y, w,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            weight_layout="OIHW16i",
+                            data_layout="NCHW16c")
+        b = relay.expand_dims(bias, axis=1, num_newaxis=2)
+        b = relay.layout_transform(b, "CHW", "CHW16c")
+        y = relay.add(y, b)
+
+        y = relay.nn.relu(y)
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y = relay.nn.batch_flatten(y)
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = canonicalize_ops(a)
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_layout_dual_path():
+    """
+    Test alternating the layout with two outputs.
+    One path continues to use the new layout while one path fall backs to old layout.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y1 = relay.nn.conv2d(y, weight2,
+                             channels=32,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.batch_flatten(y)
+        ret = relay.Tuple([y1, y2])
+        y = relay.Function(free_vars(ret), ret)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=103)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.layout_transform(x, "NCHW", "NCHW16c")
+        y = relay.nn.conv2d(y, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.nn.relu(y)
+        y1 = relay.nn.conv2d(y, weight2,
+                             channels=32,
+                             kernel_size=(3, 3),
+                             padding=(1, 1),
+                             data_layout='NCHW16c')
+        y1 = relay.nn.relu(y1)
+        y1 = relay.layout_transform(y1, "NCHW16c", "NCHW")
+        y2 = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y2 = relay.nn.batch_flatten(y2)
+        ret = relay.Tuple([y1, y2])
+        y = relay.Function(free_vars(ret), ret)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+def test_alter_layout_resnet():
+    """Test alternating the layout of a residual block
+    This also tests the elimination of duplicated transformation.
+    If a same transformation applies to a same node twice, only one transformation will be created.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y2 = relay.nn.conv2d(x, weight2,
+                             channels=32,
+                             kernel_size=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = y + y2
+        y = relay.nn.global_max_pool2d(y)
+        return relay.Function(free_vars(y), y)
+
+    @register_alter_op_layout("nn.conv2d", level=104)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        x = relay.layout_transform(x, "NCHW", "NCHW16c")
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.nn.relu(y)
+        y2 = relay.nn.conv2d(x, weight2,
+                             channels=32,
+                             kernel_size=(1, 1),
+                             data_layout='NCHW16c')
+        y2 = relay.nn.relu(y2)
+        y = y + y2
+        y = relay.nn.global_max_pool2d(y, layout="NCHW16c")
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        return relay.Function(free_vars(y), y)
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_layout_broadcast_op():
+    """Test boradcast operators """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        scale = relay.var("scale", shape=(64, 1, 1))
+        weight = relay.var("weight")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
+        y = relay.nn.bias_add(y, bias) # test broadcasting to lhs
+        y = relay.multiply(scale, y)         # test broadcasting to rhs
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=102)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        scale = relay.var("scale", shape=(64, 1, 1))
+        weight = relay.var("weight")
+        x = relay.layout_transform(x, "NCHW", "NCHW16c")
+        bias = relay.expand_dims(bias, 1, 2)
+        bias = relay.layout_transform(bias, "CHW", "CHW16c")
+        scale = relay.layout_transform(scale, "CHW", "CHW16c")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.add(y, bias)          # test broadcasting to lhs
+        y = relay.multiply(scale, y)      # test broadcasting to rhs
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = canonicalize_ops(a)
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+if __name__ == "__main__":
+    test_alter_op()
+    test_alter_return_none()
+    test_alter_layout()
+    test_alter_layout_dual_path()
+    test_alter_layout_resnet()
+    test_alter_layout_broadcast_op()
+
diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index 5fc05162f09b..9d3e675d8ef7 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -448,6 +448,7 @@ inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I,
 }
 
 using FLayoutIndicesTransform = std::function<Array<Expr>(const Array<Var>& indices)>;
+
 /*!
  * \brief Transform the layout according to the mapping function \p to_src_indices.
  * \param src the source input.

From 6ddfab90d511f6be5e1a0f5bc3b2c80414102a0b Mon Sep 17 00:00:00 2001
From: Denis Khalikov <dennis.khalikov@gmail.com>
Date: Fri, 30 Nov 2018 05:31:59 +0300
Subject: [PATCH 437/529] [PASS] InstrumentBoundCheckers pass (#2079)

The pass which instruments checkers before
memory accesses (load/store).
This allows to handle invalid memory accesses.

The patch is related to issue:
https://discuss.tvm.ai/t/array-bounds-checking/944
---
 include/tvm/build_module.h                    |   4 +
 include/tvm/ir.h                              |   2 +
 include/tvm/ir_pass.h                         |  11 +-
 python/tvm/build_module.py                    |   8 +-
 src/api/api_pass.cc                           |  10 +-
 src/codegen/build_module.cc                   |   6 +-
 src/pass/bound_checker.cc                     | 195 +++++++
 src/pass/storage_flatten.cc                   |  68 ++-
 tests/python/unittest/test_codegen_llvm.py    |  25 +
 .../unittest/test_pass_bound_checkers.py      | 544 ++++++++++++++++++
 10 files changed, 862 insertions(+), 11 deletions(-)
 create mode 100644 src/pass/bound_checker.cc
 create mode 100644 tests/python/unittest/test_pass_bound_checkers.py

diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index ba340166339b..8bb3345a5eb3 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -220,6 +220,9 @@ class BuildConfigNode : public Node {
   /*! \brief Whether to dump the IR of each pass (only when building from python) */
   bool dump_pass_ir = false;
 
+  /*! \brief Whether to instrument loads and stores with check for out of the bounds. */
+  bool instrument_bound_checkers = false;
+
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("data_alignment", &data_alignment);
     v->Visit("offset_factor", &offset_factor);
@@ -232,6 +235,7 @@ class BuildConfigNode : public Node {
     v->Visit("detect_global_barrier", &detect_global_barrier);
     v->Visit("partition_const_loop", &partition_const_loop);
     v->Visit("dump_pass_ir", &dump_pass_ir);
+    v->Visit("instrument_bound_checkers", &instrument_bound_checkers);
   }
 
   static constexpr const char* _type_key = "BuildConfig";
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 212234303c61..adaffa77dae6 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -206,6 +206,8 @@ constexpr const char* scan_init_scope = "scan_init_scope";
  *  This gives hint to require stride of dim to be k * align + offset.
  */
 constexpr const char* buffer_dim_align = "buffer_dim_align";
+/*! \brief Mark stores/loads with theirs bounds.  */
+constexpr const char* buffer_bound = "buffer_bound";
 /*!
  * \brief Bind the buffer specification to the region of the op
  *  When this scope occurs, the stmt.node is a Array<NodeRef> = [buffer, tensor]
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 332becb7aa38..68bfe53407c8 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -181,11 +181,13 @@ Stmt Inline(Stmt stmt,
  * \param extern_buffer Map specifies external
  *    buffer assignment of input and outputs.
  * \param cache_line_size The size of CPU cache line.
+ * \param create_bound_attribute Whether to create bound attributes.
  * \return Transformed stmt.
  */
 Stmt StorageFlatten(Stmt stmt,
                     Map<Tensor, Buffer> extern_buffer,
-                    int cache_line_size);
+                    int cache_line_size,
+                    bool create_bound_attribute = false);
 
 /*!
  * \brief Remove No Op from the Stmt.
@@ -234,6 +236,13 @@ Stmt UnrollLoop(Stmt stmt,
  */
 Stmt VectorizeLoop(Stmt stmt);
 
+/*!
+* \brief instruments bound checkers.
+* \param stmt The statment to be instrumented.
+* \return Instrumented Stmt.
+*/
+Stmt InstrumentBoundCheckers(Stmt stmt);
+
 /*!
  * \brief Inject virtual thread loops into stmt.
  * \param stmt The statment to be transformed.
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 79debc9a2c45..4068b1ce3a94 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -125,7 +125,8 @@ class BuildConfig(NodeBase):
         "data_alignment": -1,
         "restricted_func": True,
         "double_buffer_split_loop": 1,
-        "dump_pass_ir": False
+        "dump_pass_ir": False,
+        "instrument_bound_checkers": False
     }
     _dump_ir = DumpIR()
 
@@ -344,7 +345,7 @@ def lower(sch,
     for f in lower_phase0:
         stmt = f(stmt)
     # Phase 1
-    stmt = ir_pass.StorageFlatten(stmt, binds, 64)
+    stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
     stmt = ir_pass.CanonicalSimplify(stmt)
     for f in lower_phase1:
         stmt = f(stmt)
@@ -370,6 +371,9 @@ def lower(sch,
     stmt = ir_pass.RewriteUnsafeSelect(stmt)
     for f in lower_phase3:
         stmt = f(stmt)
+    # Instrument BoundCheckers
+    if cfg.instrument_bound_checkers:
+        stmt = ir_pass.InstrumentBoundCheckers(stmt)
     if simple_mode:
         return stmt
     return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index 575535f26e81..bf9e85e8134a 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -66,6 +66,14 @@ TVM_REGISTER_API("ir_pass.Equal")
     }
   });
 
+TVM_REGISTER_API("ir_pass.StorageFlatten")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    if (args.size() <= 3) {
+      *ret = StorageFlatten(args[0], args[1], args[2]);
+    } else {
+      *ret = StorageFlatten(args[0], args[1], args[2], args[3]);
+    }
+  });
 
 TVM_REGISTER_API("ir_pass.AttrsEqual")
 .set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
@@ -126,7 +134,6 @@ REGISTER_PASS1(ConvertSSA);
 REGISTER_PASS1(VerifySSA);
 REGISTER_PASS1(RewriteUnsafeSelect);
 REGISTER_PASS4(Inline);
-REGISTER_PASS3(StorageFlatten);
 REGISTER_PASS4(IRTransform);
 REGISTER_PASS1(VectorizeLoop);
 REGISTER_PASS5(UnrollLoop);
@@ -155,5 +162,6 @@ REGISTER_PASS1(CombineContextCall);
 REGISTER_PASS2(VerifyMemory);
 REGISTER_PASS2(VerifyGPUCode);
 REGISTER_PASS1(DecorateDeviceScope);
+REGISTER_PASS1(InstrumentBoundCheckers);
 }  // namespace ir
 }  // namespace tvm
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index c5c14d711df7..0659a07f2520 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -364,7 +364,8 @@ Stmt BuildStmt(Schedule sch,
   stmt = ir::InjectPrefetch(stmt);
 
   // Phase 1
-  stmt = ir::StorageFlatten(stmt, out_binds, 64);
+  stmt = ir::StorageFlatten(stmt, out_binds, 64,
+                            config->instrument_bound_checkers);
   stmt = ir::CanonicalSimplify(stmt);
   if (loop_partition) {
     stmt = ir::LoopPartition(stmt, config->partition_const_loop);
@@ -382,6 +383,9 @@ Stmt BuildStmt(Schedule sch,
   stmt = ir::RemoveNoOp(stmt);
   stmt = ir::RewriteUnsafeSelect(stmt);
 
+  if (config->instrument_bound_checkers)
+    stmt = ir::InstrumentBoundCheckers(stmt);
+
   return stmt;
 }
 
diff --git a/src/pass/bound_checker.cc b/src/pass/bound_checker.cc
new file mode 100644
index 000000000000..a7c03d0d1d60
--- /dev/null
+++ b/src/pass/bound_checker.cc
@@ -0,0 +1,195 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file bounds_checker.cc
+ */
+// Instrument checkers for out of the bounds access.
+
+#include <tvm/ir.h>
+#include <tvm/ir_mutator.h>
+#include <tvm/ir_pass.h>
+#include <tvm/ir_visitor.h>
+#include <vector>
+#include <unordered_map>
+#include <utility>
+
+namespace tvm {
+namespace ir {
+
+class BoundCollector : public IRVisitor {
+ public:
+  BoundCollector() {}
+
+  void Visit_(const AttrStmt *op) {
+    if (op->attr_key == ir::attr::buffer_bound) {
+      if (const Variable *key = op->node.as<Variable>()) {
+        mem_to_shape[key] = op->value;
+      }
+    }
+    IRVisitor::Visit_(op);
+  }
+  // Hashtable which maps buffer_var to shape.
+  std::unordered_map<const Variable *, Expr> mem_to_shape;
+};
+
+class BoundChecker : public IRMutator {
+ public:
+  explicit BoundChecker(
+      const std::unordered_map<const Variable *, Expr> &mem_to_shape)
+      : mem_to_shape_(mem_to_shape) {}
+
+  Stmt Mutate_(const Allocate *op, const Stmt &s) final {
+    // If the shape was updated we should update the hashtable.
+    if (UpdateIsNeeded(op->buffer_var)) {
+      Update(op->buffer_var, op->extents, op->type);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  Expr Mutate_(const Call *op, const Expr &ex) final {
+    if (process_store_ && op->is_intrinsic(intrinsic::tvm_if_then_else)) {
+      unsafe_rewritten_ = true;
+    }
+    return IRMutator::Mutate_(op, ex);
+  }
+
+  Stmt Mutate_(const Store *op, const Stmt &s) final {
+    store_scope_bound_collector_.clear();
+    process_store_ = true;
+    unsafe_rewritten_ = false;
+    IRMutator::Mutate_(op, s);
+    process_store_ = false;
+    if (CanInstrument(op->index, op->buffer_var)) {
+      Collect(op->index, op->buffer_var);
+    }
+    // The collector should has at least one item.
+    if (store_scope_bound_collector_.size()) {
+      Expr condition = MakeCondition();
+      if (!condition.as<StringImm>()) {
+        Stmt nop = Evaluate::make(1);
+        Stmt then_case =
+            Store::make(op->buffer_var, op->value, op->index, op->predicate);
+        Stmt else_case =
+            AssertStmt::make(condition, StringImm::make(error_message_), nop);
+        Stmt body = IfThenElse::make(condition, then_case, else_case);
+        return body;
+      }
+    }
+    return s;
+  }
+
+  Expr Mutate_(const Load *op, const Expr &ex) final {
+    if (CanInstrument(op->index, op->buffer_var)) {
+      Collect(op->index, op->buffer_var);
+    }
+    return IRMutator::Mutate_(op, ex);
+  }
+
+ private:
+  bool UpdateIsNeeded(const VarExpr &buffer_var) const {
+    return (buffer_var.defined() && mem_to_shape_.count(buffer_var.get()));
+  }
+
+  void Update(const VarExpr &buffer_var, const Array<Expr> &new_shape,
+              const Type &type) {
+    // Sanity check at first.
+    if (!new_shape.size()) {
+      return;
+    }
+
+    for (size_t i = 0; i < new_shape.size(); ++i) {
+      if (!new_shape[0].defined() || !new_shape[i].type().is_scalar() ||
+          is_negative_const(new_shape[i])) {
+        return;
+      }
+    }
+
+    // Scalarize the shape.
+    Expr shape = Mul::make(make_const(UInt(64), type.lanes()),
+                           Cast::make(UInt(64), new_shape[0]));
+    for (size_t i = 1; i < new_shape.size(); ++i) {
+      // Cast to unsigned to avoid integer overlow at frist.
+      shape = Mul::make(shape, Mul::make(make_const(UInt(64), type.lanes()),
+                                         Cast::make(UInt(64), new_shape[i])));
+    }
+    mem_to_shape_[buffer_var.get()] = shape;
+  }
+
+  bool IndexIsValid(const Expr &index) const {
+    if (!index.defined()) {
+      return false;
+    }
+
+    if (const Ramp *ramp_index = index.as<Ramp>()) {
+      return ramp_index->base.defined() &&
+             ramp_index->base.type().is_scalar() &&
+             ramp_index->stride.defined() &&
+             ramp_index->stride.type().is_scalar() && (ramp_index->lanes > 0);
+    }
+    return true;
+  }
+
+  bool CanInstrument(const Expr &index, const VarExpr &buffer_var) const {
+    return buffer_var.defined() && mem_to_shape_.count(buffer_var.get()) &&
+           IndexIsValid(index) && !unsafe_rewritten_;
+  }
+
+  void Collect(Expr index, VarExpr buffer_var) {
+    store_scope_bound_collector_.push_back(
+        std::make_pair(index, mem_to_shape_[buffer_var.get()]));
+  }
+
+  Expr MakeCondition() {
+    Expr condition;
+    for (size_t i = 0; i < store_scope_bound_collector_.size(); ++i) {
+      std::pair<Expr, Expr> buffer_to_mem = store_scope_bound_collector_[i];
+      Expr index = buffer_to_mem.first;
+      Expr upper_bound = buffer_to_mem.second;
+
+      if (const Ramp *ramp_index = index.as<Ramp>()) {
+        // In case index is base + stride * i.
+        // Non inclusive range.
+        index = Add::make(
+            ramp_index->base,
+            Mul::make(ramp_index->stride, make_const(ramp_index->stride.type(),
+                                                     ramp_index->lanes - 1)));
+      }
+
+      // Try to simplify index and bound.
+      index = ir::Simplify(index);
+      upper_bound = ir::Simplify(upper_bound);
+
+      // Cast to the same type - signed, to be able to check lower bound.
+      index = Cast::make(Int(64), index);
+      upper_bound = Cast::make(Int(64), upper_bound);
+
+      // Looks like a lower bound should always be zero after normalization.
+      Expr lower_bound = make_zero(Int(64));
+
+      Expr current_condition =
+          And::make(GE::make(index, lower_bound), LT::make(index, upper_bound));
+      condition =
+          !i ? current_condition : And::make(condition, current_condition);
+    }
+    return condition;
+  }
+
+  // Whether we process store value recursively.
+  bool process_store_{false};
+  // Whether we face tvm_if_then_else intrinsic.
+  bool unsafe_rewritten_{false};
+  // Pool which collects the pair of index and shape for specific store/load.
+  std::vector<std::pair<Expr, Expr>> store_scope_bound_collector_;
+  // Error message.
+  const char *const error_message_ = "OUT OF THE BOUNDS";
+  // Hashtable which maps buffer_var to shape.
+  std::unordered_map<const Variable *, Expr> mem_to_shape_;
+};
+
+Stmt InstrumentBoundCheckers(Stmt stmt) {
+  BoundCollector bound_collector;
+  // At first walk recursively and collect bound attributes.
+  bound_collector.Visit(stmt);
+  return BoundChecker(bound_collector.mem_to_shape).Mutate(stmt);
+}
+}  // namespace ir
+}  // namespace tvm
diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index 8c2105829839..488d44544c31 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -31,7 +31,8 @@ using intrinsic::tvm_address_of;
 class StorageFlattener : public IRMutator {
  public:
   explicit StorageFlattener(Map<Tensor, Buffer> extern_buffer,
-                            int cache_line_size) {
+                            int cache_line_size, bool create_bound_attributes)
+      : create_bound_attributes_(create_bound_attributes) {
     for (auto kv : extern_buffer) {
       BufferEntry e;
       e.buffer = kv.second;
@@ -101,6 +102,8 @@ class StorageFlattener : public IRMutator {
   }
 
   Stmt Mutate_(const Provide* op, const Stmt& s) final {
+    if (create_bound_attributes_)
+      shape_collector_.clear();
     Stmt stmt = IRMutator::Mutate_(op, s);
     op = stmt.as<Provide>();
     TensorKey key{op->func, op->value_index};
@@ -117,7 +120,20 @@ class StorageFlattener : public IRMutator {
           {e.buffer->data, op->value},
           Call::Intrinsic));
     } else {
-      return e.buffer.vstore(e.RelIndex(op->args), op->value);
+      Stmt body = e.buffer.vstore(e.RelIndex(op->args), op->value);
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        shape_collector_.push_back(
+            std::make_pair(e.buffer->data, e.buffer->shape));
+      }
+      // To create bound attribute collector should has at least one item.
+      if (create_bound_attributes_ && shape_collector_.size()) {
+        for (size_t i = 0; i < shape_collector_.size(); ++i) {
+          body = AttrStmt::make(
+              shape_collector_[i].first, ir::attr::buffer_bound,
+              MakeBound(e.buffer->dtype, shape_collector_[i].second), body);
+        }
+      }
+      return body;
     }
   }
 
@@ -216,6 +232,11 @@ class StorageFlattener : public IRMutator {
       ret = AttrStmt::make(
           e.buffer->data, attr::storage_scope,
           StringImm::make(e.buffer->scope), ret);
+
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        ret = AttrStmt::make(e.buffer->data, ir::attr::buffer_bound,
+                             MakeBound(e.buffer->dtype, e.buffer->shape), ret);
+      }
       return ret;
     }
   }
@@ -254,6 +275,11 @@ class StorageFlattener : public IRMutator {
       const BufferEntry& e = it->second;
       CHECK(!e.released)
           << "Read a buffer that is already out of scope";
+
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        shape_collector_.push_back(
+            std::make_pair(e.buffer->data, e.buffer->shape));
+      }
       return e.buffer.vload(e.RelIndex(op->args), e.buffer->dtype);
     } else {
       return expr;
@@ -429,6 +455,31 @@ class StorageFlattener : public IRMutator {
       }
     }
   };
+
+  bool ShapeIsValid(const Array<Expr> &shape) {
+    // Zero-dimensional tensor does not need boundary check.
+    if (!shape.size())
+      return false;
+
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (!shape[i].defined() || !shape[i].type().is_scalar() ||
+          is_negative_const(shape[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  Expr MakeBound(const Type &type, const Array<Expr> &shape) {
+    // We have already checked the shape size to be greater then 0.
+    Expr bound = Mul::make(make_const(shape[0].type(), type.lanes()), shape[0]);
+    for (size_t i = 1; i < shape.size(); ++i) {
+      bound = Mul::make(
+          bound, Mul::make(make_const(bound.type(), type.lanes()), shape[i]));
+    }
+    return bound;
+  }
+
   // The buffer assignment map
   // Variable remap
   std::unordered_map<const Variable*, Expr> var_remap_;
@@ -440,16 +491,21 @@ class StorageFlattener : public IRMutator {
   std::unordered_map<const Node*, std::string> storage_scope_;
   // The current thread scope.
   std::vector<ThreadScope> curr_thread_scope_;
+  // Collects shapes.
+  std::vector<std::pair<VarExpr, Array<Expr>>> shape_collector_;
   // The size of cacheline
   int cache_line_size_;
   // The current stage is an OpenGL shader.
   bool is_opengl_{false};
+  // Whether to mark load/store with theirs bounds.
+  bool create_bound_attributes_{false};
 };
 
-Stmt StorageFlatten(Stmt stmt,
-                    Map<Tensor, Buffer> extern_buffer,
-                    int cache_line_size) {
-  stmt = StorageFlattener(extern_buffer, cache_line_size).Mutate(stmt);
+Stmt StorageFlatten(Stmt stmt, Map<Tensor, Buffer> extern_buffer,
+                    int cache_line_size, bool create_bound_attributes) {
+  stmt =
+      StorageFlattener(extern_buffer, cache_line_size, create_bound_attributes)
+          .Mutate(stmt);
   return stmt;
 }
 
diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py
index 66a7fc48c287..c0792cf38234 100644
--- a/tests/python/unittest/test_codegen_llvm.py
+++ b/tests/python/unittest/test_codegen_llvm.py
@@ -348,6 +348,30 @@ def check_llvm(n):
         tvm.testing.assert_allclose(d.asnumpy(), d_np)
     check_llvm(64)
 
+def test_rank_zero_bound_checkers():
+    def check_llvm(n):
+        if not tvm.module.enabled("llvm"):
+            return
+        with tvm.build_config(instrument_bound_checkers=True):
+            A = tvm.placeholder((n, ), name='A')
+            scale = tvm.placeholder((), name='scale')
+            k = tvm.reduce_axis((0, n), name="k")
+            C = tvm.compute((), lambda : tvm.sum(A[k] * scale, axis=k), name="C")
+            D = tvm.compute((), lambda : C + 1)
+            s = tvm.create_schedule(D.op)
+            # build and invoke the kernel.
+            f = tvm.build(s, [A, scale, D], "llvm")
+            ctx = tvm.cpu(0)
+            # launch the kernel.
+            a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
+            sc = tvm.nd.array(
+                np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
+            d = tvm.nd.empty((), D.dtype, ctx)
+            f(a, sc, d)
+            d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
+            tvm.testing.assert_allclose(d.asnumpy(), d_np)
+    check_llvm(64)
+
 
 def test_alignment():
     n = tvm.convert(1024)
@@ -367,6 +391,7 @@ def test_alignment():
     test_llvm_import()
     test_alignment()
     test_rank_zero()
+    test_rank_zero_bound_checkers()
     test_llvm_bool()
     test_llvm_persist_parallel()
     test_llvm_select()
diff --git a/tests/python/unittest/test_pass_bound_checkers.py b/tests/python/unittest/test_pass_bound_checkers.py
new file mode 100644
index 000000000000..bb552f078f1a
--- /dev/null
+++ b/tests/python/unittest/test_pass_bound_checkers.py
@@ -0,0 +1,544 @@
+from nose.tools import raises
+import tvm
+import numpy as np
+def collect_visit(stmt, f):
+    ret = []
+    tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x)))
+    return ret
+
+def lower(sch, args):
+    binds = {}
+    arg_list = []
+    for x in args:
+        if isinstance(x, tvm.tensor.Tensor):
+            buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name)
+            assert x not in binds
+            binds[x] = buf
+            arg_list.append(buf)
+        else:
+            raise ValueError("args must be Tensor, Buffer or Var")
+    sch = sch.normalize()
+    bounds = tvm.schedule.InferBound(sch)
+    stmt = tvm.schedule.ScheduleOps(sch, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64, True)
+    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
+    stmt = tvm.ir_pass.VectorizeLoop(stmt)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    return stmt
+
+@raises(Exception)
+def test_out_of_bounds_llvm(index_a, index_b):
+    n = tvm.var("n")
+    A = tvm.placeholder ((n,), name='A')
+    B = tvm.placeholder ((n,), name='B')
+    C = tvm.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name='C')
+    s = tvm.create_schedule (C.op)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [A, B, C], simple_mode=True)
+    print (stmt)
+    fadd = tvm.build (s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+    ctx = tvm.context(tgt, 0)
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
+    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
+    fadd (a, b, c)
+
+def test_in_bounds_llvm():
+    n = tvm.var("n")
+    A = tvm.placeholder ((n,), name='A')
+    B = tvm.placeholder ((n,), name='B')
+    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C')
+    s = tvm.create_schedule (C.op)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [A, B, C], simple_mode=True)
+    print (stmt)
+    fadd = tvm.build (s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+    ctx = tvm.context(tgt, 0)
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
+    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
+    fadd (a, b, c)
+
+@raises(Exception)
+def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b):
+    n = tvm.convert(nn)
+    a = tvm.placeholder((n), name='a')
+    b = tvm.placeholder((n), name='b')
+    c = tvm.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name='c')
+    s = tvm.create_schedule(c.op)
+    xo, xi = s[c].split(c.op.axis[0], factor=8)
+    s[c].parallel(xo)
+    s[c].vectorize(xi)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [a, b, c], simple_mode=True)
+    print (stmt)
+    f = tvm.build(s, [a, b, c], tgt, target_host=tgt_host, name="myaddvec")
+    ctx = tvm.cpu(0)
+    n = nn
+    a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx)
+    c = tvm.nd.array(np.zeros(n, dtype=c.dtype), ctx)
+    f(a, b, c)
+
+def test_in_bounds_vectorize_llvm():
+    n = 512
+    lanes = 2
+    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
+    B = tvm.compute((n,), lambda i: A[i], name='B')
+    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], nparts=2)
+    _, xi = s[C].split(xi, factor=2)
+    s[C].parallel(xo)
+    s[C].vectorize(xi)
+    s[B].compute_at(s[C], xo)
+    xo, xi = s[B].split(B.op.axis[0], factor=2)
+    s[B].vectorize(xi)
+    # build and invoke the kernel.
+    lowered_func = tvm.lower (s, [A, C], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    f = tvm.build(s, [A, C], "llvm")
+    ctx = tvm.cpu(0)
+    # launch the kernel.
+    a = tvm.nd.empty((n,), A.dtype).copyfrom(
+        np.random.uniform(size=(n, lanes)))
+    c = tvm.nd.empty((n,), C.dtype, ctx)
+    f(a, c)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+
+def test_in_bounds_loop_partition_basic_llvm():
+    n = tvm.var('n')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
+    n = tvm.var('n')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_const_loop_partition_ir():
+    def check_attr_stmt (x):
+        if isinstance(x, tvm.stmt.AttrStmt) and x.attr_key == "buffer_bound" and str(x.value) == str(n):
+            return True
+        return False
+
+    def check_branch_stmt (x):
+        if isinstance(x, tvm.stmt.IfThenElse):
+            return True
+        return False
+
+    def assert_bound_instrumentation(stmt, f, nums):
+        count = 0
+        for i in collect_visit(stmt, f):
+            if i is True:
+              count = count + 1
+        assert (count == nums)
+
+    def collect_branch_stmt (x):
+        if isinstance(x, tvm.stmt.IfThenElse):
+            branch_collector.append(x)
+
+    n = 21
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = lower (s, [A, B, T])
+    # num_attributes = num_buffers * num_splits = 2 * 3
+    # before instrumentation
+    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
+    assert_bound_instrumentation(stmt, check_branch_stmt, 0)
+    stmt = tvm.ir_pass.InstrumentBoundCheckers(stmt)
+    # after instrumentation
+    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
+    assert_bound_instrumentation(stmt, check_branch_stmt, 2)
+    print (stmt)
+    branch_collector = list()
+    collect_visit(stmt, collect_branch_stmt)
+    assert(len(branch_collector) ==  2)
+    print (branch_collector[0].condition)
+    print (branch_collector[1].condition)
+
+def test_in_bounds_const_loop_partition_llvm():
+    with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True):
+        n = 21
+        A = tvm.placeholder((n, ), name='A')
+        B = tvm.placeholder((n, ), name='B')
+
+        T = tvm.compute((n, ), lambda i: A[i]+B[i])
+        s = tvm.create_schedule(T.op)
+        xo, xi = s[T].split(T.op.axis[0], factor=4)
+        lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+        print (lowered_func.body)
+        ctx = tvm.cpu(0)
+
+        f = tvm.build(s, [A, B, T], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
+        t = tvm.nd.empty((n,), T.dtype, ctx)
+        f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
+    with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True):
+        n = 21
+        A = tvm.placeholder((n, ), name='A')
+        B = tvm.placeholder((n, ), name='B')
+
+        T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b])
+        s = tvm.create_schedule(T.op)
+        xo, xi = s[T].split(T.op.axis[0], factor=4)
+        lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+        print (lowered_func.body)
+        ctx = tvm.cpu(0)
+
+        f = tvm.build(s, [A, B, T], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
+        t = tvm.nd.empty((n,), T.dtype, ctx)
+        f(a, b, t)
+
+def test_in_bounds_conv_llvm(loop_tiling=False):
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
+                                                     kernel[kh, kw, ic, oc],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    if loop_tiling:
+        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
+    print (lowered_func.body)
+    ctx = tvm.cpu (0)
+
+    f = tvm.build(s, [data, kernel, conv], "llvm")
+    data_input = tvm.nd.array(np.random.uniform(
+          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
+    kernel_input = tvm.nd.array(np.random.uniform(
+          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
+    conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
+    f(data_input, kernel_input, conv_out)
+
+@raises(Exception)
+def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False):
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n + data_offsets[0],
+                                                          ic + data_offsets[1],
+                                                          oh*HSTR + kh + data_offsets[2],
+                                                          ow*WSTR + kw + data_offsets[3]]
+                                                          *
+                                                     kernel[kh + kernel_offsets[0],
+                                                     kw + kernel_offsets[1],
+                                                     ic + kernel_offsets[2],
+                                                     oc + kernel_offsets[3]],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    if loop_tiling:
+        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
+    print (lowered_func.body)
+    ctx = tvm.cpu (0)
+
+    f = tvm.build(s, [data, kernel, conv], "llvm")
+    data_input = tvm.nd.array(np.random.uniform(
+          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
+    kernel_input = tvm.nd.array(np.random.uniform(
+          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
+    conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
+    f(data_input, kernel_input, conv_out)
+
+def test_in_bounds_tensors_with_same_shapes1D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((k, ), name='B')
+
+    T = tvm.compute((m, ), lambda i: A[i]*B[i])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32, )).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((k, ), name='B')
+
+    T = tvm.compute((m, ), lambda i: A[i]*B[i])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape,), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_tensors_with_same_shapes2D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n), name='A')
+    B = tvm.placeholder((k, k), name='B')
+
+    T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32, 32), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n), name='A')
+    B = tvm.placeholder((k, k), name='B')
+
+    T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape[0],a_shape[1])).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape[0],b_shape[1])).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape[0],c_shape[1]), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_tensors_with_same_shapes3D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n, n), name='A')
+    B = tvm.placeholder((k, k, k), name='B')
+
+    T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,32,32)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,32,32)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32, 32, 32), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n, n), name='A')
+    B = tvm.placeholder((k, k, k), name='B')
+
+    T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape[0],a_shape[1], c_shape[2])).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape[0],b_shape[1], b_shape[2])).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape[0],c_shape[1],c_shape[2]), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm():
+    if not tvm.module.enabled("llvm"):
+        return
+    n = 64
+    A = tvm.placeholder((n, ), name='A')
+    scale = tvm.placeholder((), name='scale')
+    k = tvm.reduce_axis((0, n), name="k")
+    C = tvm.compute((), lambda : tvm.sum(A[k + k + k] * scale, axis=k), name="C")
+    D = tvm.compute((), lambda : C + 1)
+    s = tvm.create_schedule(D.op)
+    stmt = tvm.lower (s, [A, scale, D], simple_mode=True)
+    print (stmt)
+    # build and invoke the kernel.
+    f = tvm.build(s, [A, scale, D], "llvm")
+    ctx = tvm.cpu(0)
+    # launch the kernel.
+    a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
+    sc = tvm.nd.array(
+        np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
+    d = tvm.nd.empty((), D.dtype, ctx)
+    f(a, sc, d)
+    d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
+    tvm.testing.assert_allclose(d.asnumpy(), d_np)
+
+if __name__ == "__main__":
+    with tvm.build_config(instrument_bound_checkers=True):
+        # zero scale
+        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
+        # in bound
+        test_in_bounds_llvm()
+        # upper bound
+        test_out_of_bounds_llvm(1, 0)
+        test_out_of_bounds_llvm(0, 1)
+        test_out_of_bounds_llvm(1, 1)
+        test_out_of_bounds_llvm(10000, 0)
+        test_out_of_bounds_llvm(0, 10000)
+        test_out_of_bounds_llvm(10000, 10000)
+        # lower bound
+        test_out_of_bounds_llvm(-1, 0)
+        test_out_of_bounds_llvm(0, -1)
+        test_out_of_bounds_llvm(-1, -1)
+        test_out_of_bounds_llvm(-10000, 0)
+        test_out_of_bounds_llvm(0, -10000)
+        test_out_of_bounds_llvm(-10000, -10000)
+        # vectorize in bound
+        test_in_bounds_vectorize_llvm()
+        # vectorization upper bound
+        test_out_of_bounds_vectorize_llvm(1024, 1000, 0)
+        test_out_of_bounds_vectorize_llvm(1024, 0, 10000)
+        # vectorization lower bound
+        test_out_of_bounds_vectorize_llvm(1024, -1000, 0)
+        test_out_of_bounds_vectorize_llvm(1024, 0, -10000)
+        test_in_bounds_const_loop_partition_llvm()
+        test_out_of_bounds_const_loop_partition_llvm(1, 0)
+        test_out_of_bounds_const_loop_partition_llvm(0, 1)
+        test_out_of_bounds_const_loop_partition_llvm(-1, 0)
+        test_out_of_bounds_const_loop_partition_llvm(0, -1)
+        test_in_bounds_loop_partition_basic_llvm()
+        test_out_of_bounds_loop_partition_basic_llvm(32, 0)
+        test_out_of_bounds_loop_partition_basic_llvm(0, 32)
+        test_out_of_bounds_loop_partition_basic_llvm(-32, 0)
+        test_out_of_bounds_loop_partition_basic_llvm(0, -32)
+        # conv
+        test_in_bounds_conv_llvm()
+        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1])
+        # loop tiling
+        test_in_bounds_conv_llvm(True)
+        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1], True)
+        # tensors with diff shapes basic operation such as mul
+        test_out_of_bounds_tensors_with_diff_shapes1D_llvm (32, 64, 64)
+        test_out_of_bounds_tensors_with_diff_shapes1D_llvm (64, 32, 64)
+        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([64, 64], [32, 32], [64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([32, 32], [64, 64], [64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([64, 64, 64], [32, 32, 32], [64, 64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([32, 32, 32], [64, 64, 64], [64, 64, 64])
+        # check tensors with the same shapes
+        test_in_bounds_tensors_with_same_shapes1D_llvm()
+        test_in_bounds_tensors_with_same_shapes2D_llvm()
+        test_in_bounds_tensors_with_same_shapes3D_llvm()
+        # ir tests
+        test_in_bounds_const_loop_partition_ir()

From b90411af641d44a9565151c32f33eadaae4b17bf Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 30 Nov 2018 13:49:58 +0900
Subject: [PATCH 438/529] [Relay] Add support for tuple node in operator fusion
 (#2187)

---
 src/relay/pass/fuse_ops.cc               | 73 +++++++++++++++------
 tests/python/relay/test_pass_fuse_ops.py | 81 +++++++++++++++++++++++-
 2 files changed, 131 insertions(+), 23 deletions(-)

diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index b9e0823e88fa..21660decf2fa 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -232,8 +232,11 @@ class IndexedForwardGraph::Creator : private ExprVisitor {
   }
 
   void VisitExpr_(const TupleNode* op) {
+    CHECK(graph_.node_map.count(op));
+    Node* tuple_node = graph_.node_map.at(op);
+    tuple_node->pattern = kInjective;
     for (const Expr& field : op->fields) {
-      this->Update(field, nullptr, kOpaque);
+      this->Update(field, tuple_node, kInjective);
     }
     ExprVisitor::VisitExpr_(op);
     this->AddNode(op);
@@ -712,32 +715,15 @@ class FuseMutator : private ExprMutator {
       // then we must have a group assignment for it already.
       CHECK(gmap_.count(call));
       auto* ret_group = gmap_.at(call)->FindRoot();
-      Array<Expr> new_args;
-      for (auto arg : call->args) {
-        auto type = arg->checked_type();
-        CHECK(gmap_.count(arg.get()))
-            << "cannot find group of " << arg;
-        auto* arg_group = gmap_.at(arg.get())->FindRoot();
-        Expr new_arg = this->Mutate(arg);
-
-        if (ret_group != arg_group) {
-          Var param = ginfo_[ret_group].GetOrAllocParam(new_arg, type);
-          new_args.push_back(param);
-        } else {
-          new_args.push_back(new_arg);
-        }
-      }
+      Array<Expr> new_args = GetNewArguments(call->args, ret_group);
+
       auto new_call = CallNode::make(
           call->op, new_args, call->attrs, call->type_args);
 
       if (ret_group->root_ref == call) {
         // This is the root of the group
         // create the new call node.
-        const GroupInfo& ginfo = ginfo_[ret_group];
-        auto func = FunctionNode::make(
-            ginfo.params, new_call, call->checked_type(), {});
-        func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
-        return CallNode::make(func, ginfo.arguments, Attrs());
+        return MakeNewFunction(ret_group, call->checked_type(), new_call);
       } else {
         // This is an intermediate node of a fused function
         // simply return the new call.
@@ -747,6 +733,51 @@ class FuseMutator : private ExprMutator {
       return ExprMutator::VisitExpr_(call);
     }
   }
+
+  Expr VisitExpr_(const TupleNode* tuple) {
+    auto* ret_group = gmap_.at(tuple)->FindRoot();
+    Array<Expr> new_fields = GetNewArguments(tuple->fields, ret_group);
+    Tuple new_tuple = TupleNode::make(new_fields);
+    if (ret_group == gmap_.at(tuple)) {
+      bool isolated = true;
+      for (size_t i = 0; i < new_fields.size(); ++i) {
+        isolated &= (new_fields[i].same_as(ginfo_[ret_group].params[i]));
+      }
+      if (isolated) {
+        // Do not put a isolated tuple into a function
+        return ExprMutator::VisitExpr_(tuple);
+      }
+      // This tuple has been fused with other ops before it
+      return MakeNewFunction(ret_group, tuple->checked_type(), new_tuple);
+    }
+    // This tuple is an intermediate node in the group
+    return new_tuple;
+  }
+
+  Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) {
+    const GroupInfo& ginfo = ginfo_[group];
+    auto func = FunctionNode::make(ginfo.params, body, ret_type, {});
+    func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
+    return CallNode::make(func, ginfo.arguments, Attrs());
+  }
+
+  Array<Expr> GetNewArguments(const tvm::Array<Expr>& args,
+                              GraphPartitioner::Group* current_group) {
+    Array<Expr> new_args;
+    for (auto arg : args) {
+      auto* arg_group = gmap_.at(arg.get())->FindRoot();
+      auto type = arg->checked_type();
+      Expr new_arg = this->Mutate(arg);
+      if (current_group != arg_group) {
+        Var param = ginfo_[current_group].GetOrAllocParam(new_arg, type);
+        new_args.push_back(param);
+      } else {
+        new_args.push_back(new_arg);
+      }
+    }
+    return new_args;
+  }
+
   // Debug function, dump the group assignment in text.
   void DebugDumpGroup(const Expr& body) {
     std::string text = RelayPrint(body, false, [this](const Expr& expr) -> std::string {
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index 27806791c399..28ea8dd28988 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -28,8 +28,6 @@ def expected():
     assert relay.ir_pass.alpha_equal(zz, after)
 
 
-
-
 def test_conv2d_fuse():
     """Test fusion case of conv2d"""
     def before(dshape):
@@ -106,7 +104,86 @@ def expected(dshape):
     assert relay.ir_pass.alpha_equal(zz, after)
 
 
+def test_concatenate():
+    """Test fusion case involving concat op and Tuple node"""
+
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        upsampled = relay.nn.upsampling(pooled, scale=2, layout="NCHW")
+        concat = relay.concatenate((upsampled, x), axis=1)
+        out = relay.add(concat, relay.const(1, "float32"))
+        return relay.Function(relay.ir_pass.free_vars(out), out)
+
+    def expected(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        f0 = relay.Function([x], pooled)
+
+        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2))
+        p1 = relay.var("p1", shape=dshape)
+        upsampled = relay.nn.upsampling(p0, scale=2, layout="NCHW")
+        concat = relay.concatenate((upsampled, p1), axis=1)
+        out = relay.add(concat, relay.const(1, "float32"))
+        f1 = relay.Function([p0, p1], out)
+
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f0, [x])
+        z = relay.Call(f1, [y, x])
+        return relay.Function([x], z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=0)
+    assert not relay.ir_pass.free_vars(zz)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    assert not relay.ir_pass.free_vars(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+def test_tuple_root():
+    """Test fusion case where Tuple node is the root in its group"""
+
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        upsampled = relay.nn.upsampling(pooled, scale=2, layout="NCHW")
+        out = relay.Tuple((upsampled, x))
+        return relay.Function(relay.ir_pass.free_vars(out), out)
+
+    def expected(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        f0 = relay.Function([x], pooled)
+
+        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2))
+        p1 = relay.var("p1", shape=(dshape[0], dshape[1], dshape[2], dshape[3]))
+        upsampled = relay.nn.upsampling(p0, scale=2, layout="NCHW")
+        out = relay.Tuple((upsampled, p1))
+        f1 = relay.Function([p0, p1], out)
+
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f0, [x])
+        z = relay.Call(f1, [y, x])
+        return relay.Function([x], z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=0)
+    assert not relay.ir_pass.free_vars(zz)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    assert not relay.ir_pass.free_vars(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
 
 if __name__ == "__main__":
     test_fuse_simple()
     test_conv2d_fuse()
+    test_concatenate()
+    test_tuple_root()

From 3ca432694d293da980dd8f465cdf5c3ee83e0839 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 29 Nov 2018 23:43:41 -0800
Subject: [PATCH 439/529] NOTICE (#2203)

---
 NOTICE | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 NOTICE

diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000000..45468c50ba1b
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1 @@
+TVM End to End Deep Learning Compiler Stack: https://tvm.ai/

From 555308a6490d35c1cf6bb162f7f3d0468fd425cb Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Fri, 30 Nov 2018 20:01:38 +0300
Subject: [PATCH 440/529] [TVM] Fix llvm codegen (div by power of 2) (#2204)

---
 src/codegen/llvm/codegen_llvm.cc           |  6 +---
 tests/python/unittest/test_codegen_llvm.py | 36 ++++++++++++++++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index 22319aa926fb..215a6c9c5b1b 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -788,11 +788,7 @@ DEFINE_CODEGEN_CMP_OP(GE);
 llvm::Value* CodeGenLLVM::VisitExpr_(const Div* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  int shift;
-  if ((op->type.is_int() || op->type.is_uint()) &&
-      is_const_power_of_two_integer(op->b, &shift)) {
-    return builder_->CreateAShr(a, shift);
-  } else if (op->type.is_int()) {
+  if (op->type.is_int()) {
     return builder_->CreateSDiv(a, b);
   } else if (op->type.is_uint()) {
     return builder_->CreateUDiv(a, b);
diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py
index c0792cf38234..4f3e4e914d55 100644
--- a/tests/python/unittest/test_codegen_llvm.py
+++ b/tests/python/unittest/test_codegen_llvm.py
@@ -2,6 +2,7 @@
 from tvm.contrib import util, clang
 import numpy as np
 import ctypes
+import math
 
 def test_llvm_intrin():
     ib = tvm.ir_builder.create()
@@ -386,6 +387,40 @@ def test_alignment():
         if "align" in l and "4 x float" in l:
             assert "align 32" in l
 
+def test_llvm_div():
+    """Check that the semantics of div and mod is the same as in C/C++"""
+    def check_div(start, end, divisor, dtype):
+        T = tvm.compute((end - start,),
+                        lambda i: tvm.expr.Cast(dtype, (start + i)) / tvm.const(divisor, dtype))
+        s = tvm.create_schedule([T.op])
+        f = tvm.build(s, [T], "llvm")
+        a = tvm.nd.empty((end - start,), dtype)
+        f(a)
+        ref = [int(float(i)/divisor) for i in range(start, end)]
+        tvm.testing.assert_allclose(a.asnumpy(), ref)
+
+    def check_mod(start, end, divisor, dtype):
+        T = tvm.compute((end - start,),
+                        lambda i: tvm.expr.Cast(dtype, (start + i)) % tvm.const(divisor, dtype))
+        s = tvm.create_schedule([T.op])
+        f = tvm.build(s, [T], "llvm")
+        a = tvm.nd.empty((end - start,), dtype)
+        f(a)
+        ref = [int(math.fmod(i, divisor)) for i in range(start, end)]
+        tvm.testing.assert_allclose(a.asnumpy(), ref)
+
+    def check_llvm(start, end, divisor, dtype):
+        check_div(start, end, divisor, dtype)
+        check_mod(start, end, divisor, dtype)
+
+    for d in range(-5, 6):
+        if d != 0:
+            # Note that 11 (and not e.g. 10) is used to avoid issues with the simplifier
+            check_llvm(-11, 11, d, 'int32')
+            check_llvm(-11, 11, d, 'int8')
+            if d > 0:
+                check_llvm(123, 133, d, 'uint8')
+                check_llvm(0, 256, d, 'uint8')
 
 if __name__ == "__main__":
     test_llvm_import()
@@ -403,3 +438,4 @@ def test_alignment():
     test_llvm_madd_pipeline()
     test_llvm_temp_space()
     test_llvm_lookup_intrin()
+    test_llvm_div()

From bd9d03179bf8d9b2c75efd8dc78d923877dcbbe9 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Sat, 1 Dec 2018 01:03:14 +0800
Subject: [PATCH 441/529] [Relay][Pass] Fold constant tuple (#2201)

---
 src/relay/pass/fold_constant.cc               | 35 ++++++++++++++++++-
 tests/python/relay/test_pass_fold_constant.py | 20 +++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc
index 6237bcdce7a8..60994cdd6ca9 100644
--- a/src/relay/pass/fold_constant.cc
+++ b/src/relay/pass/fold_constant.cc
@@ -13,6 +13,36 @@ namespace relay {
 using FInterpreter = runtime::TypedPackedFunc<Value(Expr)>;
 
 
+class ConstantChecker : private ExprVisitor {
+ public:
+  // Check whether an expression is constant. The results are memorized.
+  bool Check(const Expr& expr) {
+    if (expr.as<ConstantNode>()) {
+      return true;
+    }
+    const auto it = memo_.find(expr);
+    if (it != memo_.end())
+      return it->second;
+    VisitExpr(expr);
+    return memo_[expr];  // return memorized result or the default value false
+  }
+
+ private:
+  std::unordered_map<Expr, bool, NodeHash, NodeEqual> memo_;
+
+  void VisitExpr_(const TupleNode* n) final {
+    bool result = true;
+    for (const auto& field : n->fields) {
+      if (!Check(field)) {
+        result = false;
+        break;
+      }
+    }
+    memo_[GetRef<Tuple>(n)] = result;
+  }
+};
+
+
 // TODO(tvm-team) consider combine dead-code with constant folder.
 // or make a more powerful partial evaluator.
 class ConstantFolder : public ExprMutator {
@@ -53,7 +83,7 @@ class ConstantFolder : public ExprMutator {
     if (op_stateful.get(GetRef<Op>(op), false)) return res;
     bool all_const_args = true;
     for (Expr arg : call->args) {
-      if (arg.as<ConstantNode>() == nullptr) {
+      if (!checker_.Check(arg)) {
         all_const_args = false;
       }
     }
@@ -77,6 +107,9 @@ class ConstantFolder : public ExprMutator {
  private:
   // Internal interepreter.
   FInterpreter executor_;
+  // Internal constant checker
+  ConstantChecker checker_;
+
   // Convert value to expression.
   Expr ValueToExpr(Value value) {
     if (const auto* val = value.as<TensorValueNode>()) {
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 250cfc70cc28..6a63d88f052f 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -76,7 +76,27 @@ def expected():
     assert relay.ir_pass.graph_equal(zz, zexpected)
 
 
+def test_fold_concat():
+    c_data = np.array([[1, 2, 3]]).astype("float32")
+
+    def before():
+        a = relay.const(c_data)
+        b = relay.const(c_data)
+        y = relay.concatenate((a, b), axis=0)
+        return relay.Function([], y)
+
+    def expected():
+        y_data = np.concatenate((c_data, c_data), axis=0)
+        y = relay.const(y_data)
+        return relay.Function([], y)
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
 if __name__ == "__main__":
     test_fold_const()
     test_fold_let()
     test_fold_tuple()
+    test_fold_concat()

From 03fe5f7de2d6433cc496fea66df20f78b21674c4 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Fri, 30 Nov 2018 22:41:53 +0530
Subject: [PATCH 442/529] added int type axis for relay reduce ops (#2199)

---
 python/tvm/relay/op/reduce.py        | 7 +++++++
 tests/python/relay/test_op_level4.py | 9 +++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
index 73c5f270e8bf..71c7dea9c0dc 100644
--- a/python/tvm/relay/op/reduce.py
+++ b/python/tvm/relay/op/reduce.py
@@ -30,6 +30,7 @@ def argmax(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.argmax(data, axis, keepdims, exclude)
 
 def argmin(data, axis=None, keepdims=False, exclude=False):
@@ -59,6 +60,7 @@ def argmin(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.argmin(data, axis, keepdims, exclude)
 
 
@@ -89,6 +91,7 @@ def sum(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.sum(data, axis, keepdims, exclude)
 
 
@@ -119,6 +122,7 @@ def max(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.max(data, axis, keepdims, exclude)
 
 
@@ -149,6 +153,7 @@ def min(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.min(data, axis, keepdims, exclude)
 
 
@@ -179,6 +184,7 @@ def mean(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.mean(data, axis, keepdims, exclude)
 
 
@@ -209,4 +215,5 @@ def prod(data, axis=None, keepdims=False, exclude=False):
     result : relay.Expr
         The computed result.
     """
+    axis = [axis] if isinstance(axis, int) else axis
     return _make.prod(data, axis, keepdims, exclude)
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 075a58c31acf..db478ff251c5 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -145,7 +145,7 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32")
     elif ref_func in [np.max, np.min, np.mean, np.prod]:
         ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
     else: #argmin/argmax
-        if axis and len(axis) > 1:
+        if axis and not isinstance(axis, int) and len(axis) > 1 :
             return
         ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
 
@@ -164,7 +164,7 @@ def _wrapper(data, axis=None, keepdims=False):
                 return func(data, axis=axis)
             else:
                 if axis is not None:
-                    axis = axis[0]
+                    axis = axis if isinstance(axis, int) else axis[0]
                     out_shape = list(data.shape)
                     out_shape[axis] = 1
                 else:
@@ -180,10 +180,11 @@ def _wrapper(data, axis=None, keepdims=False):
                  [relay.prod, np.prod],
                  [relay.argmin, _with_keepdims(np.argmin)],
                  [relay.argmax, _with_keepdims(np.argmax)]]:
-        verify_reduce(func, (d1, d2, d3, d4), (2,), True, False, (d1, d2, 1, d4))
-        verify_reduce(func, (d1, d2, d3), (1,), True, False, (d1, 1, d3))
+        verify_reduce(func, (d1, d2, d3, d4), 2, True, False, (d1, d2, 1, d4))
+        verify_reduce(func, (d1, d2, d3), 1, True, False, (d1, 1, d3))
         verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1))
         verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3))
+        verify_reduce(func, (2, 3, 4), 1, True, False, (2, 1, 4))
         verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4))
         verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ())
         verify_reduce(func, (4, 4, 3), None, False, True, ())

From 4eb187a4d51b1f6f06a0c69d3018c90419b1c0b3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 30 Nov 2018 16:05:04 -0800
Subject: [PATCH 443/529] [SCHEDULE] Fix code lowering when loop condition
 depends on outer axis. (#2208)

---
 src/op/compute_op.cc                            | 17 +++++++++++------
 .../unittest/test_schedule_schedule_ops.py      | 11 +++++++++++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 5c972595ff00..d4cb2b4c632b 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -321,27 +321,32 @@ Stmt MakeComputeStmt(const ComputeOpNode* self,
       source.push_back(stage->op.output(i));
     }
     MakeReduction(self, source, &init, &provide);
-    init = op::Substitute(init, n.init_vmap);
     init = MergeNest(n.init_nest, init);
+    init = op::Substitute(init, n.init_vmap);
     // common nest
     std::vector<std::vector<Stmt> > common(
         n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1);
     std::vector<std::vector<Stmt> > reduce(
         n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.end());
-    provide = op::Substitute(provide, n.main_vmap);
     provide = MergeNest(reduce, provide);
     if (debug_keep_trivial_loop) {
-      return MergeNest(common, provide);
+      provide = MergeNest(common, provide);
     } else {
-      return MergeNest(common, Block::make(init, provide));
+      provide = MergeNest(common, Block::make(init, provide));
     }
+    // run substitution in the on the full nest, because  loop condition
+    // could depend on outer loops.
+    return op::Substitute(provide, n.main_vmap);
   } else {
     std::vector<Stmt> provides;
     for (size_t i = 0; i < self->body.size(); ++i) {
       provides.emplace_back(MakeProvide(self, stage->op.output(i)));
     }
-    Stmt provide = op::Substitute(Block::make(provides), n.main_vmap);
-    return MergeNest(n.main_nest, provide);
+    Stmt provide = Block::make(provides);
+    provide = MergeNest(n.main_nest, provide);
+    // run substitution in the on the full nest, because  loop condition
+    // could depend on outer loops.
+    return op::Substitute(provide, n.main_vmap);
   }
 }
 
diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py
index e60073fe9f5c..e59a73529d24 100644
--- a/tests/python/unittest/test_schedule_schedule_ops.py
+++ b/tests/python/unittest/test_schedule_schedule_ops.py
@@ -409,7 +409,18 @@ def test_schedule_tensor_compute3():
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
 
+def test_loop_dep_reduce():
+    X = tvm.placeholder(shape=(10,), name="x")
+    def f(n):
+        rv = tvm.reduce_axis((0, n))
+        return tvm.sum(X[rv], axis=rv)
+    Y = tvm.compute(X.shape, f, name="y")
+    s = tvm.create_schedule([Y.op])
+    f = tvm.build(s, [X, Y])
+
+
 if __name__ == "__main__":
+    test_loop_dep_reduce()
     test_schedule_middle_cache()
     test_inline_multi_reduce()
     test_schedule_cache_relayout4()

From ab297f073b96aa913a3e7711557063c1b8856f17 Mon Sep 17 00:00:00 2001
From: lihaozhehw <45391385+lihaozhehw@users.noreply.github.com>
Date: Sat, 1 Dec 2018 08:05:31 +0800
Subject: [PATCH 444/529] Python security issue about mktemp() and abspath()
 (#2202)

---
 python/tvm/_ffi/libinfo.py                   | 4 ++--
 python/tvm/contrib/debugger/debug_runtime.py | 2 +-
 python/tvm/contrib/nvcc.py                   | 2 +-
 python/tvm/contrib/verilog.py                | 4 ++--
 python/tvm/contrib/xcode.py                  | 4 ++--
 python/tvm/exec/rpc_proxy.py                 | 2 +-
 topi/python/topi/cpp.py                      | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 2fdf5aeb132a..6ad2e06939b1 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -25,7 +25,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
     # inplace) or the install directory (if TVM is installed).
     # An installed TVM's curr_path will look something like:
     #   $PREFIX/lib/python3.6/site-packages/tvm/_ffi
-    ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    ffi_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     source_dir = os.path.join(ffi_dir, "..", "..", "..")
     install_lib_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
 
@@ -49,7 +49,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
 
     dll_path.append(install_lib_dir)
 
-    dll_path = [os.path.abspath(x) for x in dll_path]
+    dll_path = [os.path.realpath(x) for x in dll_path]
     if search_path is not None:
         if search_path is list:
             dll_path = dll_path + search_path
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index 25d17d528bf2..6642a8bdc822 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -146,7 +146,7 @@ def _create_debug_env(self, graph_json, ctx):
         """
         # make the dump folder if not given
         if not self._dump_root:
-            self._dump_root = tempfile.mktemp(prefix=_DUMP_ROOT_PREFIX)
+            self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX)
 
         # format the context
         ctx = self._format_context(ctx)
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index a87c942a7247..21cc4844087c 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -103,7 +103,7 @@ def find_cuda_path():
     (out, _) = proc.communicate()
     out = py_str(out)
     if proc.returncode == 0:
-        return os.path.abspath(os.path.join(str(out).strip(), "../.."))
+        return os.path.realpath(os.path.join(str(out).strip(), "../.."))
     cuda_path = "/usr/local/cuda"
     if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
         return cuda_path
diff --git a/python/tvm/contrib/verilog.py b/python/tvm/contrib/verilog.py
index 22b8fe1722d4..358366684fa4 100644
--- a/python/tvm/contrib/verilog.py
+++ b/python/tvm/contrib/verilog.py
@@ -111,7 +111,7 @@ def __getattr__(self, name):
 
 
 def _find_vpi_path():
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     api_path = os.path.join(curr_path, '../../../lib/')
     vpi_path = [curr_path, api_path]
     vpi_path = [os.path.join(p, 'tvm_vpi.vpi') for p in vpi_path]
@@ -123,7 +123,7 @@ def _find_vpi_path():
 
 def search_path():
     """Get the search directory."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     ver_path = [os.path.join(curr_path, '../../../verilog/')]
     ver_path += [os.path.join(curr_path, '../../../tests/verilog/unittest/')]
     ver_path += [os.path.join(curr_path, '../../../tests/verilog/integration/')]
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index 63fbad2a58cf..186df3f130e9 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -206,9 +206,9 @@ def popen_test_rpc(host,
     if "TVM_IOS_RPC_ROOT" in os.environ:
         rpc_root = os.environ["TVM_IOS_RPC_ROOT"]
     else:
-        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+        curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
         rpc_root = os.path.join(curr_path, "../../../apps/ios_rpc")
-    proj_path = os.path.abspath(os.path.join(rpc_root, "tvmrpc.xcodeproj"))
+    proj_path = os.path.realpath(os.path.join(rpc_root, "tvmrpc.xcodeproj"))
     if not os.path.exists(proj_path):
         raise RuntimeError("Cannot find tvmrpc.xcodeproj in %s," +
                            (" please set env TVM_IOS_RPC_ROOT correctly" % rpc_root))
diff --git a/python/tvm/exec/rpc_proxy.py b/python/tvm/exec/rpc_proxy.py
index 678023a10550..363609c81de4 100644
--- a/python/tvm/exec/rpc_proxy.py
+++ b/python/tvm/exec/rpc_proxy.py
@@ -12,7 +12,7 @@
 
 def find_example_resource():
     """Find resource examples."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     base_path = os.path.join(curr_path, "../../../")
     index_page = os.path.join(base_path, "web/example_rpc.html")
     js_files = [
diff --git a/topi/python/topi/cpp.py b/topi/python/topi/cpp.py
index 85f203387805..3321b5b68289 100644
--- a/topi/python/topi/cpp.py
+++ b/topi/python/topi/cpp.py
@@ -15,7 +15,7 @@ def _get_lib_names():
 
 def _load_lib():
     """Load libary by searching possible path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     lib_search = curr_path
     lib_path = libinfo.find_lib_path(_get_lib_names(), lib_search, optional=True)
     if lib_path is None:

From ed6b9adaab9fa46180c97daa9f1740dbdba210d0 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 30 Nov 2018 19:43:00 -0800
Subject: [PATCH 445/529] [DOCKER] inheritate javahome (#2210)

---
 docker/with_the_same_user | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 470d64384de6..27f2e66a29d5 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -29,6 +29,7 @@ echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo
 HOME=${CI_BUILD_HOME}\
     sudo -u "#${CI_BUILD_UID}" --preserve-env\
     PATH=${PATH}\
+    JAVA_HOME=${JAVA_HOME}\
     LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\
     PYTHONPATH=${PYTHONPATH}\
     HOME=${CI_BUILD_HOME}\

From a8a0dc2521c9d11086051bdc8a4ef0a65103be3d Mon Sep 17 00:00:00 2001
From: wuzhao <wuzhaozju@gmail.com>
Date: Sun, 2 Dec 2018 00:36:26 +0800
Subject: [PATCH 446/529] Update arm cpu depthwise convolution based on latest
 code

Add x86 cpu back and dilation support.
---
 topi/python/topi/arm_cpu/conv2d.py           | 130 ++++---
 topi/python/topi/arm_cpu/depthwise_conv2d.py | 366 ++++++++++++++-----
 2 files changed, 357 insertions(+), 139 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 22c9d2368de3..5d8f1dc9a62d 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -11,7 +11,8 @@
 
 from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
 from ..util import traverse_inline, get_const_tuple, const_matrix
-from ..nn import dilate, pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
+from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \
+                 conv2d_winograd_without_weight_transform, depthwise_conv2d_nchw
 from ..nn.util import get_const_int, get_pad_tuple
 
 @autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct'])
@@ -548,54 +549,81 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     N, CI, H, W = get_const_tuple(data.shape)
     CO, _, KH, KW = get_const_tuple(kernel.shape)
 
-    # query config of this workload
-    workload = autotvm.task.args_to_workload(
-        [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
-    target = tvm.target.current_target()
-    dispatch_ctx = autotvm.DispatchContext.current
-    cfg = dispatch_ctx.query(target, workload)
-
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
+    if groups == 1:
+        # query config of this workload
+        workload = autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
+        target = tvm.target.current_target()
+        dispatch_ctx = autotvm.DispatchContext.current
+        cfg = dispatch_ctx.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
+        if cfg.template_key == 'direct':  # pack weight tensor
+            VC = cfg['tile_co'].size[-1]
+            new_attrs['kernel_layout'] = 'OIHW%do' % VC
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
+            dispatch_ctx.update(target, new_workload, cfg)
+
+            return sym.conv2d(*copy_inputs, **new_attrs)
+        else:  # pre-compute weight transformation in winograd
+            if "-device=arm_cpu" in target.options:
+                tile_size = 4
+                VC = cfg['tile_k'].size[-1]
+            else:
+                from ..mali.conv2d import _pick_tile_size
+                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
+                VC = cfg['tile_bna'].val
+
+            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1], tile_size=tile_size)
+            weight = sym.reshape(weight,
+                                 shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
+            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
+
+            copy_inputs[1] = weight
+            new_attrs['tile_size'] = tile_size
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC),
+                                         kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_weight, strides, padding, dilation,
+                 new_attrs['layout'], out_dtype, tile_size],
+                conv2d_winograd_without_weight_transform)
+            dispatch_ctx.update(target, new_workload, cfg)
+
+            return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+    else:
+        workload = autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw)
+        target = tvm.target.current_target()
+        dispatch_ctx = autotvm.DispatchContext.current
+        cfg = dispatch_ctx.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
+            return None
+
+        if cfg.template_key == 'direct':
+            VC = cfg['tile_co'].size[-1]
+            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            CO, M, KH, KW = get_const_tuple(kernel.shape)
+            new_kernel = tvm.placeholder((CO // VC, M, KH, KW, VC), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, out_dtype],
+                depthwise_conv2d_nchw)
+            dispatch_ctx.update(target, new_workload, cfg)
+
+            return sym.conv2d(*copy_inputs, **new_attrs)
 
-    if cfg.template_key == 'direct':  # pack weight tensor
-        VC = cfg['tile_co'].size[-1]
-        new_attrs['kernel_layout'] = 'OIHW%do' % VC
-
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return sym.conv2d(*copy_inputs, **new_attrs)
-    else:  # pre-compute weight transformation in winograd
-        if "-device=arm_cpu" in target.options:
-            tile_size = 4
-            VC = cfg['tile_k'].size[-1]
-        else:
-            from ..mali.conv2d import _pick_tile_size
-            tile_size = _pick_tile_size(tinfos[0], tinfos[1])
-            VC = cfg['tile_bna'].val
-
-        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1], tile_size=tile_size)
-        weight = sym.reshape(weight,
-                             shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
-        weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
-
-        copy_inputs[1] = weight
-        new_attrs['tile_size'] = tile_size
-
-        # Store the same config for the altered operator (workload)
-        new_data = data
-        new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC),
-                                     kernel.dtype)
-        new_workload = autotvm.task.args_to_workload(
-            [new_data, new_weight, strides, padding, dilation,
-             new_attrs['layout'], out_dtype, tile_size],
-            conv2d_winograd_without_weight_transform)
-        dispatch_ctx.update(target, new_workload, cfg)
-
-        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index 2556af36e5f9..e486142b80e6 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -5,15 +5,173 @@
 from tvm import autotvm
 
 from ..generic import schedule_depthwise_conv2d_nchw
-from ..nn import depthwise_conv2d_nchw
-from ..util import traverse_inline
+from ..nn import depthwise_conv2d_nchw, pad
+from ..util import traverse_inline, get_const_tuple, get_const_int
+from ..nn.util import get_pad_tuple
 
-# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-autotvm.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct',
-                              depthwise_conv2d_nchw.fdefault)
+@autotvm.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], ['direct'])
+def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """TOPI compute callback for depthwise_conv2d nchw
 
-# register customized schedule for arm cpu.
-@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, multiplier, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, multiplier, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2)
+
+
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
+    out_dtype = out_dtype or data.dtype
+
+    N, C, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if len(kernel.shape) == 4:
+        pre_packed = False
+        C, M, KH, KW = get_const_tuple(kernel.shape)
+    else:  # kernel tensor is pre packed
+        pre_packed = True
+        C, M, KH, KW, VC = get_const_tuple(kernel.shape)
+        C = C * VC
+
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
+    # pack data
+    HPAD = pad_top + pad_down
+    WPAD = pad_left + pad_right
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right),
+                       name="data_pad")
+    else:
+        data_pad = data
+
+    # fallback support
+    # Currently, Mali schedule doesn't use it like conv2d.
+    if cfg.is_fallback:
+        ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'depthwise_conv2d_nchw',
+                                                    'direct')
+        cfg.fallback_with_reference_log(ref_log)
+
+    # ==================== define configuration space ====================
+    n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW)
+    kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW)
+
+    # Currently, Mali schedule doesn't use it like conv2d.
+    # Leave num_tile for possible future use of Mali schedule
+    if num_tile == 2:     # for arm cpu
+        co, vc = cfg.define_split('tile_co', c, num_outputs=2)
+        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
+        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
+    else:
+        raise RuntimeError("Invalid num_tile")
+
+    cfg.define_reorder("reorder_0",
+                       [n, co, oh, ow, kh, kw, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, kh, kw, vh, vw, vc],
+                           [n, co, oh, ow, kh, kw, vc, vh, vw]])
+
+    cfg.define_reorder("reorder_1",
+                       [n, co, oh, ow, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, vh, vw, vc],
+                           [n, co, oh, ow, vc, vh, vw],
+                           [n, co, oh, ow, vh, vc, vw]])
+
+    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
+    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+    # ====================================================================
+
+    VC = cfg["tile_co"].size[-1]
+    VH = cfg["tile_oh"].size[-1]
+    VW = cfg["tile_ow"].size[-1]
+
+    kvshape = (C // VC, M, KH, KW, VC)
+    ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC)
+    oshape = (N, C * M, OH, OW)
+
+    if dilation_h != 1 or dilation_w != 1:
+        # undilate input data
+        dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, c, kh, kw, vh, vw:
+                               data_pad[n][c][(h * VH + vh) * HSTR + kh * dilation_h]
+                               [(w*VW+vw)*WSTR+kw*dilation_w],
+                               name='data_vec_undilated')
+    else:
+        dvshape = (N, OH // VH, OW // VW, C, VH*HSTR + KH-1, VW*WSTR + KW-1)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, c, vh, vw:
+                               data_pad[n][c][h * VH * HSTR + vh][w * VW * WSTR + vw],
+                               name='data_vec')
+
+    if pre_packed:
+        kernel_vec = kernel
+    else:
+        kernel_vec = tvm.compute(kvshape, lambda co, m, kh, kw, vc:
+                                 kernel[co*VC+vc][m][kh][kw],
+                                 name='kernel_vec')
+
+    kh = tvm.reduce_axis((0, KH), name='kh')
+    kw = tvm.reduce_axis((0, KW), name='kw')
+
+    if dilation_h != 1 or dilation_w != 1:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+                          tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, kh, kw, vh, vw]
+                                  .astype(out_dtype) *
+                                  kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
+                                  axis=[kh, kw]), name='conv')
+    else:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+                           tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, vh * HSTR + kh,
+                                            vw * WSTR + kw].astype(out_dtype) *
+                                   kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
+                                   axis=[kh, kw]), name='conv')
+
+    output = tvm.compute(oshape, lambda n, co, h, w:
+                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
+                         name='output_unpack', tag='spatial_depthwise_conv_nchw_output')
+    return output
+
+
+# register customized schedule for arm cpu / x86 cpu.
+@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
 def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
@@ -30,91 +188,123 @@ def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     s: Schedule
         The computation schedule for depthwise_conv2d nchw.
     """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
 
-    def _schedule(cfg, s, data, data_pad, kernel, output):
-        A, B, C = data, kernel, output
-        s[data_pad].compute_inline()
-
-        ##### space definition begin #####
-        n, c, h, w = s[output].op.axis
-        _, vc = cfg.define_split('tile_c', c, num_outputs=2)
-        _, vh = cfg.define_split('tile_h', h, num_outputs=2)
-        _, vw = cfg.define_split('tile_w', w, num_outputs=2)
-        cfg.define_annotate('ann', [vh, vw, vc], policy='try_unroll_vec')
-
-        # fallback support
-        if cfg.is_fallback:
-            ref_log = autotvm.tophub.load_reference_log(
-                'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', 'direct')
-            cfg.fallback_with_reference_log(ref_log)
-        ##### space definition end #####
-
-        # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
-        A0 = s.cache_read(data_pad, "global", C)
-        n, c, h, w = s[A0].op.axis
-        c, vc = cfg['tile_c'].apply(s, A0, c)
-        s[A0].reorder(n, c, h, w, vc)
-        A1 = s.cache_write(A0, 'global')
-        s[A0].compute_inline()
-
-        # park kernel to vector form  [co, ci, kh, kw] -> [CO, ci, kh, kw, VC]
-        B0 = s.cache_read(B, "global", C)
-        c, m, h, w = s[B0].op.axis
-        c, vc, = cfg['tile_c'].apply(s, B0, c)
-        s[B0].reorder(c, m, h, w, vc)
-        B1 = s.cache_write(B0, 'global')
-        s[B0].compute_inline()
-
-        n, c, h, w = s[C].op.axis
-        c, vc, = cfg['tile_c'].apply(s, C, c)
-        s[C].reorder(n, c, h, w, vc)
-
-        # depthwise conv
-        C0 = s.cache_write(C, 'global')
-        _, c, h, w, vc = s[C0].op.axis
-        dh, dw = s[C0].op.reduce_axis
-        oh, ih = cfg['tile_h'].apply(s, C0, h)
-        ow, iw = cfg['tile_w'].apply(s, C0, w)
-        s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
-        s[A1].compute_at(s[C0], oh)
-
-        # try unroll and vectorization
-        cfg['ann'].apply(s, C0, [ih, iw, vc],
-                         axis_lens=[cfg['tile_h'].size[-1],
-                                    cfg['tile_w'].size[-1],
-                                    cfg['tile_c'].size[-1]],
-                         max_unroll=16,
-                         cfg=cfg)
-
-        # fusion
-        if C.op not in s.outputs:
-            s[C].compute_inline()
-
-        # mark parallel
-        last = outs[0]
-        n, c, h, w = s[last].op.axis
-        s[last].parallel(c)
-
-        n, c, h, w, vc = s[C0].op.axis
-        s[C0].parallel(c)
-
-        c, m, h, w, vc = s[B1].op.axis
-        s[B1].parallel(c)
-
-        return s
-
     def _callback(op):
-        if op.tag == 'depthwise_conv2d_nchw':
+        if 'spatial_depthwise_conv_nchw_output' in op.tag:
             output = op.output(0)
-            kernel = op.input_tensors[1]
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-            _schedule(cfg, s, data, data_pad, kernel, output)
+            conv = op.input_tensors[0]
 
+            data_vec = conv.op.input_tensors[0]
+
+            kernel_vec = conv.op.input_tensors[1]
+            if kernel_vec.op.name == 'kernel_vec':
+                kernel = kernel_vec.op.input_tensors[0]
+            else:
+                kernel = kernel_vec
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
     traverse_inline(s, outs[0].op, _callback)
     return s
+
+
+def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
+                           conv, output, last):
+    """schedule implementation"""
+    n, co, oh, ow, vh, vw, vc = s[conv].op.axis
+    kh, kw = s[conv].op.reduce_axis
+
+    if data_vec.op.name == 'data_vec_undilated':
+        _, dv_oh, dv_ow, dv_c, _, _, dv_vh, dv_vw = s[data_vec].op.axis
+    else:
+        _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
+
+    _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
+    data_pad = data_vec.op.input_tensors[0]
+    if data_pad.op.name == "data_pad":
+        assert isinstance(data_pad.op, tvm.tensor.ComputeOp)
+        has_padding = True
+    else:
+        assert isinstance(data_pad.op, tvm.tensor.PlaceholderOp)
+        has_padding = False
+
+    cfg.define_knob('data_pad_inline', [0, 1, 2, 3, 4])
+
+    if cfg['data_pad_inline'].val == 1 and has_padding:
+        s[data_pad].compute_inline()
+    if cfg['data_pad_inline'].val == 2 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+    if cfg['data_pad_inline'].val == 3 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+        s[data_pad].compute_at(s[data_vec], dv_oh)
+    if cfg['data_pad_inline'].val == 4 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+        s[data_pad].compute_at(s[data_vec], dv_ow)
+
+    cfg.define_knob('data_vec_inline', [0, 1, 2, 3])
+    if cfg['data_vec_inline'].val == 1:
+        s[data_vec].compute_at(s[conv], oh)
+    if cfg['data_vec_inline'].val == 2:
+        s[data_vec].compute_at(s[conv], ow)
+    if cfg['data_vec_inline'].val == 3:
+        s[data_vec].compute_at(s[conv], co)
+
+    # schedule conv
+    cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, kh, kw, vh, vw, vc])
+    cfg["ann_reduce"].apply(s, conv, [kh, kw],
+                            axis_lens=[get_const_int(kh.dom.extent),
+                                       get_const_int(kw.dom.extent)],
+                            max_unroll=16,
+                            cfg=cfg)
+    cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
+                             axis_lens=[cfg['tile_oh'].size[-1],
+                                        cfg['tile_ow'].size[-1],
+                                        cfg['tile_co'].size[-1]],
+                             max_unroll=16,
+                             cfg=cfg)
+
+    # schedule fusion
+    n, co, h, w = s[last].op.axis
+    co, vc = cfg['tile_co'].apply(s, last, co)
+    oh, vh = cfg['tile_oh'].apply(s, last, h)
+    ow, vw = cfg['tile_ow'].apply(s, last, w)
+    cfg["reorder_1"].apply(s, last, [n, co, oh, ow, vh, vw, vc])
+    if last != output:
+        s[output].compute_inline()
+        cfg["ann_spatial"].apply(s, last, [vh, vw, vc],
+                                 axis_lens=[cfg['tile_oh'].size[-1],
+                                            cfg['tile_ow'].size[-1],
+                                            cfg['tile_co'].size[-1]],
+                                 max_unroll=16,
+                                 cfg=cfg)
+    else:
+        s[last].vectorize(vw)
+    cfg.define_knob('conv_inline', [0, 1, 2, 3])
+    if cfg['conv_inline'].val == 1:
+        s[conv].compute_at(s[last], ow)
+    if cfg['conv_inline'].val == 2:
+        s[conv].compute_at(s[last], oh)
+    if cfg['conv_inline'].val == 3:
+        s[conv].compute_at(s[last], co)
+
+    # mark parallel
+    s[last].parallel(co)
+
+    if data_vec.op.name == 'data_vec_undilated':
+        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
+    else:
+        _, h, _, _, _, _ = s[data_vec].op.axis
+    s[data_vec].parallel(h)
+
+    if kernel_vec.op.name == 'kernel_vec':
+        co, _, _, _, _ = s[kernel_vec].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel packing will be pre-computed during compliation, so we skip
+            # this part to make tuning records correct
+            s[kernel_vec].pragma(co, 'debug_skip_region')
+        else:
+            s[kernel_vec].parallel(co)
+
+    return s

From 5d60d6f08329ceed57313bf8a4fe065970f552e0 Mon Sep 17 00:00:00 2001
From: wuzhao <wuzhaozju@gmail.com>
Date: Sun, 2 Dec 2018 00:40:44 +0800
Subject: [PATCH 447/529] Modify lint issue

---
 topi/python/topi/arm_cpu/conv2d.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 5d8f1dc9a62d..185df7d460b5 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -582,7 +582,8 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
                 tile_size = _pick_tile_size(tinfos[0], tinfos[1])
                 VC = cfg['tile_bna'].val
 
-            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1], tile_size=tile_size)
+            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
+                                                                  tile_size=tile_size)
             weight = sym.reshape(weight,
                                  shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
             weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

From 41d32950f8296f86dee545f623440adf166ae05a Mon Sep 17 00:00:00 2001
From: wuzhao <wuzhaozju@gmail.com>
Date: Sun, 2 Dec 2018 01:03:53 +0800
Subject: [PATCH 448/529] Fix depthwise convolution infer shape error and lint
 issue.

---
 nnvm/src/top/nn/convolution.cc     | 7 ++++---
 topi/python/topi/arm_cpu/conv2d.py | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 813947492117..11f1989e61dc 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -77,10 +77,11 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
                  dshape[1] / param.groups,
                  param.kernel_size[0],
                  param.kernel_size[1]});
-
-  wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
-
+  // Restore depthwise conv2d kernel layout
+  // otherwise we will get error if we split output channel
+  // of depthwise conv2d kernel (because it will be 1 if we don't restore).
   wshape[kernel_layout.indexof('O')] *= param.groups;
+  wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
 
   if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) {
     NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 185df7d460b5..b64d2b92a334 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -539,7 +539,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     out_dtype = attrs["out_dtype"]
     out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
 
-    if layout != 'NCHW' or groups != 1:
+    if layout != 'NCHW':
         return None
     if dilation != (1, 1):
         warnings.warn("Does not support weight pre-transform for dilated convolution.")
@@ -627,4 +627,6 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
             dispatch_ctx.update(target, new_workload, cfg)
 
             return sym.conv2d(*copy_inputs, **new_attrs)
-
+        else:
+            # add more schedule templates
+            return None

From b0664d661a72ad5f53a4591beed253b2158e86ec Mon Sep 17 00:00:00 2001
From: wuzhao <wuzhaozju@gmail.com>
Date: Sun, 2 Dec 2018 02:32:50 +0800
Subject: [PATCH 449/529] Fix conv2d infer shape issue in HWOI kernel layout

---
 nnvm/src/top/nn/convolution.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 11f1989e61dc..df81c47823d9 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -73,14 +73,14 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(param.channels % param.groups, 0U)
       << "output channels must divide group size";
 
-  TShape wshape({param.channels / param.groups,
+  // Restore depthwise conv2d kernel layout
+  // otherwise we will get error if we split output channel
+  // of depthwise conv2d kernel (because it will be 1 if
+  // use param.channels divide param.groups).
+  TShape wshape({param.channels,
                  dshape[1] / param.groups,
                  param.kernel_size[0],
                  param.kernel_size[1]});
-  // Restore depthwise conv2d kernel layout
-  // otherwise we will get error if we split output channel
-  // of depthwise conv2d kernel (because it will be 1 if we don't restore).
-  wshape[kernel_layout.indexof('O')] *= param.groups;
   wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
 
   if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) {

From 5d9efaf7a3473215ca0ca8d03d1b789ae04216ed Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Sun, 2 Dec 2018 14:23:52 +0800
Subject: [PATCH 450/529] [RELAY][PASS] Memorize FoldScaleAxis backward
 transform result (#2214)

---
 src/relay/pass/fold_scale_axis.cc             | 28 +++++--
 .../python/relay/test_pass_fold_scale_axis.py | 76 +++++++++++++++++++
 2 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index c56ee98a3969..1cd6606bd5c1 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -556,9 +556,7 @@ class BackwardTransformerNode :
    * \return The result of transformation.
    */
   Expr Transform(const Expr& expr, AxesSet axes, Expr scale) {
-    // NOTE: the result of Transform is not memoized.
-    // However, in the current rule, Transform will
-    // only be called to expr that is referred once.
+    // NOTE: the result of Transform is memoized.
     if (const CallNode* call_node = expr.as<CallNode>()) {
       return Transform(call_node, axes, scale);
     } else {
@@ -572,7 +570,14 @@ class BackwardTransformerNode :
    * \return the result of the call Mutation.
    */
   Expr NormalCallTransform(const CallNode* call_node) {
-    return ExprMutator::VisitExpr_(call_node);
+    const Call call = GetRef<Call>(call_node);
+    const auto it = memo_.find(call);
+    if (it != memo_.end()) {
+      return it->second;
+    }
+    Expr new_expr = ExprMutator::VisitExpr_(call_node);
+    memo_[call] = new_expr;
+    return new_expr;
   }
   /*!
    * \brief Get the expected axes on expr.
@@ -620,10 +625,17 @@ Expr BackwardTransformerNode::Transform(
       Op::GetAttr<FBackwardTransform>("FScaleAxisBackwardTransform");
   auto f = ftransform.get(call_node->op, nullptr);
   if (f != nullptr) {
-    return f(GetRef<Call>(call_node),
-             axes,
-             scale,
-             GetRef<BackwardTransformer>(this));
+    const Call call = GetRef<Call>(call_node);
+    const auto it = memo_.find(call);
+    if (it != memo_.end()) {
+      return it->second;
+    }
+    Expr new_expr = f(GetRef<Call>(call_node),
+                      axes,
+                      scale,
+                      GetRef<BackwardTransformer>(this));
+    memo_[call] = new_expr;
+    return new_expr;
   } else {
     CHECK(!axes.defined()) << "outstanding scale";
     return NormalCallTransform(call_node);
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index a5a7a05a974c..e6e008f80d0c 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -268,6 +268,81 @@ def check(shape, channels):
     check((2, 4, 10, 10), 8)
 
 
+def test_fold_bwd_dual_consumer():
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        y0 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y0 = relay.multiply(y0, out_scale)
+        y0 = relay.nn.relu(y0)
+
+        y1 = relay.nn.conv2d(y0, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.multiply(y1, out_scale)
+        y1 = relay.nn.relu(y1)
+
+        y2 = relay.nn.conv2d(y0, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.multiply(y2, out_scale)
+        y2 = relay.nn.relu(y2)
+
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias, out_scale]
+        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        def fold_conv_weight():
+            squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+            return  relay.multiply(
+                conv_weight ,
+                relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+        y0 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y0 = relay.nn.relu(y0)
+        y1 = relay.nn.conv2d(y0, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(y0, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.var("out_scale", shape=(channels,))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 4)
+
+
 def test_fold_bwd_fail():
     """Dual path testcase."""
     def fail1(x, conv_weight, out_bias, out_scale, channels):
@@ -327,4 +402,5 @@ def check(shape, channels, fbefore):
     test_fold_fwd_fail()
     test_fold_bwd_simple()
     test_fold_bwd_dual_path()
+    test_fold_bwd_dual_consumer()
     test_fold_bwd_fail()

From 0fd635ca9697f55ceefd5efdfe9d4e26c2320e7c Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Sat, 1 Dec 2018 22:24:16 -0800
Subject: [PATCH 451/529] Run verifier during LLVM code generation (#2211)

---
 src/codegen/llvm/llvm_common.h  | 1 +
 src/codegen/llvm/llvm_module.cc | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index 9f162ee5c6d7..89008bb480d2 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -26,6 +26,7 @@
 #include <llvm/IR/Type.h>
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/MDBuilder.h>
+#include <llvm/IR/Verifier.h>
 
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/Transforms/Utils/Cloning.h>
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 495f9982022a..25de224e44cd 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -168,6 +168,11 @@ class LLVMModuleNode final : public runtime::ModuleNode {
     }
     cg->AddMainFunction(funcs[0]->name);
     module_ = cg->Finish();
+    std::string verify_errors_storage;
+    llvm::raw_string_ostream verify_errors(verify_errors_storage);
+    LOG_IF(FATAL, llvm::verifyModule(*module_, &verify_errors))
+        << "LLVM module verification failed with the following errors: \n"
+        << verify_errors.str();
     module_->addModuleFlag(
         llvm::Module::Warning, "tvm_target",
         llvm::MDString::get(*ctx_, target));

From e6f5e2189745a3721dc00fb78592507c0d7f850d Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 2 Dec 2018 11:56:38 +0530
Subject: [PATCH 452/529] [RELAY][OP] end to end support for pad op. (#2213)

---
 python/tvm/relay/op/nn/_nn.py        |  3 +++
 src/relay/op/nn/pad.cc               | 30 +++++++++++++++++++++++++++-
 tests/python/relay/test_op_level2.py | 16 +++++++++++++++
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index ebfda0ab4c50..007888996ed5 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -251,3 +251,6 @@ def schedule_upsampling(_, outs, target):
         return topi.generic.schedule_injective(outs)
 
 reg.register_pattern("nn.upsampling", OpPattern.INJECTIVE)
+
+# pad
+reg.register_schedule("nn.pad", schedule_broadcast)
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index 5403d0620e50..dc99f05f4d2d 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -6,8 +6,10 @@
 #include <tvm/ir_operator.h>
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
+#include <topi/nn.h>
 #include <vector>
 #include "../layout.h"
+#include "../op_common.h"
 
 namespace tvm {
 namespace relay {
@@ -60,6 +62,30 @@ bool PadRel(const Array<Type>& types,
   return true;
 }
 
+Array<Tensor> PadCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  const auto* param = attrs.as<PadAttrs>();
+  CHECK(param != nullptr);
+
+  auto pad_width = param->pad_width;
+  CHECK(pad_width.size() == inputs[0].ndim() &&
+    pad_width[0].size() == 2)
+    << "Illegal pad_width";
+  Array<IndexExpr> pad_before;
+  for (size_t i = 0; i < pad_width.size(); ++i) {
+    pad_before.push_back(pad_width[i][0]);
+  }
+  Array<IndexExpr> pad_after;
+  for (size_t i = 0; i < pad_width.size(); ++i) {
+    pad_after.push_back(pad_width[i][1]);
+  }
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after,
+                                  tvm::make_const(out_ttype->dtype, param->pad_value)) };
+}
+
 // Handler to create a call to the padding op used by front-end FFI
 Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
   auto attrs = make_node<PadAttrs>();
@@ -82,7 +108,9 @@ RELAY_REGISTER_OP("nn.pad")
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
-.add_type_rel("Pad", PadRel);
+.add_type_rel("Pad", PadRel)
+.set_attr<TOpPattern>("TOpPattern", kInjective)
+.set_attr<FTVMCompute>("FTVMCompute", PadCompute);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 2060b44017d3..0544ee49d159 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -330,6 +330,21 @@ def test_pad_infer_type():
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
 
+def test_pad_run():
+    def _test_run(dtype):
+        dshape = (4, 10, 7, 7)
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.pad(x, ((1, 1), (2, 2), (3, 3), (4, 4)))
+        func = relay.Function([x], y)
+        data = np.random.uniform(size=dshape).astype(dtype)
+        ref_res = np.pad(data, ((1, 1), (2, 2), (3, 3), (4, 4)), 'constant')
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+    _test_run('float32')
+    _test_run('int32')
 
 def test_lrn():
     n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
@@ -457,6 +472,7 @@ def test_upsampling():
     test_upsampling_infer_type()
     test_flatten_infer_type()
     test_pad_infer_type()
+    test_pad_run()
     test_conv2d_transpose_infer_type()
     test_conv2d_transpose_run()
     test_conv2d_run()

From 8f89bbed9fe6518577e6a13b04544186d48e4af8 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Sat, 1 Dec 2018 22:27:49 -0800
Subject: [PATCH 453/529] [DOC][Relay]: Add API docs for Relay. (#1750)

---
 docs/api/python/index.rst               |  1 +
 docs/api/python/relay/backend.rst       | 16 ++++++++
 docs/api/python/relay/base.rst          | 16 ++++++++
 docs/api/python/relay/build_module.rst  | 19 +++++++++
 docs/api/python/relay/expr.rst          | 53 +++++++++++++++++++++++++
 docs/api/python/relay/frontend.rst      |  7 ++++
 docs/api/python/relay/image.rst         |  9 +++++
 docs/api/python/relay/index.rst         | 25 ++++++++++++
 docs/api/python/relay/ir_pass.rst       |  4 ++
 docs/api/python/relay/module.rst        |  7 ++++
 docs/api/python/relay/nn.rst            |  7 ++++
 docs/api/python/relay/op.rst            | 25 ++++++++++++
 docs/api/python/relay/scope_builder.rst |  7 ++++
 docs/api/python/relay/ty.rst            | 32 +++++++++++++++
 docs/api/python/relay/vision.rst        | 12 ++++++
 python/tvm/relay/backend/interpreter.py |  6 ++-
 python/tvm/relay/base.py                | 20 ++++++----
 python/tvm/relay/build_module.py        | 12 +++---
 python/tvm/relay/frontend/__init__.py   |  8 +++-
 python/tvm/relay/image.py               |  2 +-
 python/tvm/relay/scope_builder.py       |  8 ++--
 21 files changed, 274 insertions(+), 22 deletions(-)
 create mode 100644 docs/api/python/relay/backend.rst
 create mode 100644 docs/api/python/relay/base.rst
 create mode 100644 docs/api/python/relay/build_module.rst
 create mode 100644 docs/api/python/relay/expr.rst
 create mode 100644 docs/api/python/relay/frontend.rst
 create mode 100644 docs/api/python/relay/image.rst
 create mode 100644 docs/api/python/relay/index.rst
 create mode 100644 docs/api/python/relay/ir_pass.rst
 create mode 100644 docs/api/python/relay/module.rst
 create mode 100644 docs/api/python/relay/nn.rst
 create mode 100644 docs/api/python/relay/op.rst
 create mode 100644 docs/api/python/relay/scope_builder.rst
 create mode 100644 docs/api/python/relay/ty.rst
 create mode 100644 docs/api/python/relay/vision.rst

diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst
index 59bd1795b7ec..ddad9d10f8f9 100644
--- a/docs/api/python/index.rst
+++ b/docs/api/python/index.rst
@@ -24,3 +24,4 @@ Python API
    vta/index
    nnvm/index
    hybrid
+   relay/index
diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst
new file mode 100644
index 000000000000..5cbc250b55ba
--- /dev/null
+++ b/docs/api/python/relay/backend.rst
@@ -0,0 +1,16 @@
+tvm.relay.backend
+-----------------
+
+.. automodule:: tvm.relay.backend
+
+Interpreter
+-----------
+
+.. automodule:: tvm.relay.backend.interpreter
+    :members:
+
+.. automodule:: tvm.relay.backend.compile_engine
+    :members:
+
+.. automodule:: tvm.relay.backend.graph_runtime_codegen
+    :members:
diff --git a/docs/api/python/relay/base.rst b/docs/api/python/relay/base.rst
new file mode 100644
index 000000000000..72315dca0193
--- /dev/null
+++ b/docs/api/python/relay/base.rst
@@ -0,0 +1,16 @@
+tvm.relay.base
+--------------
+.. automodule:: tvm.relay.base
+
+.. autofunction:: tvm.relay.base.register_relay_node
+
+.. autofunction:: tvm.relay.base.register_relay_attr_node
+
+.. autoclass:: tvm.relay.base.RelayNode
+    :members:
+
+.. autoclass:: tvm.relay.base.Span
+    :members:
+
+.. autoclass:: tvm.relay.base.Id
+    :members:
diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst
new file mode 100644
index 000000000000..a278940f0fd5
--- /dev/null
+++ b/docs/api/python/relay/build_module.rst
@@ -0,0 +1,19 @@
+tvm.relay.build_module
+----------------------
+
+.. automodule:: tvm.relay.build_module
+
+.. autofunction:: tvm.relay.build_module.build
+
+.. autofunction:: tvm.relay.build_module.optimize
+
+.. autofunction:: tvm.relay.build_module.create_executor
+
+.. autoclass:: tvm.relay.build_module.BuildConfig
+    :members:
+
+.. autofunction:: tvm.relay.build_module.build_config
+    :members:
+
+.. autoclass:: tvm.relay.build_module.GraphExecutor
+    :members:
diff --git a/docs/api/python/relay/expr.rst b/docs/api/python/relay/expr.rst
new file mode 100644
index 000000000000..540d6bfbab65
--- /dev/null
+++ b/docs/api/python/relay/expr.rst
@@ -0,0 +1,53 @@
+tvm.relay.expr
+--------------
+
+.. automodule:: tvm.relay.expr
+
+.. autofunction:: tvm.relay.expr.var
+
+.. autofunction:: tvm.relay.expr.const
+
+.. autofunction:: tvm.relay.expr.bind
+
+.. autoclass:: tvm.relay.expr.Expr
+    :members:
+
+.. autoclass:: tvm.relay.expr.Constant
+    :members:
+
+.. autoclass:: tvm.relay.expr.Tuple
+    :members:
+
+.. autoclass:: tvm.relay.expr.Var
+    :members:
+
+.. autoclass:: tvm.relay.expr.GlobalVar
+    :members:
+
+.. autoclass:: tvm.relay.expr.Function
+    :members:
+
+.. autoclass:: tvm.relay.expr.Call
+    :members:
+
+.. autoclass:: tvm.relay.expr.Let
+    :members:
+
+.. autoclass:: tvm.relay.expr.If
+    :members:
+
+.. autoclass:: tvm.relay.expr.TupleGetItem
+    :members:
+
+.. autoclass:: tvm.relay.expr.TempExpr
+    :members:
+
+.. autoclass:: tvm.relay.expr.ExprFunctor
+    :members:
+
+.. autoclass:: tvm.relay.expr.ExprMutator
+    :members:
+
+.. autoclass:: tvm.relay.expr.TupleWrapper
+    :members
+
diff --git a/docs/api/python/relay/frontend.rst b/docs/api/python/relay/frontend.rst
new file mode 100644
index 000000000000..a418e042bf3d
--- /dev/null
+++ b/docs/api/python/relay/frontend.rst
@@ -0,0 +1,7 @@
+
+tvm.relay.frontend
+------------------
+
+.. automodule:: tvm.relay.frontend
+
+.. autofunction:: tvm.relay.frontend.from_mxnet
diff --git a/docs/api/python/relay/image.rst b/docs/api/python/relay/image.rst
new file mode 100644
index 000000000000..223213eca8e3
--- /dev/null
+++ b/docs/api/python/relay/image.rst
@@ -0,0 +1,9 @@
+
+tvm.relay.image
+---------------
+
+.. automodule:: tvm.relay.image
+    :members:
+
+.. automodule:: tvm.relay.op.image.image
+    :members:
diff --git a/docs/api/python/relay/index.rst b/docs/api/python/relay/index.rst
new file mode 100644
index 000000000000..da3d3a912dd0
--- /dev/null
+++ b/docs/api/python/relay/index.rst
@@ -0,0 +1,25 @@
+Relay API
+=========
+
+This document contains the Python API for the Relay frontend, optimizer, and
+compiler toolchain.
+
+Relay is the second-generation, high-level intermediate representation (IR) for the TVM
+compiler stack.
+
+.. toctree::
+   :maxdepth: 2
+
+   backend
+   base
+   build_module
+   expr
+   frontend
+   image
+   ir_pass
+   module
+   nn
+   op
+   scope_builder
+   ty
+   vision
diff --git a/docs/api/python/relay/ir_pass.rst b/docs/api/python/relay/ir_pass.rst
new file mode 100644
index 000000000000..d02ef4d94b0a
--- /dev/null
+++ b/docs/api/python/relay/ir_pass.rst
@@ -0,0 +1,4 @@
+tvm.relay.ir_pass
+-----------------
+.. automodule:: tvm.relay.ir_pass
+    :members:
\ No newline at end of file
diff --git a/docs/api/python/relay/module.rst b/docs/api/python/relay/module.rst
new file mode 100644
index 000000000000..ec9642b484ba
--- /dev/null
+++ b/docs/api/python/relay/module.rst
@@ -0,0 +1,7 @@
+tvm.relay.module
+----------------
+
+.. automodule:: tvm.relay.module
+
+.. autoclass:: tvm.relay.module.Module
+    :members:
diff --git a/docs/api/python/relay/nn.rst b/docs/api/python/relay/nn.rst
new file mode 100644
index 000000000000..8e3f47f7bead
--- /dev/null
+++ b/docs/api/python/relay/nn.rst
@@ -0,0 +1,7 @@
+tvm.relay.nn
+------------
+.. automodule:: tvm.relay.nn
+    :members:
+
+.. automodule:: tvm.relay.op.nn.nn
+    :members:
diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst
new file mode 100644
index 000000000000..7413a818f73f
--- /dev/null
+++ b/docs/api/python/relay/op.rst
@@ -0,0 +1,25 @@
+tvm.relay.op
+------------
+.. automodule:: tvm.relay.op
+    :members:
+
+.. automodule:: tvm.relay.op.op
+    :members:
+
+.. automodule:: tvm.relay.op.reduce
+    :members:
+
+.. automodule:: tvm.relay.op.tensor
+    :members:
+
+.. automodule:: tvm.relay.op.transform
+    :members:
+
+.. automodule:: tvm.relay.op.nn.nn
+    :members:
+
+.. automodule:: tvm.relay.op.vision.multibox
+    :members:
+
+.. automodule:: tvm.relay.op.vision.nms
+    :members:
diff --git a/docs/api/python/relay/scope_builder.rst b/docs/api/python/relay/scope_builder.rst
new file mode 100644
index 000000000000..19fca89bf2d2
--- /dev/null
+++ b/docs/api/python/relay/scope_builder.rst
@@ -0,0 +1,7 @@
+tvm.relay.scope_builder
+-----------------------
+
+.. automodule:: tvm.relay.scope_builder
+
+.. autoclass:: tvm.relay.scope_builder.ScopeBuilder
+    :members:
diff --git a/docs/api/python/relay/ty.rst b/docs/api/python/relay/ty.rst
new file mode 100644
index 000000000000..edf15275db03
--- /dev/null
+++ b/docs/api/python/relay/ty.rst
@@ -0,0 +1,32 @@
+tvm.relay.ty
+------------
+
+.. automodule:: tvm.relay.ty
+    :members:
+
+.. autoclass:: tvm.relay.ty.Type
+    :members:
+
+.. autoclass:: tvm.relay.ty.TensorType
+    :members:
+
+.. autoclass:: tvm.relay.ty.Kind
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeVar
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeConstraint
+    :members:
+
+.. autoclass:: tvm.relay.ty.TupleType
+    :members:
+
+.. autoclass:: tvm.relay.ty.FuncType
+    :members:
+
+.. autoclass:: tvm.relay.ty.IncompleteType
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeRelation
+    :members:
diff --git a/docs/api/python/relay/vision.rst b/docs/api/python/relay/vision.rst
new file mode 100644
index 000000000000..7751dd688b15
--- /dev/null
+++ b/docs/api/python/relay/vision.rst
@@ -0,0 +1,12 @@
+
+tvm.relay.vision
+----------------
+
+.. automodule:: tvm.relay.vision
+    :members:
+
+.. automodule:: tvm.relay.op.vision.multibox
+    :members:
+
+.. automodule:: tvm.relay.op.vision.nms
+    :members:
diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py
index 5c7401c8c146..ff6cf6aa1d5c 100644
--- a/python/tvm/relay/backend/interpreter.py
+++ b/python/tvm/relay/backend/interpreter.py
@@ -1,5 +1,5 @@
 #pylint: disable=no-else-return
-"""An interface to the Realy interpreter."""
+"""The Python interface to the Relay reference interpreter."""
 from __future__ import absolute_import
 
 import numpy as np
@@ -23,6 +23,7 @@ def from_scalar(value, dtype=None):
 
 @register_relay_node
 class TupleValue(Value):
+    """A tuple value produced by the interpreter."""
     def __init__(self, *fields):
         self.__init_handle_by_constructor__(
             _make.TupleValue, fields)
@@ -33,12 +34,13 @@ def __getitem__(self, field_no):
 
 @register_relay_node
 class Closure(Value):
+    """A closure produced by the interpreter."""
     pass
 
 
 @register_relay_node
 class TensorValue(Value):
-    """A Tensor value produced by the evaluator."""
+    """A Tensor value produced by the interpreter."""
 
     def __init__(self, data):
         """Allocate a new TensorValue and copy the data from `array` into
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
index f1105fe4f0d9..c50013b199ac 100644
--- a/python/tvm/relay/base.py
+++ b/python/tvm/relay/base.py
@@ -8,12 +8,12 @@
 NodeBase = NodeBase
 
 def register_relay_node(type_key=None):
-    """register relay node type
+    """Register a Relay node type.
 
     Parameters
     ----------
     type_key : str or cls
-        The type key of the node
+        The type key of the node.
     """
     if not isinstance(type_key, str):
         return _register_tvm_node(
@@ -22,12 +22,12 @@ def register_relay_node(type_key=None):
 
 
 def register_relay_attr_node(type_key=None):
-    """register relay attribute node
+    """Register a Relay attribute node.
 
     Parameters
     ----------
     type_key : str or cls
-        The type key of the node
+        The type key of the node.
     """
     if not isinstance(type_key, str):
         return _register_tvm_node(
@@ -36,7 +36,7 @@ def register_relay_attr_node(type_key=None):
 
 
 class RelayNode(NodeBase):
-    """Base class of all relay node."""
+    """Base class of all Relay nodes."""
     def astext(self, show_meta_data=True, annotate=None):
         """Get the text format of the expression.
 
@@ -52,8 +52,8 @@ def astext(self, show_meta_data=True, annotate=None):
 
         Note
         ----
-        meta data section is necessary to fully parse the text format.
-        However, it can contain dumps that are big(constat weights),
+        The metadata section is necessary to fully parse the text format.
+        However, it can contain dumps that are big (e.g constant weights)a,
         so it can be helpful to skip printing the meta data section.
 
         Returns
@@ -66,12 +66,16 @@ def astext(self, show_meta_data=True, annotate=None):
 
 @register_relay_node
 class Span(RelayNode):
+    """Specifies a location in a source program."""
+
     def __init__(self, source, lineno, col_offset):
         self.__init_handle_by_constructor__(_make.Span, source, lineno, col_offset)
 
 
 @register_relay_node
 class Id(NodeBase):
-    """Unique identifier(name) for Var across type checking."""
+    """Unique identifier(name) used in Var.
+       Guaranteed to be stable across all passes.
+    """
     def __init__(self):
         raise RuntimeError("Cannot directly construct Id")
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 2a2cd9f82ecb..7af22431aa81 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -240,13 +240,13 @@ class GraphExecutor(_interpreter.Executor):
 
     Parameters
     ----------
-    mod : tvm.relay.Module
+    mod : :py:class:`~tvm.relay.module.Module`
         The module to support the execution.
 
-    ctx : tvm.TVMContext
+    ctx : :py:class:`TVMContext`
         The runtime context to run the code on.
 
-    target : tvm.Target
+    target : :py:class:`Target`
         The target option to build the function.
     """
     def __init__(self, mod, ctx, target):
@@ -282,13 +282,13 @@ def create_executor(kind="debug",
     kind : str
         The type of executor
 
-    mod : tvm.relay.Module
+    mod : :py:class:`~tvm.relay.module.Module`
         The Relay module containing collection of functions
 
-    ctx : tvm.TVMContext
+    ctx : :py:class:`tvm.TVMContext`
         The context to execute the code.
 
-    target : tvm.Target
+    target : :py:class:`tvm.Target`
         The corresponding context
     """
     if ctx is not None:
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
index 28766b9ae3be..2d01174a0d96 100644
--- a/python/tvm/relay/frontend/__init__.py
+++ b/python/tvm/relay/frontend/__init__.py
@@ -1,4 +1,10 @@
-"""Relay frontends."""
+"""
+Frontends for constructing Relay programs.
+
+Contains the model importers currently defined
+for Relay.
+"""
+
 from __future__ import absolute_import
 
 from .mxnet import from_mxnet
diff --git a/python/tvm/relay/image.py b/python/tvm/relay/image.py
index 43cee89b3483..90bb87d71c2e 100644
--- a/python/tvm/relay/image.py
+++ b/python/tvm/relay/image.py
@@ -1,4 +1,4 @@
 # pylint: disable=wildcard-import, unused-import, unused-wildcard-import
-"""Image nets related operators."""
+"""Image network related operators."""
 # Re-export in a specific file name so that autodoc can pick it up
 from .op.image import *
diff --git a/python/tvm/relay/scope_builder.py b/python/tvm/relay/scope_builder.py
index 641566946f58..074a4aa66c81 100644
--- a/python/tvm/relay/scope_builder.py
+++ b/python/tvm/relay/scope_builder.py
@@ -61,7 +61,7 @@ class ScopeBuilder(object):
 
     Examples
     --------
-    ..code-block: python
+    .. code-block: python
 
         sb = relay.ScopeBuilder()
         cond = relay.var("cond", 'bool')
@@ -115,7 +115,7 @@ def if_scope(self, cond):
 
         Parameters
         ----------
-        cond: tvm.relay.Expr
+        cond: tvm.relay.expr.Expr
             The condition
 
         Returns
@@ -165,7 +165,7 @@ def ret(self, value):
 
         Parameters
         ----------
-        value: tvm.relay.Expr
+        value: tvm.relay.expr.Expr
             The return value.
         """
         if self._ret_values[-1] is not None:
@@ -177,7 +177,7 @@ def get(self):
 
         Returns
         -------
-        value: tvm.relay.Expr
+        value: tvm.relay.expr.Expr
             The final result of the expression.
         """
         if len(self._bindings) != 1:

From 1cb2271de13673fd2e4160bda157a4b84bb9b0ec Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 2 Dec 2018 12:09:57 +0530
Subject: [PATCH 454/529] [RELAY] bugfix. (#2215)

---
 python/tvm/relay/op/transform.py     | 6 ++++--
 tests/python/relay/test_op_level3.py | 9 ++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 17caad4bb304..f536e75fd9b4 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -45,8 +45,10 @@ def transpose(data, axes=None):
     result : relay.Expr
         The transposed result.
     """
-    axes = axes or []
-    return _make.transpose(data, list(axes))
+
+    if axes is not None:
+        axes = list(axes)
+    return _make.transpose(data, axes)
 
 
 def squeeze(data, axis=None):
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 617b532a6a1f..0731ecfef40a 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -82,11 +82,18 @@ def test_transpose_infer_type():
     n, t, d = tvm.var("n"), tvm.var("t"), 100
     x = relay.var("x", relay.TensorType((n, t, d), "float32"))
     y = relay.transpose(x, axes=(1, 0, 2))
-    "axes=" in y.astext()
+    assert "axes=" in y.astext()
     yy = relay.ir_pass.infer_type(y)
     assert yy.checked_type == relay.TensorType(
         (t, n, 100), "float32")
 
+    y = relay.transpose(x)
+    assert "axes=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (100, t, n), "float32")
+
+
 def test_transpose():
     def verify_transpose(dshape, axes):
         x = relay.var("x", relay.TensorType(dshape, "float32"))

From 29b2e01c712df121c0811b77f269d7fc61effeb7 Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Sun, 2 Dec 2018 10:35:01 -0800
Subject: [PATCH 455/529] [Relay][RFC] Relay IR Text Format (#1781)

---
 .gitignore                               |   4 +
 CMakeLists.txt                           |   2 +
 Jenkinsfile                              |   2 +
 cmake/config.cmake                       |   3 +
 cmake/modules/ANTLR.cmake                |  28 ++
 docker/Dockerfile.ci_cpu                 |   7 -
 docker/install/ubuntu_install_antlr.sh   |   2 -
 python/tvm/relay/__init__.py             |   5 +-
 python/tvm/relay/_parser.py              | 425 +++++++++++++++++++++++
 python/tvm/relay/expr.pyi                |   8 +-
 python/tvm/relay/grammar/.gitignore      |   1 +
 python/tvm/relay/grammar/Relay.g4        | 146 ++++++++
 python/tvm/relay/grammar/__init__.py     |   0
 python/tvm/relay/grammar/py2/.gitignore  |   1 +
 python/tvm/relay/grammar/py2/__init__.py |   0
 python/tvm/relay/grammar/py3/.gitignore  |   1 +
 python/tvm/relay/grammar/py3/__init__.py |   0
 python/tvm/relay/parser.py               |  17 +
 python/tvm/relay/ty.pyi                  |   2 +-
 19 files changed, 639 insertions(+), 15 deletions(-)
 create mode 100644 cmake/modules/ANTLR.cmake
 create mode 100644 python/tvm/relay/_parser.py
 create mode 100644 python/tvm/relay/grammar/.gitignore
 create mode 100644 python/tvm/relay/grammar/Relay.g4
 create mode 100644 python/tvm/relay/grammar/__init__.py
 create mode 100644 python/tvm/relay/grammar/py2/.gitignore
 create mode 100644 python/tvm/relay/grammar/py2/__init__.py
 create mode 100644 python/tvm/relay/grammar/py3/.gitignore
 create mode 100644 python/tvm/relay/grammar/py3/__init__.py
 create mode 100644 python/tvm/relay/parser.py

diff --git a/.gitignore b/.gitignore
index 410a36aecdec..04dad2039860 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,7 @@ tvm_t.*
 
 # patch sentinel
 patched.txt
+
+# Python type checking
+.mypy_cache/
+.pyre/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98bbc5b650d3..363b2056a87a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF)
 tvm_option(USE_SORT "Build with sort support" OFF)
 tvm_option(USE_NNPACK "Build with nnpack support" OFF)
 tvm_option(USE_RANDOM "Build with random support" OFF)
+tvm_option(USE_ANTLR "Build with ANTLR for Relay parsing" OFF)
 
 # include directories
 include_directories("include")
@@ -183,6 +184,7 @@ include(cmake/modules/Metal.cmake)
 include(cmake/modules/ROCM.cmake)
 include(cmake/modules/SGX.cmake)
 include(cmake/modules/LLVM.cmake)
+include(cmake/modules/ANTLR.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
 include(cmake/modules/contrib/Random.cmake)
 include(cmake/modules/contrib/Sort.cmake)
diff --git a/Jenkinsfile b/Jenkinsfile
index adc9e12ca74b..02f00e42e8fd 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -98,6 +98,7 @@ stage('Build') {
            echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
            echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
+           echo set\\(USE_ANTLR ON\\) >> config.cmake
            echo set\\(USE_BLAS openblas\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
@@ -133,6 +134,7 @@ stage('Build') {
            echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
            echo set\\(USE_NNPACK ON\\) >> config.cmake
            echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake
+           echo set\\(USE_ANTLR ON\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
            """
diff --git a/cmake/config.cmake b/cmake/config.cmake
index a92be7ce3008..a97def410ddd 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -128,3 +128,6 @@ set(USE_ROCBLAS OFF)
 
 # Whether use contrib sort
 set(USE_SORT OFF)
+
+# Build ANTLR parser for Relay text format
+set(USE_ANTLR OFF)
diff --git a/cmake/modules/ANTLR.cmake b/cmake/modules/ANTLR.cmake
new file mode 100644
index 000000000000..72eb5925bda0
--- /dev/null
+++ b/cmake/modules/ANTLR.cmake
@@ -0,0 +1,28 @@
+if(USE_ANTLR)
+  if(EXISTS /usr/local/lib/antlr-4.7.1-complete.jar)
+    set(ANTLR4 "/usr/local/lib/antlr-4.7.1-complete.jar")
+
+    set(RELAY_PARSER_DIR
+      ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm/relay/grammar)
+
+    set(RELAY_PARSER
+      ${RELAY_PARSER_DIR}/py2/RelayVisitor.py
+      ${RELAY_PARSER_DIR}/py2/RelayParser.py
+      ${RELAY_PARSER_DIR}/py2/RelayLexer.py
+
+      ${RELAY_PARSER_DIR}/py3/RelayVisitor.py
+      ${RELAY_PARSER_DIR}/py3/RelayParser.py
+      ${RELAY_PARSER_DIR}/py3/RelayLexer.py)
+
+    # Generate ANTLR grammar for parsing.
+    add_custom_command(OUTPUT ${RELAY_PARSER}
+      COMMAND $ENV{JAVA_HOME}/bin/java -jar ${ANTLR4} -visitor -no-listener -Dlanguage=Python2 ${RELAY_PARSER_DIR}/Relay.g4 -o ${RELAY_PARSER_DIR}/py2
+      COMMAND $ENV{JAVA_HOME}/bin/java -jar ${ANTLR4} -visitor -no-listener -Dlanguage=Python3 ${RELAY_PARSER_DIR}/Relay.g4 -o ${RELAY_PARSER_DIR}/py3
+      DEPENDS ${RELAY_PARSER_DIR}/Relay.g4
+      WORKING_DIRECTORY ${RELAY_PARSER_DIR})
+
+    add_custom_target(relay_parser ALL DEPENDS ${RELAY_PARSER})
+  else()
+    message(FATAL_ERROR "Can't find ANTLR4!")
+  endif()
+endif(USE_ANTLR)
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 11a77adbfdde..e6e2dd7a37b0 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -40,10 +40,3 @@ COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh
 RUN bash /install/ubuntu_install_nnpack.sh
 
 ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
-
-# ANTLR deps
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
-
-COPY install/ubuntu_install_antlr.sh /install/ubuntu_install_antlr.sh
-RUN bash /install/ubuntu_install_antlr.sh
diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh
index f1066c4220d4..d2f2d6a8c48f 100644
--- a/docker/install/ubuntu_install_antlr.sh
+++ b/docker/install/ubuntu_install_antlr.sh
@@ -1,5 +1,3 @@
 cd /usr/local/lib
 wget https://www.antlr.org/download/antlr-4.7.1-complete.jar
 cd -
-
-alias antlr4='java -jar /usr/local/lib/antlr-4.7.1-complete.jar'
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 6b071f65a794..b66132f27775 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -8,6 +8,7 @@
 from . import module
 from . import ir_pass
 from .build_module import build, build_config, create_executor
+from . import parser
 
 # Root operators
 from .op import Op
@@ -52,7 +53,6 @@
 If = expr.If
 TupleGetItem = expr.TupleGetItem
 
-
 # helper functions
 var = expr.var
 const = expr.const
@@ -63,3 +63,6 @@
 def _debug(*args):
     import pdb
     pdb.set_trace()
+
+# Parser
+fromtext = parser.fromtext
diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py
new file mode 100644
index 000000000000..f64c635dd4ff
--- /dev/null
+++ b/python/tvm/relay/_parser.py
@@ -0,0 +1,425 @@
+
+# pylint: disable=invalid-name, unused-import
+"""A parser for Relay's text format."""
+from __future__ import absolute_import
+
+import sys
+
+from collections import deque
+from typing import TypeVar, Deque, Tuple, Optional, Union, NamedTuple, List, Callable, Any
+
+from . import module
+from . import expr
+from . import ty
+from . import op
+
+class ParseError(Exception):
+    """Exception type for parse errors."""
+
+    def __init__(self, message):
+        # type: (str) -> None
+        super(ParseError, self).__init__()
+        self.message = message
+
+PYTHON_VERSION = sys.version_info.major
+try:
+    if PYTHON_VERSION == 2:
+        from .grammar.py2.RelayVisitor import RelayVisitor
+        from .grammar.py2.RelayParser import RelayParser
+        from .grammar.py2.RelayLexer import RelayLexer
+    else:
+        from .grammar.py3.RelayVisitor import RelayVisitor
+        from .grammar.py3.RelayParser import RelayParser
+        from .grammar.py3.RelayLexer import RelayLexer
+except ImportError:
+    raise ParseError("Couldn't find ANTLR parser. Try building with USE_ANTLR=ON.")
+
+try:
+    from antlr4 import ParserRuleContext, InputStream, CommonTokenStream
+    from antlr4.tree.Tree import TerminalNode
+except ImportError:
+    raise ParseError("Couldn't find ANTLR runtime." +
+                     "Try running `pip{} install antlr4-python{}-runtime`."
+                     .format(PYTHON_VERSION, PYTHON_VERSION))
+
+BINARY_OPS = {
+    RelayParser.MUL: op.multiply,
+    RelayParser.DIV: op.divide,
+    RelayParser.ADD: op.add,
+    RelayParser.SUB: op.subtract,
+    RelayParser.LT:  op.less,
+    RelayParser.GT:  op.greater,
+    RelayParser.LE:  op.less_equal,
+    RelayParser.GE:  op.greater_equal,
+    RelayParser.EQ:  op.equal,
+    RelayParser.NE:  op.not_equal,
+}
+
+TYPE_PREFIXES = [
+    "int",
+    "uint",
+    "float",
+    "bool",
+]
+
+T = TypeVar("T")
+Scope = Deque[Tuple[str, T]]
+Scopes = Deque[Scope[T]]
+
+def lookup(scopes, name):
+    # type: (Scopes[T], str) -> Optional[T]
+    """Look up `name` in `scopes`."""
+
+    for scope in scopes:
+        for key, val in scope:
+            if key == name:
+                return val
+    return None
+
+# TODO(@jmp): Use https://stackoverflow.com/q/13889941
+# to figure out how to get ANTLR4 to be more unhappy about syntax errors
+class ParseTreeToRelayIR(RelayVisitor):
+    """Parse Relay text format into Relay IR."""
+
+    def __init__(self):
+        # type: () -> None
+        self.module = module.Module({})   # type: module.Module
+
+        # Adding an empty scope allows naked lets without pain.
+        self.var_scopes = deque([deque()]) # type: Scopes[expr.Var]
+        self.type_param_scopes = deque([deque()]) # type: Scopes[ty.TypeVar]
+
+        super(ParseTreeToRelayIR, self).__init__()
+
+    def enter_var_scope(self):
+        # type: () -> None
+        """Enter a new Var scope so it can be popped off later."""
+
+        self.var_scopes.appendleft(deque())
+
+    def exit_var_scope(self):
+        # type: () -> Scope[expr.Var]
+        """Pop off the current Var scope and return it."""
+
+        return self.var_scopes.popleft()
+
+    def mk_var(self, name, type_):
+        # type: (str, ty.Type) -> expr.Var
+        """Create a new Var and add it to the Var scope."""
+
+        var = expr.Var(name, type_)
+        self.var_scopes[0].appendleft((name, var))
+        return var
+
+    def enter_type_param_scope(self):
+        # type: () -> None
+        """Enter a new TypeVar scope so it can be popped off later."""
+
+        self.type_param_scopes.appendleft(deque())
+
+    def exit_type_param_scope(self):
+        # type: () -> Scope[ty.TypeVar]
+        """Pop off the current TypeVar scope and return it."""
+
+        return self.type_param_scopes.popleft()
+
+    def mk_typ(self, name, kind):
+        # (str, ty.Kind) -> ty.TypeVar
+        """Create a new TypeVar and add it to the TypeVar scope."""
+
+        typ = ty.TypeVar(name, kind)
+        self.type_param_scopes[0].appendleft((name, typ))
+        return typ
+
+    def visitTerminal(self, node):
+        # type: (TerminalNode) -> Union[expr.Expr, int, float]
+        """Visit lexer tokens that aren't ignored or visited by other functions."""
+
+        node_type = node.getSymbol().type
+        node_text = node.getText()
+
+        # variables
+        if node_type == RelayLexer.GLOBAL_VAR:
+            return expr.GlobalVar(node_text[1:])
+        elif node_type == RelayLexer.LOCAL_VAR:
+            name = node_text[1:]
+            var = lookup(self.var_scopes, name)
+            if var is None:
+                raise ParseError("Couldn't resolve `{}`.".format(name))
+
+            return var
+
+        # data types
+        elif node_type == RelayLexer.INT:
+            return int(node_text)
+        elif node_type == RelayLexer.FLOAT:
+            return float(node_text)
+        elif node_type == RelayLexer.BOOL_LIT:
+            if node_text == "True":
+                return True
+            elif node_text == "False":
+                return False
+            else:
+                raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text))
+
+        else:
+            raise ParseError("todo: {}".format(node_text))
+
+    def visit_list(self, ctx_list):
+        # type: (List[ParserRuleContext]) -> List[Any]
+        """"Visit a list of contexts."""
+
+        return [self.visit(ctx) for ctx in ctx_list]
+
+    def getType_(self, ctx):
+        # type: (Optional[RelayParser.Type_Context]) -> Optional[ty.Type]
+        """Return a (possibly None) Relay type."""
+
+        if ctx is None:
+            return None
+
+        return self.visit(ctx)
+
+    def visitProg(self, ctx):
+        # type: (RelayParser.ProgContext) -> Union[expr.Expr, env.Environment]
+        if ctx.defn():
+            self.visit_list(ctx.defn())
+            return self.module
+
+        return self.visit(ctx.expr())
+
+    # Exprs
+
+    def visitOpIdent(self, ctx):
+        # type: (RelayParser.OpIdentContext) -> op.Op
+        return op.get(ctx.CNAME().getText())
+
+    # pass through
+    def visitParens(self, ctx):
+        # type: (RelayParser.ParensContext) -> expr.Expr
+        return self.visit(ctx.expr())
+
+    # pass through
+    def visitBody(self, ctx):
+        # type: (RelayParser.BodyContext) -> expr.Expr
+        return self.visit(ctx.expr())
+
+    def visitScalarFloat(self, ctx):
+        # type: (RelayParser.ScalarFloatContext) -> expr.Constant
+        return expr.const(self.visit(ctx.FLOAT()))
+
+    def visitScalarInt(self, ctx):
+        # type: (RelayParser.ScalarIntContext) -> expr.Constant
+        return expr.const(self.visit(ctx.INT()))
+
+    def visitScalarBool(self, ctx):
+        # type: (RelayParser.ScalarBoolContext) -> expr.Constant
+        return expr.const(self.visit(ctx.BOOL_LIT()))
+
+    def visitNeg(self, ctx):
+        # type: (RelayParser.NegContext) -> Union[expr.Constant, expr.Call]
+        val = self.visit(ctx.expr())
+        if isinstance(val, expr.Constant) and val.data.asnumpy().ndim == 0:
+            # fold Neg in for scalars
+            return expr.const(-val.data.asnumpy().item())
+
+        return op.negative(val)
+
+    def visitTuple(self, ctx):
+        # type: (RelayParser.TupleContext) -> expr.Tuple
+        tup = self.visit_list(ctx.expr())
+        return expr.Tuple(tup)
+
+    # Currently doesn't support mutable sequencing.
+    def visitSeq(self, ctx):
+        # type: (RelayParser.SeqContext) -> expr.Let
+        """Desugar various sequence constructs to Relay Let nodes."""
+        if ctx.MUT() is not None:
+            raise ParseError("Mutation is currently unsupported.")
+
+        if ctx.var() is None or ctx.var().ident() is None:
+            # anonymous identity
+            ident = "_"
+            type_ = None
+        else:
+            local_var = ctx.var().ident().LOCAL_VAR()
+            if local_var is None:
+                raise ParseError('Only local ids may be used in `let`s.')
+            ident = local_var.getText()[1:]
+            type_ = self.getType_(ctx.var().type_())
+
+        var = self.mk_var(ident, type_)
+
+        self.enter_var_scope()
+        value = self.visit(ctx.expr(0))
+        self.exit_var_scope()
+
+        body = self.visit(ctx.expr(1))
+
+        return expr.Let(var, value, body)
+
+    def visitBinOp(self, ctx):
+        # type: (RelayParser.BinOpContext) -> expr.Call
+        """Desugar binary operators."""
+        arg0, arg1 = self.visit_list(ctx.expr())
+        relay_op = BINARY_OPS.get(ctx.op.type)
+
+        if relay_op is None:
+            raise ParseError("Unimplemented binary op.")
+
+        return relay_op(arg0, arg1)
+
+    def visitVar(self, ctx):
+        # type: (RelayParser.VarContext) -> expr.Var
+        ident = ctx.ident().LOCAL_VAR()
+
+        if ident is None:
+            raise ParseError('Only local ids may be used in params.')
+
+        type_ = self.getType_(ctx.type_())
+
+        return self.mk_var(ident.getText()[1:], type_)
+
+    def visitVarList(self, ctx):
+        # type: (RelayParser.VarListContext) -> List[expr.Var]
+        return self.visit_list(ctx.var())
+
+    def mk_func(self, ctx):
+        # type: (Union[RelayParser.FuncContext, RelayParser.DefnContext]) -> Function
+        """Construct a function from either a Func or Defn."""
+
+        # Enter var scope early to put params in scope.
+        self.enter_var_scope()
+        # Capture type params in params.
+        self.enter_type_param_scope()
+        var_list = self.visit(ctx.varList())
+        ret_type = self.getType_(ctx.type_())
+
+        type_params = list(self.exit_type_param_scope())
+        if type_params:
+            _, type_params = zip(*type_params)
+
+        body = self.visit(ctx.body())
+        self.exit_var_scope()
+
+        return expr.Function(var_list, body, ret_type, type_params) # type: ignore
+
+    def visitFunc(self, ctx):
+        # type: (RelayParser.FuncContext) -> expr.Function
+        return self.mk_func(ctx)
+
+    def visitDefn(self, ctx):
+        # type: (RelayParser.DefnContext) -> None
+        ident = ctx.ident().GLOBAL_VAR()
+        if ident is None:
+            raise ParseError('Only global ids may be used in `def`s.')
+        ident = expr.GlobalVar(ident.getText()[1:])
+
+        self.module[ident] = self.mk_func(ctx)
+
+    def visitCall(self, ctx):
+        # type: (RelayParser.CallContext) -> expr.Call
+        visited_exprs = self.visit_list(ctx.expr())
+
+        func = visited_exprs[0]
+        args = visited_exprs[1:]
+
+        return expr.Call(func, args, None, None)
+
+    def visitIfElse(self, ctx):
+        # type: (RelayParser.IfElseContext) -> expr.If
+        """Construct a Relay If node. Creates a new scope for each branch."""
+        cond = self.visit(ctx.expr())
+
+        self.enter_var_scope()
+        true_branch = self.visit(ctx.body(0))
+        self.exit_var_scope()
+
+        self.enter_var_scope()
+        false_branch = self.visit(ctx.body(1))
+        self.exit_var_scope()
+
+        return expr.If(cond, true_branch, false_branch)
+
+    # Types
+
+    # pylint: disable=unused-argument
+    def visitIncompleteType(self, ctx):
+        # type (RelayParser.IncompleteTypeContext) -> None:
+        return None
+
+    def visitIdentType(self, ctx):
+        # type: (RelayParser.IdentTypeContext) -> Union[ty.TensorType, str]
+        ident_type = ctx.CNAME().getText()
+
+        # look through all type prefixes for a match
+        for type_prefix in TYPE_PREFIXES:
+            if ident_type.startswith(type_prefix):
+                return ty.scalar_type(ident_type)
+
+        raise ParseError("Unknown builtin type: {}".format(ident_type))
+
+    # def visitCallType(self, ctx):
+    #     # type: (RelayParser.CallTypeContext) -> Union[expr.Expr, ty.TensorType]
+    #     ident_type = ctx.identType().CNAME().getText()
+
+    #     args = self.visit_list(ctx.type_())
+
+    #     if not args:
+    #         raise ParseError("Type-level functions must have arguments!")
+
+    #     func_type = TYPE_FUNCS.get(ident_type)(args)
+
+    #     if func_type is None:
+    #         raise ParseError("Unknown type-level function: `{}`".format(ident_type))
+    #     else:
+    #         return func_type
+
+    def visitParensShape(self, ctx):
+        # type: (RelayParser.ParensShapeContext) -> int
+        return self.visit(ctx.shape())
+
+    def visitShapeSeq(self, ctx):
+        # type: (RelayParser.ShapeSeqContext) -> List[int]
+        return self.visit_list(ctx.shape())
+
+    def visitTensorType(self, ctx):
+        # type: (RelayParser.TensorTypeContext) -> ty.TensorType
+        """Create a simple tensor type. No generics."""
+
+        shape = self.visit(ctx.shapeSeq())
+        dtype = self.visit(ctx.type_())
+
+        if not isinstance(dtype, ty.TensorType):
+            raise ParseError("Expected dtype to be a Relay base type.")
+
+        dtype = dtype.dtype
+
+        return ty.TensorType(shape, dtype)
+
+    def visitTupleType(self, ctx):
+        # type: (RelayParser.TupleTypeContext) -> ty.TupleType
+        return ty.TupleType(self.visit_list(ctx.type_()))
+
+    def visitFuncType(self, ctx):
+        # type: (RelayParser.FuncTypeContext) -> ty.FuncType
+        types = self.visit_list(ctx.type_())
+
+        arg_types = types[:-1]
+        ret_type = types[-1]
+
+        return ty.FuncType(arg_types, ret_type, [], None)
+
+def make_parser(data):
+    # type: (str) -> RelayParser
+    """Construct a RelayParser a given data stream."""
+    input_stream = InputStream(data)
+    lexer = RelayLexer(input_stream)
+    token_stream = CommonTokenStream(lexer)
+    return RelayParser(token_stream)
+
+def fromtext(data):
+    # type: (str) -> Union[expr.Expr, env.Environment]
+    """Parse a Relay program."""
+    tree = make_parser(data).prog()
+    return ParseTreeToRelayIR().visit(tree)
diff --git a/python/tvm/relay/expr.pyi b/python/tvm/relay/expr.pyi
index e73a5963e5b1..bc2e5115df0d 100644
--- a/python/tvm/relay/expr.pyi
+++ b/python/tvm/relay/expr.pyi
@@ -22,7 +22,7 @@ class Constant(Expr):
 
 
 class Tuple(Expr):
-    fields = ..  # type: List[Expr]
+    fields = ...  # type: List[Expr]
 
     def __init__(self, fields):
         # type: (List[Expr]) -> None
@@ -77,10 +77,10 @@ class Call(Expr):
     """A function call in Relay, see tvm/relay/expr.h for more details."""
     op = ...  # type: Expr
     args = ...  # type: List[Expr]
-    # todo(@jroesch): add attrs
+    # todo(@jroesch): add attrs. revise attrs type in __init__
 
-    def __init__(self, op, args, attrs, ty_args=None):
-        # type: (Expr, List[Expr], Optional[List[Type]]) -> None
+    def __init__(self, op, args, attrs=None, ty_args=None):
+        # type: (Expr, List[Expr], Optional[List[Any]], Optional[List[Type]]) -> None
         if not ty_args:
             ty_args = []
 
diff --git a/python/tvm/relay/grammar/.gitignore b/python/tvm/relay/grammar/.gitignore
new file mode 100644
index 000000000000..cffe35e1a41a
--- /dev/null
+++ b/python/tvm/relay/grammar/.gitignore
@@ -0,0 +1 @@
+/.antlr/
diff --git a/python/tvm/relay/grammar/Relay.g4 b/python/tvm/relay/grammar/Relay.g4
new file mode 100644
index 000000000000..c74a42c97e77
--- /dev/null
+++ b/python/tvm/relay/grammar/Relay.g4
@@ -0,0 +1,146 @@
+grammar Relay;
+
+// Lexing
+// comments
+WS : [ \t\n\r]+ -> skip ;
+LINE_COMMENT : '//' .*? '\n' -> skip ;
+COMMENT : '/*' .*? '*/' -> skip ;
+
+// operators
+MUL: '*' ;
+DIV: '/' ;
+ADD: '+' ;
+SUB: '-' ;
+LT: '<' ;
+GT: '>' ;
+LE: '<=' ;
+GE: '>=' ;
+EQ: '==' ;
+NE: '!=' ;
+
+opIdent: CNAME ;
+GLOBAL_VAR: '@' CNAME ;
+LOCAL_VAR: '%' CNAME ;
+
+MUT: 'mut' ;
+
+BOOL_LIT
+  : 'True'
+  | 'False'
+  ;
+
+// non-negative floats
+FLOAT
+  : INT '.' INT EXP? // 1.35, 1.35E-9, 0.3, 4.5
+  | INT EXP // 1e10 3e4
+  ;
+
+// non-negative ints
+INT: DIGIT+ ;
+fragment EXP: [eE] [+\-]? INT ; // \- since - means "range" inside [...]
+
+CNAME: ('_'|LETTER) ('_'|LETTER|DIGIT)* ;
+fragment LETTER: [a-zA-Z] ;
+fragment DIGIT: [0-9] ;
+
+// Parsing
+
+// A Relay program is a list of global definitions or an expression.
+prog: (defn* | expr) EOF ;
+
+// option: 'set' ident BOOL_LIT ;
+
+expr
+  // operators
+  : '(' expr ')'                              # parens
+  | '-' expr                                  # neg
+  | expr op=('*'|'/') expr                    # binOp
+  | expr op=('+'|'-') expr                    # binOp
+  | expr op=('<'|'>'|'<='|'>=') expr          # binOp
+  | expr op=('=='|'!=') expr                  # binOp
+
+  // function definition and application
+  | expr '(' (expr (',' expr)*)? ')'          # call
+  | func                                      # funcExpr
+
+  // tuples and tensors
+  | '(' ')'                                   # tuple
+  | '(' expr ',' ')'                          # tuple
+  | '(' expr (',' expr)+ ')'                  # tuple
+  | '[' (expr (',' expr)*)? ']'               # tensor
+
+  | 'if' '(' expr ')' body 'else' body        # ifElse
+
+  // sequencing
+  | 'let' MUT? var '=' expr ';' expr          # seq
+  | 'let' MUT? var '=' '{' expr '}' ';' expr  # seq
+  // sugar for let %_ = expr; expr
+  | expr ';' expr                             # seq
+
+  // mutable update
+  // | ident '=' expr                            # writeRef
+  // | expr '^'                                  # readRef
+
+  | ident                                     # identExpr
+  | scalar                                    # scalarExpr
+  // | expr '.' INT                              # project
+  // | 'debug'                                   # debug
+  ;
+
+func: 'fn'        varList ('->' type_)? body ;
+defn: 'def' ident varList ('->' type_)? body ;
+
+varList: '(' (var (',' var)*)? ')' ;
+var: ident (':' type_)? ;
+
+// TODO(@jmp): for improved type annotations
+// returnAnno: (ident ':')? type_ ;
+
+// relations: 'where' relation (',' relation)* ;
+// relation: ident '(' (type_ (',' type_)*)? ')' ;
+
+type_
+  : '(' ')'                                         # tupleType
+  | '(' type_ ',' ')'                               # tupleType
+  | '(' type_ (',' type_)+ ')'                      # tupleType
+  | identType                                       # identTypeType
+  | 'Tensor' '[' shapeSeq ',' type_ ']'             # tensorType
+  // currently unused
+  // | identType '[' (type_ (',' type_)*)? ']'         # callType
+  | 'fn' '(' (type_ (',' type_)*)? ')' '->' type_   # funcType
+  | '_'                                             # incompleteType
+  | INT                                             # intType
+  ;
+
+shapeSeq
+  : '(' ')'
+  | '(' shape ',' ')'
+  | '(' shape (',' shape)+ ')'
+  ;
+
+shape
+  : '(' shape ')'                   # parensShape
+  // | type_ op=('*'|'/') type_        # binOpType
+  // | type_ op=('+'|'-') type_        # binOpType
+  | INT                             # intShape
+  ;
+
+identType: CNAME ;
+// Int8, Int16, Int32, Int64
+// UInt8, UInt16, UInt32, UInt64
+// Float16, Float32, Float64
+// Bool
+
+body: '{' expr '}' ;
+
+scalar
+  : FLOAT    # scalarFloat
+  | INT      # scalarInt
+  | BOOL_LIT # scalarBool
+  ;
+
+ident
+  : opIdent
+  | GLOBAL_VAR
+  | LOCAL_VAR
+  ;
diff --git a/python/tvm/relay/grammar/__init__.py b/python/tvm/relay/grammar/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/grammar/py2/.gitignore b/python/tvm/relay/grammar/py2/.gitignore
new file mode 100644
index 000000000000..d677ff551940
--- /dev/null
+++ b/python/tvm/relay/grammar/py2/.gitignore
@@ -0,0 +1 @@
+Relay*
diff --git a/python/tvm/relay/grammar/py2/__init__.py b/python/tvm/relay/grammar/py2/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/grammar/py3/.gitignore b/python/tvm/relay/grammar/py3/.gitignore
new file mode 100644
index 000000000000..d677ff551940
--- /dev/null
+++ b/python/tvm/relay/grammar/py3/.gitignore
@@ -0,0 +1 @@
+Relay*
diff --git a/python/tvm/relay/grammar/py3/__init__.py b/python/tvm/relay/grammar/py3/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/parser.py b/python/tvm/relay/parser.py
new file mode 100644
index 000000000000..51200343f147
--- /dev/null
+++ b/python/tvm/relay/parser.py
@@ -0,0 +1,17 @@
+"""A parser for Relay's text format."""
+from __future__ import absolute_import
+
+def enabled():
+    """Is the parser enabled/Can we import the parser?"""
+    try:
+        # pylint: disable=unused-variable
+        from tvm.relay import _parser
+        return True
+    # pylint: disable=broad-except
+    except Exception:
+        return False
+
+def fromtext(data):
+    """Parse a Relay program."""
+    from tvm.relay import _parser
+    return _parser.fromtext(data)
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
index 221fc228081d..933814853f3e 100644
--- a/python/tvm/relay/ty.pyi
+++ b/python/tvm/relay/ty.pyi
@@ -156,7 +156,7 @@ class FuncType(Type):
 class IncompleteType(Type):
     """An incomplete type."""
 
-    def __init__(self, kind):
+    def __init__(self, kind=Kind.Type):
         self.__init_handle_by_constructor__(_make.IncompleteType, kind)
 
 @register_relay_node

From 8fd1dfe6b38c1be7daba894e7aa42124bdf6159a Mon Sep 17 00:00:00 2001
From: Josh Pollock <joshpollock1997@gmail.com>
Date: Sun, 2 Dec 2018 18:58:40 -0800
Subject: [PATCH 456/529] [Relay] Parser Tests (#2209)

---
 src/relay/ir/alpha_equal.cc          |  12 +-
 src/relay/ir/text_printer.cc         |   4 +-
 tests/python/relay/test_ir_parser.py | 562 +++++++++++++++++++++++++++
 3 files changed, 570 insertions(+), 8 deletions(-)
 create mode 100644 tests/python/relay/test_ir_parser.py

diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index 873210321bf3..16af572a9d6f 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -26,7 +26,7 @@ class AlphaEqualHandler:
    * Check equality of two nodes.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool Equal(const NodeRef& lhs, const NodeRef& rhs) {
     if (lhs.same_as(rhs)) return true;
@@ -46,7 +46,7 @@ class AlphaEqualHandler:
    * Check equality of two attributes.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool AttrEqual(const NodeRef& lhs, const NodeRef& rhs) {
     return AttrsEqualHandler::Equal(lhs, rhs);
@@ -55,7 +55,7 @@ class AlphaEqualHandler:
    * Check equality of two types.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool TypeEqual(const Type& lhs, const Type& rhs) {
     if (lhs.same_as(rhs)) return true;
@@ -72,7 +72,7 @@ class AlphaEqualHandler:
    *
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool ExprEqual(const Expr& lhs, const Expr& rhs) {
     if (lhs.same_as(rhs)) return true;
@@ -94,7 +94,7 @@ class AlphaEqualHandler:
    * \brief Check if data type equals each other.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool DataTypeEqual(const DataType& lhs, const DataType& rhs) {
     return lhs == rhs;
@@ -104,7 +104,7 @@ class AlphaEqualHandler:
    *  if map_free_var_ is set to true, try to map via equal node.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return the compare result.
+   * \return The compare result.
    */
   bool LeafNodeEqual(const NodeRef& lhs, const NodeRef& rhs) {
     if (lhs.same_as(rhs)) return true;
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
index 2664c475608b..46b0d25b3d7d 100644
--- a/src/relay/ir/text_printer.cc
+++ b/src/relay/ir/text_printer.cc
@@ -38,7 +38,7 @@ inline std::ostream& operator<<(std::ostream& os, const TextValue& val) {  // NO
  * It can be hard to design a text format for all the possible nodes
  * as the set of nodes can grow when we do more extensions.
  *
- * Instead of trying to design readable text format for every nodes,
+ * Instead of trying to design readable text format for every node,
  * we support a meta-data section in the text format.
  * We allow the text format to refer to a node in the meta-data section.
  *
@@ -73,7 +73,7 @@ inline std::ostream& operator<<(std::ostream& os, const TextValue& val) {  // NO
  * \endcode
  *
  * Note that we store tvm.var("n") in the meta data section.
- * Since it is stored in the index-0 in the meta-data seciton,
+ * Since it is stored in the index-0 in the meta-data section,
  * we print it as meta.Variable(0).
  *
  * The text parser can recover this object by loading from the corresponding
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
new file mode 100644
index 000000000000..c2c83df7ed0c
--- /dev/null
+++ b/tests/python/relay/test_ir_parser.py
@@ -0,0 +1,562 @@
+import tvm
+from tvm import relay
+from tvm.relay.parser import enabled
+from tvm.relay.ir_pass import alpha_equal
+from nose.tools import nottest, raises
+from numpy import isclose
+from typing import Union
+from functools import wraps
+if enabled():
+    from tvm.relay._parser import ParseError
+    raises_parse_error = raises(ParseError)
+else:
+    raises_parse_error = lambda x: x
+
+BINARY_OPS = {
+    "*": relay.multiply,
+    "/": relay.divide,
+    "+": relay.add,
+    "-": relay.subtract,
+    "<": relay.less,
+    ">": relay.greater,
+    "<=": relay.less_equal,
+    ">=": relay.greater_equal,
+    "==": relay.equal,
+    "!=": relay.not_equal,
+}
+
+TYPES = {
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+
+    "float16",
+    "float32",
+    "float64",
+
+    "bool",
+
+    "int8x4",
+    "uint1x4",
+    "float16x4",
+}
+
+def get_scalar(x):
+    # type: (relay.Constant) -> (Union[float, int, bool])
+    return x.data.asnumpy().item()
+
+int32 = relay.scalar_type("int32")
+
+_ = relay.Var("_")
+X = relay.Var("x")
+Y = relay.Var("y")
+X_ANNO = relay.Var("x", int32)
+Y_ANNO = relay.Var("y", int32)
+
+UNIT = relay.Tuple([])
+
+# decorator to determine if parser is enabled
+def if_parser_enabled(func):
+    # https://stackoverflow.com/q/7727678
+    @wraps(func)
+    def wrapper():
+        if not enabled():
+            return
+        func()
+    return wrapper
+
+@if_parser_enabled
+def test_comments():
+    assert alpha_equal(
+        relay.fromtext("""
+            // This is a line comment!
+            ()
+        """),
+        UNIT
+    )
+
+    assert alpha_equal(
+        relay.fromtext("""
+            /* This is a block comment!
+               This is still a block comment!
+            */
+            ()
+        """),
+        UNIT
+    )
+
+@if_parser_enabled
+def test_int_literal():
+    assert isinstance(relay.fromtext("1"), relay.Constant)
+    assert isinstance(relay.fromtext("1").data, tvm.ndarray.NDArray)
+    
+    assert get_scalar(relay.fromtext("1")) == 1
+    assert get_scalar(relay.fromtext("10")) == 10
+    assert get_scalar(relay.fromtext("0")) == 0
+    assert get_scalar(relay.fromtext("-100")) == -100
+    assert get_scalar(relay.fromtext("-05")) == -5
+
+@if_parser_enabled
+def test_float_literal():
+    assert get_scalar(relay.fromtext("1.0")) == 1.0
+    assert isclose(get_scalar(relay.fromtext("1.56667")), 1.56667)
+    assert get_scalar(relay.fromtext("0.0")) == 0.0
+    assert get_scalar(relay.fromtext("-10.0")) == -10.0
+
+    # scientific notation
+    assert isclose(get_scalar(relay.fromtext("1e-1")), 1e-1)
+    assert get_scalar(relay.fromtext("1e+1")) == 1e+1
+    assert isclose(get_scalar(relay.fromtext("1E-1")), 1E-1)
+    assert get_scalar(relay.fromtext("1E+1")) == 1E+1
+    assert isclose(get_scalar(relay.fromtext("1.0e-1")), 1.0e-1)
+    assert get_scalar(relay.fromtext("1.0e+1")) == 1.0e+1
+    assert isclose(get_scalar(relay.fromtext("1.0E-1")), 1.0E-1)
+    assert get_scalar(relay.fromtext("1.0E+1")) == 1.0E+1
+
+@if_parser_enabled
+def test_bool_literal():
+    assert get_scalar(relay.fromtext("True")) == True
+    assert get_scalar(relay.fromtext("False")) == False
+
+@if_parser_enabled
+def test_negative():
+    assert isinstance(relay.fromtext("let %x = 1; -%x").body, relay.Call)
+    assert get_scalar(relay.fromtext("--10")) == 10
+    assert get_scalar(relay.fromtext("---10")) == -10
+
+@if_parser_enabled
+def test_bin_op():
+    for bin_op in BINARY_OPS.keys():
+        assert alpha_equal(
+            relay.fromtext("1 {} 1".format(bin_op)),
+            BINARY_OPS.get(bin_op)(relay.const(1), relay.const(1))
+        )
+
+@if_parser_enabled
+def test_parens():
+    assert alpha_equal(relay.fromtext("1 * 1 + 1"), relay.fromtext("(1 * 1) + 1"))
+    assert not alpha_equal(relay.fromtext("1 * 1 + 1"), relay.fromtext("1 * (1 + 1)"))
+
+@if_parser_enabled
+def test_op_assoc():
+    assert alpha_equal(relay.fromtext("1 * 1 + 1 < 1 == 1"), relay.fromtext("(((1 * 1) + 1) < 1) == 1"))
+    assert alpha_equal(relay.fromtext("1 == 1 < 1 + 1 * 1"), relay.fromtext("1 == (1 < (1 + (1 * 1)))"))
+
+@nottest
+@if_parser_enabled
+def test_vars():
+    # temp vars won't work b/c they start with a digit
+    # # temp var
+    # temp_var = relay.fromtext("%1")
+    # assert isinstance(temp_var, relay.Var)
+    # assert temp_var.name == "1"
+
+    # var
+    var = relay.fromtext("let %foo = (); %foo")
+    assert isinstance(var.body, relay.Var)
+    assert var.body.name_hint == "foo"
+
+    # global var
+    global_var = relay.fromtext("@foo")
+    assert isinstance(global_var, relay.GlobalVar)
+    assert global_var.name_hint == "foo"
+
+    # operator id
+    op = relay.fromtext("foo")
+    assert isinstance(op, relay.Op)
+    assert op.name == "foo"
+
+@if_parser_enabled
+def test_let():
+    assert alpha_equal(
+        relay.fromtext("let %x = 1; ()"),
+        relay.Let(
+            X,
+            relay.const(1),
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_seq():
+    assert alpha_equal(
+        relay.fromtext("(); ()"),
+        relay.Let(
+            _,
+            UNIT,
+            UNIT)
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ = { 1 }; ()"),
+        relay.Let(
+            X,
+            relay.const(1),
+            UNIT
+        )
+    )
+
+@raises_parse_error
+@if_parser_enabled
+def test_let_global_var():
+    relay.fromtext("let @x = 1; ()")
+
+@raises_parse_error
+@if_parser_enabled
+def test_let_op():
+    relay.fromtext("let x = 1; ()")
+
+@if_parser_enabled
+def test_tuple():
+    assert alpha_equal(relay.fromtext("()"), relay.Tuple([]))
+
+    assert alpha_equal(relay.fromtext("(0,)"), relay.Tuple([relay.const(0)]))
+
+    assert alpha_equal(relay.fromtext("(0, 1)"), relay.Tuple([relay.const(0), relay.const(1)]))
+
+    assert alpha_equal(relay.fromtext("(0, 1, 2)"), relay.Tuple([relay.const(0), relay.const(1), relay.const(2)]))
+
+@if_parser_enabled
+def test_func():
+    # 0 args
+    assert alpha_equal(
+        relay.fromtext("fn () { 0 }"),
+        relay.Function(
+            [],
+            relay.const(0),
+            None,
+            []
+        )
+    )
+
+    # 1 arg
+    assert alpha_equal(
+        relay.fromtext("fn (%x) { %x }"),
+        relay.Function(
+            [X],
+            X,
+            None,
+            []
+        )
+    )
+
+    # 2 args
+    assert alpha_equal(
+        relay.fromtext("fn (%x, %y) { %x + %y }"),
+        relay.Function(
+            [X, Y],
+            relay.add(X, Y),
+            None,
+            []
+        )
+    )
+
+    # annotations
+    assert alpha_equal(
+        relay.fromtext("fn (%x: int32) -> int32 { %x }"),
+        relay.Function(
+            [X_ANNO],
+            X_ANNO,
+            int32,
+            []
+        )
+    )
+
+# TODO(@jmp): Crashes if %x isn't annnotated.
+# @nottest
+@if_parser_enabled
+def test_defn():
+    id_defn = relay.fromtext(
+        """
+        def @id(%x: int32) -> int32 {
+            %x
+        }
+        """)
+    assert isinstance(id_defn, relay.Module)
+
+@if_parser_enabled
+def test_ifelse():
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        if (True) {
+            0
+        } else {
+            1
+        }
+        """
+        ),
+        relay.If(
+            relay.const(True),
+            relay.const(0),
+            relay.const(1)
+        )
+    )
+
+@raises_parse_error
+@if_parser_enabled
+def test_ifelse_scope():
+    relay.fromtext(
+        """
+        if (True) {
+            let %x = ();
+            ()
+        } else {
+            %x
+        }
+        """
+    )
+
+@if_parser_enabled
+def test_call():
+    # 0 args
+    constant = relay.Var("constant")
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %constant = fn () { 0 };
+        %constant()
+        """
+        ),
+        relay.Let(
+            constant,
+            relay.Function([], relay.const(0), None, []),
+            relay.Call(constant, [], None, None)
+        )
+    )
+
+    # 1 arg
+    id_var = relay.Var("id")
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %id = fn (%x) { %x };
+            %id(1)
+            """
+        ),
+        relay.Let(
+            id_var,
+            relay.Function([X], X, None, []),
+            relay.Call(id_var, [relay.const(1)], None, None)
+        )
+    )
+
+    # 2 args
+    multiply = relay.Var("multiply")
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %multiply = fn (%x, %y) { %x * %y };
+        %multiply(0, 0)
+        """
+        ),
+        relay.Let(
+            multiply,
+            relay.Function(
+                [X, Y],
+                relay.multiply(X, Y),
+                None,
+                []
+            ),
+            relay.Call(multiply, [relay.const(0), relay.const(0)], None, None)
+        )
+    )
+
+    # anonymous function
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        (fn (%x) { %x })(0)
+        """
+        ),
+        relay.Call(
+            relay.Function(
+                [X],
+                X,
+                None,
+                []
+            ),
+            [relay.const(0)],
+            None,
+            None
+        )
+    )
+
+    # curried function
+    curried_mult = relay.Var("curried_mult")
+    alpha_equal(
+        relay.fromtext(
+            """
+            let %curried_mult =
+                fn (%x) {
+                fn (%y) {
+                    %x * %y
+                }
+                };
+            %curried_mult(0);
+            %curried_mult(0)(0)
+            """
+        ),
+        relay.Let(
+            curried_mult,
+            relay.Function(
+                [X],
+                relay.Function(
+                    [Y],
+                    relay.multiply(X, Y),
+                    None,
+                    []
+                ),
+                None,
+                []
+            ),
+            relay.Let(
+                _,
+                relay.Call(curried_mult, [relay.const(0)], None, None),
+                relay.Call(relay.Call(curried_mult, [relay.const(0)], None, None), [relay.const(0)], None, None)
+            )
+        )
+    )
+
+    # op
+    alpha_equal(
+        relay.fromtext("abs(1)"),
+        relay.Call(relay.op.get("abs"), [relay.const(1)], None, None)
+    )
+
+# Types
+
+@if_parser_enabled
+def test_incomplete_type():
+    assert alpha_equal(
+        relay.fromtext("let %_ : _ = (); ()"),
+        relay.Let(
+            _,
+            UNIT,
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_builtin_types():
+    for builtin_type in TYPES:
+        relay.fromtext("let %_ : {} = (); ()".format(builtin_type))
+
+@nottest
+@if_parser_enabled
+def test_call_type():
+    assert False
+
+@if_parser_enabled
+def test_tensor_type():
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(1,), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((1,), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(1, 1), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((1, 1), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_function_type():
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn () -> int32 = fn () -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([], int32, [], [])),
+            relay.Function([], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn (int32) -> int32 = fn (%x: int32) -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([int32], int32, [], [])),
+            relay.Function([relay.Var("x", int32)], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn (int32, int32) -> int32 = fn (%x: int32, %y: int32) -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([int32, int32], int32, [], [])),
+            relay.Function([relay.Var("x", int32), relay.Var("y", int32)], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_tuple_type():
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: () = (); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([])),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: (int32,) = (0,); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([int32])),
+            relay.Tuple([relay.const(0)]),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: (int32, int32) = (0, 1); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([int32, int32])),
+            relay.Tuple([relay.const(0), relay.const(1)]),
+            UNIT
+        )
+    )

From 2403d1bd217368076bc8f2bf5dbeb5e4a1caf12d Mon Sep 17 00:00:00 2001
From: Ruslan Baratov <ruslan_baratov@yahoo.com>
Date: Mon, 3 Dec 2018 17:55:13 +0000
Subject: [PATCH 457/529] Fix misprint (#2223)

---
 tutorials/nnvm/from_mxnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nnvm/from_mxnet.py b/tutorials/nnvm/from_mxnet.py
index dcecf3c42bcc..b4c2c5b7dfbd 100644
--- a/tutorials/nnvm/from_mxnet.py
+++ b/tutorials/nnvm/from_mxnet.py
@@ -96,7 +96,7 @@ def transform_image(image):
 ######################################################################
 # Use MXNet symbol with pretrained weights
 # ----------------------------------------
-# MXNet often use `arg_prams` and `aux_params` to store network parameters
+# MXNet often use `arg_params` and `aux_params` to store network parameters
 # separately, here we show how to use these weights with existing API
 def block2symbol(block):
     data = mx.sym.Variable('data')

From 0bbbd815df8ac1fac1d32da6b8c1e799e8715f81 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 4 Dec 2018 09:46:58 +0800
Subject: [PATCH 458/529] [RELAY][PASS] Fix expr subst and
 CombineParallelConv2D (#2218)

---
 src/relay/pass/expr_subst.cc                  |  2 +-
 .../test_pass_combine_parallel_conv2d.py      | 39 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/relay/pass/expr_subst.cc b/src/relay/pass/expr_subst.cc
index 586f748abef5..67dc0d2f7049 100644
--- a/src/relay/pass/expr_subst.cc
+++ b/src/relay/pass/expr_subst.cc
@@ -18,7 +18,7 @@ class ExprSubstituter : public ExprMutator {
   Expr VisitExpr(const Expr& expr) final {
     auto it = subst_map_.find(expr);
     if (it != subst_map_.end()) {
-      return (*it).second;
+      return ExprMutator::VisitExpr((*it).second);
     }
     return ExprMutator::VisitExpr(expr);
   }
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
index 6fea201d64c8..7d0a5a08555e 100644
--- a/tests/python/relay/test_pass_combine_parallel_conv2d.py
+++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py
@@ -134,7 +134,46 @@ def check(x_shape, channels1, channels2):
 
     check((1, 4, 16, 16), 4, 8)
 
+
+def test_combine_parallel_conv2d_multiple_blocks():
+    def before(x, w, repeat):
+        args = [x, w]
+        y = x
+        for i in range(repeat):
+            y1 = relay.nn.conv2d(y, w)
+            y2 = relay.nn.conv2d(y, w)
+            y = relay.concatenate((y1, y2), axis=1)
+        return relay.Function(args, y)
+
+    def expected(x, w, channels, repeat):
+        args = [x, w]
+        y = x
+        for i in range(repeat):
+            w_concat = relay.concatenate((w, w), axis=0)
+            y = relay.nn.conv2d(y, w_concat, channels=channels*2)
+            y1 = relay.strided_slice(y, [0, 0], [None, channels])
+            y2 = relay.strided_slice(y, [0, channels], [None, channels * 2])
+            y = relay.concatenate((y1, y2), axis=1)
+        return relay.Function(args, y)
+
+    def check(x_shape, repeat):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        out_c = in_c // 2
+        w = relay.var("w", shape=(out_c, in_c, 1, 1))
+        y_before = before(x, w, repeat)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w, out_c, repeat)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4)
+
+
 if __name__ == "__main__":
     test_combine_parallel_conv2d()
     test_combine_parallel_conv2d_scale_relu()
     test_combine_parallel_conv2d_scale()
+    test_combine_parallel_conv2d_multiple_blocks()

From c9d68703cde5af11df0b72e15a59358bba0da827 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Tue, 4 Dec 2018 12:54:01 -0800
Subject: [PATCH 459/529] Port from_nnvm to NNVM as to_relay (#2144)

---
 nnvm/python/nnvm/to_relay.py                | 506 ++++++++++++++++++++
 nnvm/tests/python/compiler/test_to_relay.py |  41 ++
 python/tvm/relay/frontend/common.py         |  55 ++-
 python/tvm/relay/frontend/mxnet.py          | 137 +-----
 python/tvm/relay/frontend/nnvm_common.py    | 132 +++++
 python/tvm/relay/op/_transform.py           |   1 +
 python/tvm/relay/op/nn/_nn.py               |   7 +-
 src/relay/backend/graph_plan_memory.cc      |   3 +
 src/relay/ir/alpha_equal.cc                 |  10 +-
 src/relay/op/nn/upsampling.cc               |  48 +-
 tests/python/relay/frontend/test_keras.py   | 332 +++++++++++++
 topi/include/topi/image/resize.h            |   3 +-
 12 files changed, 1116 insertions(+), 159 deletions(-)
 create mode 100644 nnvm/python/nnvm/to_relay.py
 create mode 100644 nnvm/tests/python/compiler/test_to_relay.py
 create mode 100644 python/tvm/relay/frontend/nnvm_common.py
 create mode 100644 tests/python/relay/frontend/test_keras.py

diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py
new file mode 100644
index 000000000000..318ff1ee92dd
--- /dev/null
+++ b/nnvm/python/nnvm/to_relay.py
@@ -0,0 +1,506 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-argument
+"""Convert an NNVM graph to Relay."""
+import json
+from tvm import relay, nd
+from tvm.relay import op, expr, var
+from tvm.relay.frontend.common import StrAttrsDict
+from tvm.relay.frontend.nnvm_common import _rename
+import numpy
+from .symbol import Symbol
+from .compiler import graph_attr
+from .graph import create as graph_create
+
+def _nn_batch_flatten(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    return op.nn.batch_flatten(children[0])
+
+
+def _dense(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', True)
+    units = attrs.get_int('units')
+    dense = op.nn.dense(children[0], children[1], units=units)
+    if use_bias:
+        return op.nn.bias_add(dense, children[2])
+    else:
+        return dense
+
+def _nn_softmax(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    axis = attrs.get_int('axis', 1)
+    return op.nn.softmax(children[0], axis)
+
+def _conv2d(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', False)
+
+    if use_bias:
+        data, weight, bias = children
+    else:
+        data, weight = children
+
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    dilation = attrs.get_int_tuple('dilation', (1, 1))
+    groups = attrs.get_int('groups', 1)
+    data_layout = attrs.get_str('layout', 'NCHW')
+    weight_layout = attrs.get_str('kernel_layout', 'OIHW')
+    out_layout = ''
+    out_dtype = attrs.get_str('out_dtype', '')
+
+    conv_out = op.nn.conv2d(
+        data,
+        weight,
+        strides=strides,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        data_layout=data_layout,
+        weight_layout=weight_layout,
+        out_layout=out_layout,
+        out_dtype=out_dtype)
+
+    if use_bias:
+        return op.nn.bias_add(conv_out, bias)
+    else:
+        return conv_out
+
+
+def _conv2d_transpose(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', False)
+
+    if use_bias:
+        data, weight, bias = children
+    else:
+        data, weight = children
+
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    dilation = attrs.get_int_tuple('dilation', (1, 1))
+    groups = attrs.get_int('groups', 1)
+    data_layout = attrs.get_str('layout', 'NCHW')
+    weight_layout = attrs.get_str('kernel_layout', 'OIHW')
+    out_dtype = attrs.get_str('out_dtype', '')
+
+    out_conv2d = op.nn.conv2d_transpose(
+        data,
+        weight,
+        strides=strides,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        data_layout=data_layout,
+        weight_layout=weight_layout,
+        out_dtype=out_dtype)
+
+    if use_bias:
+        return op.nn.bias_add(out_conv2d, bias)
+    else:
+        return out_conv2d
+
+
+def _batch_norm(children, attrs, odtype='float32'):
+    data, gamma, beta, moving_mean, moving_view = children
+    axis = attrs.get_int('axis', 1)
+    epsilon = attrs.get_float('epsilon', 1e-05)
+    center = attrs.get_bool('center', True)
+    scale = attrs.get_bool('scale', True)
+
+    return op.nn.batch_norm(
+        data,
+        gamma,
+        beta,
+        moving_mean,
+        moving_view,
+        axis=axis,
+        epsilon=epsilon,
+        center=center,
+        scale=scale)[0]
+
+
+def _max_pool2d(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    data = children[0]
+    pool_size = attrs.get_int_tuple('pool_size', (1, 1))
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    layout = attrs.get_int_tuple('layout', 'NCHW')
+    ceil_mode = attrs.get_bool('ceil_mode', False)
+
+    return op.nn.max_pool2d(
+        data,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        layout=layout,
+        ceil_mode=ceil_mode)
+
+
+def _reshape(children, attrs, odtype='float32'):
+    data = children[0]
+    shape = attrs.get_int_list('shape')
+    return op.reshape(data, shape)
+
+
+def _transpose(children, attrs, odtype='float32'):
+    axes = attrs.get_int_list('axes', None)
+    return op.transpose(children[0], axes=axes)
+
+
+def _add(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.add(left, right)
+
+
+def _subtract(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.subtract(left, right)
+
+
+def _rsubtract(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.subtract(right, left)
+
+
+def _multiply(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.multiply(left, right)
+
+
+def _divide(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.divide(left, right)
+
+
+def _rshift(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype='int32')
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.right_shift(left, right)
+
+
+def _clip(children, attrs, odtype='float32'):
+    a_min = attrs.get_float('a_min')
+    a_max = attrs.get_float('a_max')
+    return op.clip(children[0], a_min, a_max)
+
+
+def _cast(children, attrs, odtype='float32'):
+    data = children[0]
+    dtype = attrs.get_str('dtype')
+    return data.astype(dtype)
+
+
+def _expand_dims(children, attrs, odtype='float32'):
+    data = children[0]
+    axis = attrs.get_int('axis')
+    num_newaxis = attrs.get_int('num_newaxis', 1)
+    return op.transform.expand_dims(data, axis, num_newaxis=num_newaxis)
+
+
+def broadcast_to(children, attrs, odtype='float32'):
+    # TODO(@jroesch) export broadcast to?
+    data = children[0]
+    shape = attrs.get_int_tuple('shape')
+    array = numpy.zeros(shape).astype(odtype)
+    rconst = relay.Constant(nd.array(array))
+    return op.broadcast_to_like(data, rconst)
+
+def _copy(children, attrs, odtype='float32'):
+    return op.copy(children[0])
+
+
+def _global_avg_pool2d(children, attrs, odtype='float32'):
+    data = children[0]
+    layout = attrs.get_str('layout', "NCHW")
+    return op.nn.global_avg_pool2d(data, layout)
+
+
+def _avg_pool2d(children, attrs, odtype='float32'):
+    data = children[0]
+    pool_size = attrs.get_int_tuple('pool_size', (1, 1))
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    layout = attrs.get_str('layout', "NCHW")
+    ceil_mode = attrs.get_bool('ceil_mode', False)
+    count_include_pad = attrs.get_bool('layout', False)
+    return op.nn.avg_pool2d(
+        data,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        layout=layout,
+        ceil_mode=ceil_mode,
+        count_include_pad=count_include_pad)
+
+
+def _upsampling(children, attrs, odtype='float32'):
+    scale = attrs.get_int('scale')
+    layout = attrs.get_str('layout', 'NCHW')
+    method = attrs.get_str('method', 'NEAREST_NEIGHBOR')
+    return op.nn.upsampling(
+        children[0],
+        scale=scale,
+        layout=layout,
+        method=method)
+
+
+def _pad(children, attrs, odtype='float32'):
+    pad_value = attrs.get_float('pad_value', 0.0)
+    pad_width = attrs.get_tuple_tuple_int('pad_width')
+    return op.nn.pad(children[0], pad_width, pad_value=pad_value)
+
+def _leaky_relu(children, attrs, odtype='float32'):
+    alpha = attrs.get_float('alpha')
+    return op.nn.leaky_relu(children[0], alpha)
+
+
+def _full_like(children, attrs, odtype='float32'):
+    fill_value = relay.const(attrs.get_float('fill_value'), dtype='float32')
+    return op.full_like(children[0], fill_value)
+
+
+def _greater(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type')
+    if out_type:
+        return op.greater(children[0], children[1]).astype(out_type)
+    else:
+        return op.greater(children[0], children[1])
+
+
+def _greater_equal(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.greater_equal(children[0], children[1]).astype(out_type)
+    else:
+        return op.greater_equal(children[0], children[1])
+
+
+def _less(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.less(children[0], children[1]).astype(out_type)
+    else:
+        return op.less(children[0], children[1])
+
+
+def _less_equal(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.less_equal(children[0], children[1]).astype(out_type)
+    else:
+        return op.less_equal(children[0], children[1])
+
+
+def _strided_slice(children, attrs, odtype='float32'):
+    begin = attrs.get_int_list('begin')
+    end = attrs.get_int_list('end')
+    strides = attrs.get_int_list('strides', None)
+    return op.strided_slice(children[0], begin, end, strides=strides)
+
+
+def _split(children, attrs, odtype='float32'):
+    indices_or_sections = None
+    try:
+        indices_or_sections = attrs.get_int('indices_or_sections', None)
+    except ValueError:
+        indices_or_sections = indices_or_sections or attrs.get_int_tuple(
+            'indices_or_sections')
+
+    axis = attrs.get_int('axis', 0)
+
+    return op.split(children[0], indices_or_sections, axis)
+
+def _squeeze(children, attrs, odtype='float32'):
+    axis = None
+    try:
+        axis = [attrs.get_int('axis', None)]
+    except ValueError:
+        axis = axis or attrs.get_int_tuple('axis', None)
+
+    return op.squeeze(children[0], axis)
+
+NNVM_OP_2_RELAY_OP = {
+    'flatten': _nn_batch_flatten,
+    'dense': _dense,
+    'softmax': _nn_softmax,
+    'conv2d': _conv2d,
+    'batch_norm': _batch_norm,
+    'max_pool2d': _max_pool2d,
+    'reshape': _reshape,
+    'transpose': _transpose,
+    # Addition
+    '__add_scalar__': _add,
+    'broadcast_add': _add,
+    'elemwise_add': _add,
+    # Subtraction
+    '__sub_scalar__': _subtract,
+    '__rsub_scalar__': _rsubtract,
+    'broadcast_sub': _subtract,
+    'elemwise_sub': _subtract,
+    # Multiply
+    '__mul_scalar__': _multiply,
+    'broadcast_mul': _multiply,
+    'elemwise_mul': _multiply,
+    # Division
+    '__div_scalar__': _divide,
+    'broadcast_div': _divide,
+    'elemwise_div': _divide,
+    # Negative
+    'negative': _rename("negative"),
+
+    # Comparsion
+    'greater': _greater,
+    'greater_equal': _greater_equal,
+    'less': _less,
+    'less_equal': _less_equal,
+
+    # Activations
+    'sigmoid': _rename('sigmoid'),
+    'relu': _rename('nn.relu'),
+    'exp': _rename('exp'),
+    'log': _rename('log'),
+    'tanh': _rename('tanh'),
+    'leaky_relu': _leaky_relu,
+    'clip': _clip,
+    'round': _rename('round'),
+    'cast': _cast,
+    'expand_dims': _expand_dims,
+    'broadcast_to': broadcast_to,
+    '__rshift_scalar__': _rshift,
+    'copy': _copy,
+    'global_avg_pool2d': _global_avg_pool2d,
+    'avg_pool2d': _avg_pool2d,
+    'conv2d_transpose': _conv2d_transpose,
+    'upsampling': _upsampling,
+    'pad': _pad,
+    'full_like': _full_like,
+    'strided_slice': _strided_slice,
+    'split': _split,
+    'squeeze': _squeeze,
+}
+
+
+def to_relay(graph, shape_dict, dtype_dict, params):
+    """Convert an NNVM graph into the corresponding Relay expression.
+
+    Parameters
+    ----------
+    graph : Graph
+       The input graph.
+
+    shape_dict : dict of str to shape
+       The input shape.
+
+    dtype_dict : dict of str to shape
+       The input shape.
+
+    params : dict of str to array
+        The parameters.
+
+    Returns
+    -------
+    (expr, params) : Tuple[relay.Expr, dict of str to array]
+        The corresponding Relay expression and parameters.
+    """
+    if isinstance(graph, Symbol):
+        graph = graph_create(graph)
+
+    param_shapes = dict((k, params[k].shape) for k in params)
+    shape_dict = shape_dict.copy()
+    shape_dict.update(param_shapes)
+    graph = graph_attr.set_shape_inputs(graph, shape_dict)
+    graph = graph_attr.set_dtype_inputs(graph, dtype_dict)
+    graph = graph.apply(["InferShape", "InferType"])
+    shape = graph.json_attr("shape")
+    dtype = [graph_attr.TCODE_TO_DTYPE[di] for di in graph.json_attr("dtype")]
+    heads = [x[0] for x in json.loads(graph.json())['heads']]
+
+    gidx = graph.index
+    relay_map = {}
+    fn_params = []
+    output_ids = []
+
+    for nid, node in enumerate(gidx.nodes):
+        children = []
+        for i in node['inputs']:
+            child = relay_map[i[0]]
+            if isinstance(child, expr.TupleWrapper):
+                children.append(child[i[1]])
+            else:
+                children.append(child)
+
+        oshape = shape[gidx.entry_id(nid, 0)]
+        odtype = dtype[gidx.entry_id(nid, 0)]
+        attrs = node.get("attrs", {})
+        node_name = node["name"]
+        op_name = node["op"]
+
+        if op_name == "null":
+            v = var(node_name, shape=oshape, dtype=odtype)
+            fn_params.append(v)
+            relay_map[nid] = v
+        else:
+            if nid in heads:
+                output_ids.append(nid)
+
+            if op_name in NNVM_OP_2_RELAY_OP:
+                str_attrs = StrAttrsDict(attrs)
+                call = NNVM_OP_2_RELAY_OP[op_name](children, str_attrs, odtype)
+                relay_map[nid] = call
+            else:
+                raise Exception(
+                    "nnvm.to_relay: unsupported operator: {0}".format(op_name))
+
+    outputs = [relay_map[nid] for nid in output_ids]
+    if len(outputs) == 1:
+        body = outputs[0]
+    else:
+        body = expr.Tuple(outputs)
+
+    func = relay.Function(fn_params, body)
+    return func, params
diff --git a/nnvm/tests/python/compiler/test_to_relay.py b/nnvm/tests/python/compiler/test_to_relay.py
new file mode 100644
index 000000000000..25037cfd3587
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_to_relay.py
@@ -0,0 +1,41 @@
+import nnvm
+from nnvm import testing
+from nnvm import to_relay
+import tvm
+from tvm.relay import ir_pass
+from tvm.relay import create_executor
+from tvm.contrib import graph_runtime
+import numpy as np
+
+def check_model(sym, shapes, dtypes, params):
+    net = nnvm.graph.create(sym)
+    graph_json, mod, params = nnvm.compiler.build(
+        net,
+        'llvm',
+        shape=shapes,
+        dtype=dtypes,
+        params=params)
+    nnvm_rts = graph_runtime.create(graph_json, mod, tvm.cpu(0))
+    inputs = {}
+    for name in shapes:
+        np_array = np.random.rand(*shapes[name]).astype('float32')
+        inputs[name] = tvm.nd.array(np_array)
+
+    nnvm_rts.set_input(**params)
+    nnvm_rts.run(**inputs)
+    nnvm_out = nnvm_rts.get_output(0)
+    relay_model, params = to_relay.to_relay(net, shapes, dtypes, params)
+    relay_model = ir_pass.infer_type(relay_model)
+    relay_rts = create_executor(kind='graph', ctx=tvm.cpu(0), target='llvm')
+    inputs.update(params)
+    relay_out = relay_rts.evaluate(relay_model)(*list(inputs.values()))
+    np.testing.assert_allclose(nnvm_out.asnumpy(), relay_out.asnumpy())
+
+# def test_mlp():
+#     mlp, params = testing.mlp.get_workload(1)
+#     shapes =  { "data": (10, 3, 224, 224) }
+#     dtypes =  { "data": 'float32' }
+#     check_model(mlp, shapes, dtypes, params)
+
+if __name__ == "__main__":
+    test_mlp()
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 8e037d4bc554..95633a4d4586 100644
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -101,11 +101,64 @@ def get_int_tuple(self, key, default=RequiredAttr()):
         """
         if key in self.attrs:
             tshape = self.attrs[key]
-            return tuple(int(x.strip()) for x in tshape.strip('()').split(','))
+            return tuple(int(x.strip()) for x in tshape.strip('()[]').split(','))
         if isinstance(default, RequiredAttr):
             raise AttributeError("Required attribute {} not found.".format(key))
         return default
 
+    def get_tuple_tuple_int(self, key, default=RequiredAttr()):
+        """Get int list attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            value = self.attrs[key]
+            seq = []
+            for tup in value.strip('()').split('),'):
+                tup = tup.strip('[]()')
+                els = [int(x.strip('( ')) for x in tup.split(',')]
+                seq.append(tuple(els))
+
+            return tuple(seq)
+
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int_list(self, key, default=RequiredAttr()):
+        """Get int list attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(int(x.strip()) for x in tshape.strip('[]()').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+
+
     def get_bool(self, key, default=RequiredAttr()):
         """Get bool tuple attribute
 
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index b0b1e700987c..77e97d26efe0 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -8,138 +8,14 @@
 from .. import op as _op
 from ... import nd as _nd
 from .common import StrAttrsDict
+from .nnvm_common import _rename, _binop_scalar, _rbinop_scalar, _reduce
+from .nnvm_common import _arg_reduce, _init_op, _softmax_op, _cast
+from .nnvm_common import _clip, _transpose, _upsampling
+from .nnvm_common import _elemwise_sum, _reshape
+from .nnvm_common import _warn_not_used
 
 __all__ = ['from_mxnet']
 
-
-def _get_relay_op(op_name):
-    op = getattr(_op, op_name)
-    if not op:
-        raise RuntimeError("Unable to map op_name {} to relay".format(op_name))
-    return op
-
-
-def _warn_not_used(attr, op='nnvm'):
-    import warnings
-    err = "{} is ignored in {}.".format(attr, op)
-    warnings.warn(err)
-
-
-def _rename(new_op):
-    if isinstance(new_op, str):
-        new_op = _get_relay_op(new_op)
-    # attrs are ignored.
-    def impl(inputs, _):
-        return new_op(*inputs)
-    return impl
-
-
-def _reshape(inputs, attrs):
-    if attrs.get_bool("reverse", False):
-        raise RuntimeError("reshape do not support option reverse")
-    shape = attrs.get_int_tuple("shape")
-    return _op.reshape(inputs[0], newshape=shape)
-
-
-def _init_op(new_op):
-    """Init ops like zeros/ones"""
-    def _impl(inputs, attrs):
-        assert len(inputs) == 0
-        shape = attrs.get_int_tuple("shape")
-        dtype = attrs.get_str("dtype", "float32")
-        return new_op(shape=shape, dtype=dtype)
-    return _impl
-
-
-def _softmax_op(new_op):
-    """softmax/log_softmax"""
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        axis = attrs.get_int("axis", -1)
-        return new_op(inputs[0], axis=axis)
-    return _impl
-
-
-def _reduce(new_op):
-    """Reduction ops like sum/min/max"""
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        axis = attrs.get_int_tuple("axis", [])
-        keepdims = attrs.get_bool("keepdims", False)
-        # use None for reduce over all axis.
-        axis = None if len(axis) == 0 else axis
-        return new_op(inputs[0], axis=axis, keepdims=keepdims)
-    return _impl
-
-
-def _arg_reduce(new_op):
-    """Arg Reduction ops like argmin/argmax"""
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        axis = attrs.get_int("axis", None)
-        keepdims = attrs.get_bool("keepdims", False)
-        res = new_op(inputs[0], axis=[axis], keepdims=keepdims)
-        # cast to dtype.
-        res = res.astype("float32")
-        return res
-    return _impl
-
-
-def _cast(inputs, attrs):
-    """Type cast"""
-    dtype = attrs.get_str("dtype")
-    return _op.cast(inputs[0], dtype=dtype)
-
-
-def _clip(inputs, attrs):
-    a_min = attrs.get_float("a_min")
-    a_max = attrs.get_float("a_max")
-    return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
-
-
-def _transpose(inputs, attrs):
-    axes = attrs.get_int_tuple("axes", None)
-    # translate default case
-    axes = None if len(axes) == 0 else axes
-    return _op.transpose(inputs[0], axes=axes)
-
-
-def _upsampling(inputs, attrs):
-    scale = attrs.get_int("scale")
-    return _op.nn.upsampling(inputs[0], scale=scale)
-
-
-def _elemwise_sum(inputs, _):
-    assert len(inputs) > 0
-    res = inputs[0]
-    for x in inputs[1:]:
-        res = _op.add(res, x)
-    return res
-
-
-def _binop_scalar(new_op):
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        scalar = attrs.get_float("scalar")
-        # Note: binary scalar only works for float op for now
-        scalar = _expr.const(scalar, dtype="float32")
-        return new_op(inputs[0], scalar)
-    return _impl
-
-
-def _rbinop_scalar(new_op):
-    def _impl(inputs, attrs):
-        assert len(inputs) == 1
-        scalar = attrs.get_float("scalar")
-        # Note: binary scalar only works for float op for now
-        scalar = _expr.const(scalar, dtype="float32")
-        return new_op(scalar, inputs[0])
-    return _impl
-
-# All the functions with _mx prefix specific to MXNet.
-# The functions without _mx prefix can be reused for
-# NNVMv1 conversion to _op.
-
 def _mx_fully_connected(inputs, attrs):
     import mxnet as mx
     units = attrs.get_int("num_hidden")
@@ -493,6 +369,7 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info):
     jnodes = jgraph["nodes"]
     node_map = {}
 
+
     for nid, node in enumerate(jnodes):
         children = [node_map[e[0]][e[1]] for e in node["inputs"]]
         attrs = StrAttrsDict(node.get("attrs", {}))
@@ -501,7 +378,7 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info):
         if op_name == "null":
             shape = shape_dict[node_name] if node_name in shape_dict else None
             if isinstance(dtype_info, dict):
-                dtype = dtype_info[node_name] if node_name in dtype_dict else "float32"
+                dtype = dtype_info[node_name] if node_name in dtype_info else "float32"
             else:
                 dtype = dtype_info
             node_map[nid] = [_expr.var(node_name, shape=shape, dtype=dtype)]
diff --git a/python/tvm/relay/frontend/nnvm_common.py b/python/tvm/relay/frontend/nnvm_common.py
new file mode 100644
index 000000000000..17502dbaa090
--- /dev/null
+++ b/python/tvm/relay/frontend/nnvm_common.py
@@ -0,0 +1,132 @@
+# pylint: disable=invalid-name, import-self, len-as-condition
+"""Utility functions common to NNVM and MxNet conversion."""
+from __future__ import absolute_import as _abs
+
+from .. import expr as _expr
+from .. import op as _op
+
+def _get_relay_op(op_name):
+    op = _op
+    for path in op_name.split("."):
+        op = getattr(op, path)
+    if not op:
+        raise RuntimeError("Unable to map op_name {} to relay".format(op_name))
+    return op
+
+
+def _warn_not_used(attr, op='nnvm'):
+    import warnings
+    err = "{} is ignored in {}.".format(attr, op)
+    warnings.warn(err)
+
+
+def _rename(new_op):
+    if isinstance(new_op, str):
+        new_op = _get_relay_op(new_op)
+    # attrs are ignored.
+    def impl(inputs, _, _dtype='float32'):
+        return new_op(*inputs)
+    return impl
+
+
+def _reshape(inputs, attrs):
+    if attrs.get_bool("reverse", False):
+        raise RuntimeError("reshape do not support option reverse")
+    shape = attrs.get_int_tuple("shape")
+    return _op.reshape(inputs[0], newshape=shape)
+
+
+def _init_op(new_op):
+    """Init ops like zeros/ones"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 0
+        shape = attrs.get_int_tuple("shape")
+        dtype = attrs.get_str("dtype", "float32")
+        return new_op(shape=shape, dtype=dtype)
+    return _impl
+
+
+def _softmax_op(new_op):
+    """softmax/log_softmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", -1)
+        return new_op(inputs[0], axis=axis)
+    return _impl
+
+
+def _reduce(new_op):
+    """Reduction ops like sum/min/max"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int_tuple("axis", [])
+        keepdims = attrs.get_bool("keepdims", False)
+        # use None for reduce over all axis.
+        axis = None if len(axis) == 0 else axis
+        return new_op(inputs[0], axis=axis, keepdims=keepdims)
+    return _impl
+
+
+def _arg_reduce(new_op):
+    """Arg Reduction ops like argmin/argmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", None)
+        keepdims = attrs.get_bool("keepdims", False)
+        res = new_op(inputs[0], axis=[axis], keepdims=keepdims)
+        # cast to dtype.
+        res = res.astype("float32")
+        return res
+    return _impl
+
+
+def _cast(inputs, attrs):
+    """Type cast"""
+    dtype = attrs.get_str("dtype")
+    return inputs[0].astype(dtype=dtype)
+
+
+def _clip(inputs, attrs):
+    a_min = attrs.get_float("a_min")
+    a_max = attrs.get_float("a_max")
+    return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
+
+
+def _transpose(inputs, attrs):
+    axes = attrs.get_int_tuple("axes", None)
+    # translate default case
+    axes = None if len(axes) == 0 else axes
+    return _op.transpose(inputs[0], axes=axes)
+
+
+def _upsampling(inputs, attrs):
+    scale = attrs.get_int("scale")
+    return _op.nn.upsampling(inputs[0], scale=scale)
+
+
+def _elemwise_sum(inputs, _):
+    assert len(inputs) > 0
+    res = inputs[0]
+    for x in inputs[1:]:
+        res = _op.add(res, x)
+    return res
+
+
+def _binop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(inputs[0], scalar)
+    return _impl
+
+
+def _rbinop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(scalar, inputs[0])
+    return _impl
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 1aaf376a7dc8..c1e71e9133ea 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -9,6 +9,7 @@
 schedule_injective = _reg.schedule_injective
 schedule_broadcast = _reg.schedule_injective
 
+
 _reg.register_schedule("collapse_sum_like", _schedule_reduce)
 _reg.register_schedule("broadcast_to_like", schedule_broadcast)
 _reg.register_schedule("expand_dims", schedule_broadcast)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 007888996ed5..f5f76e6af38a 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -243,14 +243,11 @@ def schedule_l2_normalize(attrs, outs, target):
 
 reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE)
 
-
-@reg.register_schedule("nn.upsampling")
+# Upsampling
+reg.register_schedule("nn.upsampling", reg.schedule_injective)
 def schedule_upsampling(_, outs, target):
     """Schedule definition of upsampling"""
     with target:
         return topi.generic.schedule_injective(outs)
-
-reg.register_pattern("nn.upsampling", OpPattern.INJECTIVE)
-
 # pad
 reg.register_schedule("nn.pad", schedule_broadcast)
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 5001e2cd4fea..4a5aa4ea0a33 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -253,6 +253,9 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     size_t size = 1;
     for (IndexExpr dim : ttype->shape) {
       const int64_t* pval = as_const_int(dim);
+      CHECK_GE(*pval, 0) <<
+        "can not allocate memory for tensor with negative shape" <<
+        *pval;
       CHECK(pval != nullptr)
           << "Cannot allocate memory symbolic tensor shape "
           << ttype->shape;
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index 16af572a9d6f..064343c834ea 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -13,7 +13,7 @@
 namespace tvm {
 namespace relay {
 
-// Alpha equal handler for relay.
+// Alpha Equal handler for Relay.
 class AlphaEqualHandler:
       public AttrsEqualHandler,
       public TypeFunctor<bool(const Type&, const Type&)>,
@@ -26,7 +26,7 @@ class AlphaEqualHandler:
    * Check equality of two nodes.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return The compare result.
+   * \return The comparison result.
    */
   bool Equal(const NodeRef& lhs, const NodeRef& rhs) {
     if (lhs.same_as(rhs)) return true;
@@ -46,7 +46,7 @@ class AlphaEqualHandler:
    * Check equality of two attributes.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return The compare result.
+   * \return The comparison result.
    */
   bool AttrEqual(const NodeRef& lhs, const NodeRef& rhs) {
     return AttrsEqualHandler::Equal(lhs, rhs);
@@ -55,7 +55,7 @@ class AlphaEqualHandler:
    * Check equality of two types.
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return The compare result.
+   * \return the comparison result.
    */
   bool TypeEqual(const Type& lhs, const Type& rhs) {
     if (lhs.same_as(rhs)) return true;
@@ -72,7 +72,7 @@ class AlphaEqualHandler:
    *
    * \param lhs The left hand operand.
    * \param rhs The right hand operand.
-   * \return The compare result.
+   * \return The comparison result.
    */
   bool ExprEqual(const Expr& lhs, const Expr& rhs) {
     if (lhs.same_as(rhs)) return true;
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
index 6a98d2884621..d386437ae15b 100644
--- a/src/relay/op/nn/upsampling.cc
+++ b/src/relay/op/nn/upsampling.cc
@@ -6,8 +6,11 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/op_attr_types.h>
+#include <tvm/build_module.h>
 #include <topi/elemwise.h>
 #include <topi/nn/upsampling.h>
+#include <vector>
+#include "../op_common.h"
 #include "../layout.h"
 
 namespace tvm {
@@ -86,26 +89,37 @@ RELAY_REGISTER_OP("nn.upsampling")
 .add_argument("data", "Tensor", "The input tensor.")
 .set_support_level(2)
 .add_type_rel("UpSampling", UpSamplingRel)
+.set_attr<TOpPattern>("TOpPattern", kInjective)
 .set_attr<FTVMCompute>(
   "FTVMCompute", [](const Attrs& attrs,
-          const Array<Tensor>& inputs,
-          const Type& out_type,
-          const Target& target) {
-  const auto* param = attrs.as<UpSamplingAttrs>();
-  const auto* out_ttype = out_type.as<TensorTypeNode>();
-  CHECK(param != nullptr);
-  CHECK(param->layout == "NCHW" || param->layout == "NHWC");
-  CHECK(out_ttype != nullptr);
-  Array<IndexExpr> oshape;
-  if (param->layout == "NCHW") {
-    oshape.push_back(out_ttype->shape[2]);
-    oshape.push_back(out_ttype->shape[3]);
-  } else if (param->layout == "NHWC") {
-    oshape.push_back(out_ttype->shape[1]);
-    oshape.push_back(out_ttype->shape[2]);
-  }
-  return Array<Tensor>{ topi::nn::upsampling(inputs[0], oshape, param->layout, param->method)};
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* uattrs = attrs.as<UpSamplingAttrs>();
+    CHECK(uattrs != nullptr);
+    auto out_tt = out_type.as<TensorTypeNode>();
+    CHECK(out_tt) << "expected a tensor type: " << out_type;
+    CHECK(uattrs->layout == "NCHW" || uattrs->layout == "NHWC")
+      << "unknown layout: " << uattrs->layout;
+
+    Array<HalideIR::Expr> oshape;
+    if (uattrs->layout == "NCHW") {
+      oshape.push_back(out_tt->shape[2]);
+      oshape.push_back(out_tt->shape[3]);
+    } else if (uattrs->layout == "NHWC") {
+      oshape.push_back(out_tt->shape[1]);
+      oshape.push_back(out_tt->shape[2]);
+    }
+
+    return Array<Tensor>{
+      topi::nn::upsampling(
+        inputs[0],
+        oshape,
+        uattrs->layout,
+        uattrs->method)
+    };
 });
 
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/frontend/test_keras.py b/tests/python/relay/frontend/test_keras.py
new file mode 100644
index 000000000000..f508c5b44310
--- /dev/null
+++ b/tests/python/relay/frontend/test_keras.py
@@ -0,0 +1,332 @@
+import numpy as np
+import nnvm
+from nnvm import to_relay
+import tvm
+from tvm import relay
+from tvm.contrib import graph_runtime
+from nnvm.testing.config import ctx_list
+import keras
+
+# prevent keras from using up all gpu memory
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.5
+set_session(tf.Session(config=config))
+
+
+def verify_keras_frontend(keras_model, need_transpose=True):
+    # Keras frontend currently supports tensorflow backend only.
+    assert(keras.backend.backend() == 'tensorflow')
+
+    in_shapes = []
+    for layer in keras_model._input_layers:
+        in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
+
+    def get_keras_output(xs, dtype='float32'):
+        return keras_model.predict(xs)
+
+    def get_tvm_output(xs, target, ctx, dtype='float32'):
+        sym, params = nnvm.frontend.from_keras(keras_model)
+        shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)}
+        with relay.build_module.build_config(opt_level=2):
+            func, params = to_relay.to_relay(sym, shape_dict, dtype, params)
+            graph, lib, params = relay.build(func, target='llvm', params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        for name, x in zip(keras_model.input_names, xs):
+            m.set_input(name, tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+
+        return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())]
+
+    def to_channels_first(arr):
+        return arr.transpose([0, -1] + list(range(1, arr.ndim - 1)))
+
+    def to_channels_last(arr):
+        return arr.transpose([0] + list(range(2, arr.ndim)) + [1])
+
+    xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
+    keras_out = get_keras_output(xs)
+
+    keras_out = keras_out if isinstance(keras_out, list) else [keras_out]
+    for target, ctx in ctx_list():
+        inputs = [to_channels_first(x) for x in xs] if need_transpose else xs
+        tvm_out = get_tvm_output(inputs, target, ctx)
+        for kout, tout in zip(keras_out, tvm_out):
+            if need_transpose:
+                tout = to_channels_last(tout)
+            tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
+
+def test_forward_elemwise_add():
+    r = []
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    r.append(x)
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(x)
+    r.append(x)
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(x)
+    # add two symbols
+    y = keras.layers.add([keras.layers.add([x, r[0]]), r[1]])
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+    # add three symbols
+    y = keras.layers.add([x, r[0], r[1]])
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_dense():
+    data = keras.layers.Input(shape=(32,32,1))
+    x = keras.layers.Flatten()(data)
+    x = keras.layers.Dropout(0.5)(x)
+    x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(x)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_pool():
+    data = keras.layers.Input(shape=(32,32,1))
+    # maxpool
+    x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+    # avgpool
+    y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_conv():
+    data = keras.layers.Input(shape=(32,32,3))
+    conv_funcs = [keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      strides=(2,2), padding='same'),
+                  keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      dilation_rate=(2,2), padding='same'),
+                  keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same'),
+                  keras.layers.Conv2DTranspose(filters=10, kernel_size=(3,3), padding='valid'),
+                  keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')]
+    for conv_func in conv_funcs:
+        x = conv_func(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
+
+
+def test_forward_upsample():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.UpSampling2D(size=(3,3))(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_reshape():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Reshape(target_shape=(32,32,3))(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_crop():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data)
+    x = keras.layers.Cropping2D(cropping=(1, 1))(x)
+    x = keras.layers.Cropping2D(cropping=1)(x)
+    x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x)
+    x = keras.layers.Cropping2D(cropping=(1, 0))(x)
+    x = keras.layers.Cropping2D(cropping=0)(x)
+    x = keras.layers.Add()([x, x])
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_vgg16():
+    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_xception():
+    keras_model = keras.applications.xception.Xception(include_top=True, weights='imagenet',
+        input_shape=(299,299,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_resnet50():
+    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_mobilenet():
+    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_activations():
+    data = keras.layers.Input(shape=(32,32,3))
+    weights = np.random.rand(1, 32, 32, 3)
+    act_funcs = [keras.layers.Activation('softmax'),
+                 keras.layers.Activation('softplus'),
+                 keras.layers.ReLU(),
+                 keras.layers.ReLU(max_value=6.),
+                 keras.layers.LeakyReLU(alpha=0.3),
+                 keras.layers.PReLU(weights=weights, alpha_initializer="zero"),
+                 keras.layers.ELU(alpha=0.5),
+                 keras.layers.Activation('selu'),
+                 keras.layers.ThresholdedReLU(theta=0.5),
+                 keras.layers.Activation('softsign'),
+                 keras.layers.Activation('hard_sigmoid'),
+                 keras.layers.Activation('sigmoid'),
+                 keras.layers.Activation('tanh'),
+                 keras.layers.Activation('linear')]
+    for act_func in act_funcs:
+        x = act_func(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
+
+
+def test_forward_multi_inputs():
+    data1 = keras.layers.Input(shape=(32,32,3))
+    data2 = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data1)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data2)
+    z = keras.layers.add([x, y])
+    z = keras.layers.GlobalAveragePooling2D()(z)
+    keras_model = keras.models.Model([data1, data2], z)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_multi_outputs():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, [x, y])
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_reuse_layers():
+    # reuse conv2d
+    data = keras.layers.Input(shape=(32,32,3))
+    conv2d = keras.layers.Conv2D(8, (3, 3), padding="same")
+    x = conv2d(data)
+    y = conv2d(data)
+    z = keras.layers.add([x, y])
+    z = keras.layers.GlobalAveragePooling2D()(z)
+    keras_model = keras.models.Model(data, z)
+    verify_keras_frontend(keras_model)
+
+    # reuse add
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    add = keras.layers.Add()
+    x = add([x, x])
+    x = add([x, x])
+    z = keras.layers.GlobalAveragePooling2D()(x)
+    keras_model = keras.models.Model(data, z)
+    verify_keras_frontend(keras_model)
+
+def _test_LSTM(inputs, hidden, return_state=True):
+    data = keras.layers.Input(shape=(1, inputs))
+    lstm_out = keras.layers.LSTM(hidden,
+                                 return_state=return_state,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    x = lstm_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_LSTM_MultiLayer(inputs, hidden):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.LSTM(hidden, return_state=True, return_sequences=True,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.LSTM(hidden, recurrent_activation='sigmoid',
+                               activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+
+def test_forward_LSTM():
+    # TODO(@jroesch): need to modify compile engine to fix return_state=True
+    _test_LSTM(8, 8, return_state=False)
+    _test_LSTM(4, 4, return_state=False)
+    _test_LSTM_MultiLayer(4, 4)
+
+def _test_RNN(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    rnn_out = keras.layers.SimpleRNN(units, return_state=True,
+                                 activation='tanh')
+    x = rnn_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_RNN_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.SimpleRNN(units, return_state=True, return_sequences=True,
+                                   activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.SimpleRNN(units, activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_RNN():
+    _test_RNN(2, 4)
+    _test_RNN(4, 3)
+    _test_RNN_MultiLayer(4, 12)
+
+def _test_GRU(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    gru_out = keras.layers.GRU(units,
+                               return_state=True,
+                               recurrent_activation='sigmoid',
+                               activation='tanh')
+    x = gru_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_GRU_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.GRU(units,
+                             return_state=True,
+                             return_sequences=True,
+                             recurrent_activation='sigmoid',
+                             activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.GRU(units, recurrent_activation='sigmoid',
+                              activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_GRU():
+    _test_GRU(2, 4)
+    _test_GRU(4, 3)
+    _test_GRU_MultiLayer(4, 4)
+
+if __name__ == '__main__':
+    test_forward_elemwise_add()
+    test_forward_activations()
+    test_forward_dense()
+    test_forward_pool()
+    test_forward_conv()
+    test_forward_upsample()
+    test_forward_reshape()
+    test_forward_crop()
+    test_forward_vgg16()
+    test_forward_xception()
+    test_forward_resnet50()
+    test_forward_mobilenet()
+    test_forward_multi_inputs()
+    test_forward_multi_outputs()
+    test_forward_reuse_layers()
+    test_forward_LSTM()
+    test_forward_RNN()
+    test_forward_GRU()
diff --git a/topi/include/topi/image/resize.h b/topi/include/topi/image/resize.h
index b6bd51ef0fd2..2ffe4f453ba2 100644
--- a/topi/include/topi/image/resize.h
+++ b/topi/include/topi/image/resize.h
@@ -12,6 +12,7 @@
 #include <algorithm>
 
 #include "topi/tags.h"
+#include "topi/elemwise.h"
 #include "topi/detail/ravel_unravel.h"
 #include "topi/detail/constant_utils.h"
 #include "tvm/tvm.h"
@@ -288,7 +289,7 @@ inline Tensor resize_bilinear_nchw(const Tensor& input,
 * \return A Tensor resized to given shape
 */
 inline Tensor resize_bilinear(const Tensor& input,
-                              const Array<Expr>& shape,
+                              const Array<tvm::Expr>& shape,
                               std::string layout = "NCHW",
                               bool align_corners = false,
                               std::string name = "tensor",

From d2bf9a21576ad5da7646cad90b4348970dd0d03c Mon Sep 17 00:00:00 2001
From: Zhebin Jin <zhebin.jzb@alibaba-inc.com>
Date: Wed, 5 Dec 2018 04:57:07 +0800
Subject: [PATCH 460/529] [DEBUG]Fix debugger message mess in
 display_debug_result (#2228)

Signed-off-by: Zhebin Jin <zhebin.jzb@alibaba-inc.com>
---
 python/tvm/contrib/debugger/debug_result.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 88874d138302..5b563c86e6e4 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -140,6 +140,7 @@ def display_debug_result(self):
             for j in range(num_outputs):
                 op = node['op']
                 if node['op'] == 'param':
+                    eid += 1
                     continue
                 name = node['name']
                 shape = str(self._output_tensor_list[eid].shape)

From 5062650284593311907396a9cb8ded18970d7127 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 5 Dec 2018 04:58:16 +0800
Subject: [PATCH 461/529] [RELAY][PASS] Check Positiveness in FoldScaleAxis
 (#2220)

---
 python/tvm/relay/build_module.py              |   5 +-
 src/relay/pass/fold_scale_axis.cc             |  61 +++++++++-
 src/relay/pass/pattern_util.h                 |  51 ++++++++
 .../python/relay/test_pass_fold_scale_axis.py | 112 +++++++++++++-----
 4 files changed, 193 insertions(+), 36 deletions(-)

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 7af22431aa81..5b05bc44551a 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -150,13 +150,14 @@ def optimize(func, params=None):
         func = ir_pass.infer_type(func)
         func = ir_pass.combine_parallel_conv2d(func)
 
+    if cfg.pass_enabled("FoldConstant"):
+        func = ir_pass.fold_constant(func)
+
     if cfg.pass_enabled("FoldScaleAxis"):
         func = ir_pass.infer_type(func)
         func = ir_pass.backward_fold_scale_axis(func)
         func = ir_pass.infer_type(func)
         func = ir_pass.forward_fold_scale_axis(func)
-
-    if cfg.pass_enabled("FoldConstant"):
         func = ir_pass.fold_constant(func)
 
     if cfg.pass_enabled("AlterOpLayout"):
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index 1cd6606bd5c1..9e9dd0604916 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -246,9 +246,44 @@ class ForwardPrep : private ExprVisitor {
 // Per operator defs for FScaleAxisForward
 //----------------------------------------------
 
+// Helper functions
+Expr GetForwardScale(const Expr& expr, AxesSet out) {
+  static const Op& multiply = Op::Get("multiply");
+  static const auto& fprep = Op::GetAttr<FForwardPrep>("FScaleAxisForwardPrep");
+
+  const CallNode* call = expr.as<CallNode>();
+  if (!call) return NullValue<Expr>();
+  auto f = fprep.get(call->op, nullptr);
+
+  if (call->op.same_as(multiply)) {
+    const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+    const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+    if (MatchBroadcastToLeftAxes(tlhs, trhs, out)) {
+      return call->args[1];
+    } else if (MatchBroadcastToLeftAxes(trhs, tlhs, out)) {
+      return call->args[0];
+    } else {
+      return NullValue<Expr>();
+    }
+  } else if (f != nullptr) {
+    Array<AxesSet> in_axes = f(GetRef<Call>(call), out);
+    for (size_t i = 0; i < call->args.size(); i++) {
+      auto scale = GetForwardScale(call->args[i], in_axes[i]);
+      if (scale.defined()) {
+        return scale;
+      }
+    }
+  }
+  return NullValue<Expr>();
+}
+
 // Intermediate operators
 Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
-  return {out};
+  Expr scale = GetForwardScale(call->args[0], out);
+  if (IsPositiveConstant(scale)) {
+    return {out};
+  }
+  return {NullValue<AxesSet>()};
 }
 
 Expr ReluForwardRewrite(const Call& ref_call,
@@ -755,6 +790,22 @@ RELAY_REGISTER_OP("subtract")
 RELAY_REGISTER_OP("subtract")
 .set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
 
+// Find relu in the backward path between multiply and conv2d
+bool FindBackwardRelu(const Expr& expr) {
+  const CallNode* call = expr.as<CallNode>();
+  static const Op& conv2d = Op::Get("nn.conv2d");
+  static const Op& relu = Op::Get("nn.relu");
+
+  if (!call) return false;
+  if (call->op.same_as(relu)) return true;
+  if (call->op.same_as(conv2d)) return false;
+
+  for (size_t i = 0; i < call->args.size(); i++) {
+    if (FindBackwardRelu(call->args[i])) return true;
+  }
+  return false;
+}
+
 // Producer operators
 // Multiply produces the scale-axis pair.
 Expr MultiplyBackwardTransform(const Call& call,
@@ -770,12 +821,16 @@ Expr MultiplyBackwardTransform(const Call& call,
     // NOTE we won't recursively call mutating on scale part.
     // since there  won't be scale chance within scale part.
     Expr rhs = call->args[1];
-    if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs)) {
+    if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs) &&
+        (!FindBackwardRelu(call->args[0]) ||
+         IsPositiveConstant(call->args[1]))) {
       return transformer->Transform(call->args[0], lhs_axes, rhs);
     }
   } else if (rhs_axes.defined() && rhs_axes.size() != 0) {
     Expr lhs = call->args[0];
-    if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs)) {
+    if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs) &&
+        (!FindBackwardRelu(call->args[1]) ||
+         IsPositiveConstant(call->args[0]))) {
       return transformer->Transform(call->args[1], rhs_axes, lhs);
     }
   }
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index e6e8415bd620..5d76efd0124d 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -190,6 +190,57 @@ Expr MakeConcatenate(Expr data, int axis);
 
 Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
 
+
+template <typename T>
+bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) {
+  CHECK_EQ(tensor->ctx.device_type, kDLCPU);
+  CHECK(tensor->strides == nullptr);
+  CHECK_EQ(tensor->byte_offset, 0);
+  const T* data = static_cast<const T*>(tensor->data);
+  int64_t num_elems = 1;
+  for (int i = 0; i < tensor->ndim; ++i) {
+    num_elems *= tensor->shape[i];
+  }
+
+  for (int64_t i = 0; i < num_elems; i++) {
+    if (*data < value) {
+      return false;
+    }
+    data++;
+  }
+  return true;
+}
+
+
+inline bool IsPositiveConstant(const Expr& expr) {
+  const auto* constant = expr.as<ConstantNode>();
+  if (!constant) return false;
+  const auto& tensor = constant->data;
+  const auto& dtype = tensor->dtype;
+
+  if (dtype.lanes != 1) {
+    // pass
+  } else if (dtype.code == kDLFloat && dtype.bits == 32) {
+    return IsNDArrayAllGreaterEqual<float>(tensor, 0);
+  } else if (dtype.code == kDLFloat && dtype.bits == 64) {
+    return IsNDArrayAllGreaterEqual<double>(tensor, 0);
+  } else if (dtype.code == kDLInt && dtype.bits == 8) {
+    return IsNDArrayAllGreaterEqual<int8_t>(tensor, 0);
+  } else if (dtype.code == kDLInt && dtype.bits == 32) {
+    return IsNDArrayAllGreaterEqual<int32_t>(tensor, 0);
+  } else if (dtype.code == kDLUInt && dtype.bits == 8) {
+    return IsNDArrayAllGreaterEqual<uint8_t>(tensor, 0);
+  } else if (dtype.code == kDLUInt && dtype.bits == 32) {
+    return IsNDArrayAllGreaterEqual<uint32_t>(tensor, 0);
+  }
+
+  LOG(WARNING) << "Unsupported data type (code = " << dtype.code
+               << ", bits = " << dtype.bits << ", lanes = " << dtype.lanes
+               << ")";
+  return false;
+}
+
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index e6e008f80d0c..f42aa7b7b8d0 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -1,11 +1,11 @@
 from tvm import relay
+import numpy as np
 
 
 def test_fold_fwd_simple():
     """Simple testcase."""
     def before(x, conv_weight, in_bias, in_scale, channels):
-        args = [x, conv_weight, in_bias, in_scale]
-        in_scale = relay.expand_dims(in_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, in_bias]
         in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
         x = relay.multiply(x, in_scale)
         x = relay.nn.relu(x)
@@ -18,8 +18,7 @@ def before(x, conv_weight, in_bias, in_scale, channels):
 
     def expected(x, conv_weight, in_bias, in_scale, channels):
         # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, in_bias, in_scale]
-        in_scale = relay.expand_dims(in_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, in_bias]
         in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
         squeezed_scale = relay.squeeze(in_scale, axis=[1,2])
         x = relay.nn.relu(x)
@@ -38,7 +37,7 @@ def check(shape, channels):
         in_channels = shape[1]
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.var("in_scale", shape=(in_channels,))
+        in_scale = relay.const(np.random.uniform(size=(in_channels, 1, 1)).astype('float32'))
 
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
@@ -56,7 +55,7 @@ def check(shape, channels):
 def test_fold_fwd_dual_path():
     """scale axis being consumed by two consumers"""
     def before(x, conv_weight, in_bias, in_scale, channels):
-        args = [x, conv_weight, in_bias, in_scale]
+        args = [x, conv_weight, in_bias]
         x = relay.multiply(in_scale, x)
         x = relay.nn.relu(x)
         x = relay.subtract(x, in_bias)
@@ -78,7 +77,7 @@ def before(x, conv_weight, in_bias, in_scale, channels):
         return relay.Function(args, z)
 
     def expected(x, conv_weight, in_bias, in_scale, channels):
-        args = [x, conv_weight, in_bias, in_scale]
+        args = [x, conv_weight, in_bias]
         x = relay.nn.relu(x)
         in_bias = relay.divide(in_bias, in_scale)
         x = relay.subtract(x, in_bias)
@@ -108,7 +107,7 @@ def check(shape, channels):
         assert in_channels == channels
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.var("in_scale", shape=(in_channels,))
+        in_scale = relay.const(np.random.uniform(size=(in_channels,)).astype("float32"))
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
@@ -142,7 +141,7 @@ def check(shape, channels):
         assert in_channels == channels
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.var("in_scale", shape=(in_channels,))
+        in_scale = relay.const(np.random.uniform(size=(in_channels,)).astype("float32"))
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
@@ -151,11 +150,42 @@ def check(shape, channels):
     check((2, 11, 10, 4), 4)
 
 
+def test_fold_fwd_relu_fail():
+    """testcase where we canont fold because scale can not pass relu"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        x = relay.multiply(x, in_scale)
+        xx = relay.nn.relu(x)
+        y1 = relay.nn.conv2d(xx, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             padding=(1, 1))
+        z = relay.add(y1, x)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def check(shape, channels, in_scale):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.var("in_scale", shape=(in_channels,))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    in_scale = relay.var("in_scale", shape=(4,))
+    check((2, 11, 10, 4), 4, in_scale)
+    in_scale = relay.const(np.random.uniform(size=(4,), low=-1.0, high=0.0)).astype("float32")
+    check((2, 11, 10, 4), 4, in_scale)
+
+
 def test_fold_bwd_simple():
     """Simple testcase."""
     def before(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
         y = relay.nn.conv2d(x, conv_weight,
                             channels=channels,
@@ -168,8 +198,7 @@ def before(x, conv_weight, out_bias, out_scale, channels):
 
     def expected(x, conv_weight, out_bias, out_scale, channels):
         # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
         squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
         conv_weight = relay.multiply(
@@ -190,7 +219,7 @@ def check(shape, channels):
         in_channels = shape[1]
         weight = relay.var("weight")
         out_bias = relay.var("out_bias", shape=(channels,))
-        out_scale = relay.var("out_scale", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
 
         y1 = before(x, weight, out_bias, out_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
@@ -208,9 +237,7 @@ def check(shape, channels):
 def test_fold_bwd_dual_path():
     """Dual path testcase."""
     def before(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
-        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         y1 = relay.nn.conv2d(x, conv_weight,
                              channels=channels,
                              kernel_size=(3, 3),
@@ -227,8 +254,7 @@ def before(x, conv_weight, out_bias, out_scale, channels):
 
     def expected(x, conv_weight, out_bias, out_scale, channels):
         # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
         squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
         def fold_conv_weight():
@@ -253,7 +279,7 @@ def check(shape, channels):
         in_channels = shape[1]
         weight = relay.var("weight")
         out_bias = relay.var("out_bias", shape=(channels,))
-        out_scale = relay.var("out_scale", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
 
         y1 = before(x, weight, out_bias, out_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
@@ -270,8 +296,7 @@ def check(shape, channels):
 
 def test_fold_bwd_dual_consumer():
     def before(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         y0 = relay.nn.conv2d(x, conv_weight,
                              channels=channels,
                              kernel_size=(3, 3),
@@ -298,8 +323,7 @@ def before(x, conv_weight, out_bias, out_scale, channels):
 
     def expected(x, conv_weight, out_bias, out_scale, channels):
         # use a fixed order of args so alpha equal check can pass
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         def fold_conv_weight():
             squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
             return  relay.multiply(
@@ -328,7 +352,7 @@ def check(shape, channels):
         in_channels = shape[1]
         weight = relay.var("weight")
         out_bias = relay.var("out_bias", shape=(channels,))
-        out_scale = relay.var("out_scale", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels,1, 1)).astype("float32"))
 
         y1 = before(x, weight, out_bias, out_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
@@ -346,8 +370,7 @@ def check(shape, channels):
 def test_fold_bwd_fail():
     """Dual path testcase."""
     def fail1(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
         y1 = relay.nn.conv2d(x, conv_weight,
                              channels=channels,
@@ -367,8 +390,7 @@ def fail1(x, conv_weight, out_bias, out_scale, channels):
         return relay.Function(args, y)
 
     def fail2(x, conv_weight, out_bias, out_scale, channels):
-        args = [x, conv_weight, out_bias, out_scale]
-        out_scale = relay.expand_dims(out_scale, axis=1, num_newaxis=2)
+        args = [x, conv_weight, out_bias]
         out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
         y1 = relay.nn.conv2d(x, conv_weight,
                              channels=channels,
@@ -380,13 +402,12 @@ def fail2(x, conv_weight, out_bias, out_scale, channels):
         y = relay.add(y1, y2)
         return relay.Function(args, y)
 
-
     def check(shape, channels, fbefore):
         x =  relay.var("x", shape=shape)
         in_channels = shape[1]
         weight = relay.var("weight")
         out_bias = relay.var("out_bias", shape=(channels,))
-        out_scale = relay.var("out_scale", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
         y1 = fbefore(x, weight, out_bias, out_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
@@ -396,11 +417,40 @@ def check(shape, channels, fbefore):
     check((4, 4, 10, 10), 4, fail2)
 
 
+def test_fold_bwd_relu_fail():
+    """testcase where we canont fold because scale can not pass relu"""
+    def before(x, conv_weight, out_scale, channels):
+        y = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NCHW",
+                             padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.multiply(x, out_scale)
+        return relay.Function(relay.ir_pass.free_vars(y), y)
+
+    def check(shape, channels, out_scale):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        y1 = before(x, weight, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    out_scale = relay.var("in_scale", shape=(4, 1, 1))
+    check((4, 4, 10, 10), 4, out_scale)
+    out_scale = relay.const(np.random.uniform(size=(4, 1, 1), low=-1.0, high=0.0)).astype("float32")
+    check((4, 4, 10, 10), 4, out_scale)
+
+
 if __name__ == "__main__":
     test_fold_fwd_simple()
     test_fold_fwd_dual_path()
     test_fold_fwd_fail()
+    test_fold_fwd_relu_fail()
     test_fold_bwd_simple()
     test_fold_bwd_dual_path()
     test_fold_bwd_dual_consumer()
     test_fold_bwd_fail()
+    test_fold_bwd_relu_fail()

From 19c3d0e0e1e654f03555bd5f0fbcc984a5e02744 Mon Sep 17 00:00:00 2001
From: Wu Zhao <wuzhaozju@gmail.com>
Date: Wed, 5 Dec 2018 15:56:08 +0800
Subject: [PATCH 462/529] Remove redact date_vec op

---
 topi/python/topi/arm_cpu/depthwise_conv2d.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index e486142b80e6..9706559cea69 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -221,7 +221,6 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     else:
         _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
 
-    _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
     data_pad = data_vec.op.input_tensors[0]
     if data_pad.op.name == "data_pad":
         assert isinstance(data_pad.op, tvm.tensor.ComputeOp)

From 71d642319fedf2b6dfe5fb727894ca20e788299a Mon Sep 17 00:00:00 2001
From: Denis Khalikov <dennis.khalikov@gmail.com>
Date: Wed, 5 Dec 2018 13:00:52 +0300
Subject: [PATCH 463/529] [TOPHUB] Set vulkan as alias for opencl (#2230)

---
 python/tvm/autotvm/tophub.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index d90fd76b2532..1d9684442a51 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -39,6 +39,7 @@ def _alias(name):
         'vtacpu': 'vta',
 
         'metal': 'opencl',
+        'vulkan': 'opencl',
         'nvptx': 'cuda',
     }
     return table.get(name, name)

From 68fdb340b9230743a1f1ef736f7cc44689d655f2 Mon Sep 17 00:00:00 2001
From: hlu1 <14827759+hlu1@users.noreply.github.com>
Date: Wed, 5 Dec 2018 18:28:18 -0800
Subject: [PATCH 464/529] [contrib][nnpack] remove training-optimized ops
 (#2224)

---
 python/tvm/contrib/nnpack.py          | 63 --------------------
 src/contrib/nnpack/convolution.cc     | 59 -------------------
 src/contrib/nnpack/fully_connected.cc | 33 -----------
 tests/python/contrib/test_nnpack.py   | 83 +--------------------------
 4 files changed, 2 insertions(+), 236 deletions(-)

diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py
index 3fb00a3f85e5..98367b4ef04e 100644
--- a/python/tvm/contrib/nnpack.py
+++ b/python/tvm/contrib/nnpack.py
@@ -34,30 +34,6 @@ def fully_connected_inference(lhs, rhs, nthreads=1):
             "tvm.contrib.nnpack.fully_connected_inference",
             ins[0], ins[1], outs[0], nthreads), name="C")
 
-def fully_connected_output(lhs, rhs, nthreads=1):
-    """Create an extern op that compute fully connected of 2D tensor lhs and
-    2D tensor rhs with nnpack.
-
-    Parameters
-    ----------
-    lhs : Tensor
-        lhs 2D matrix input[batch_size][input_channels] of FP32 elements
-    rhs : Tensor
-        lhs 2D matrix kernel[output_channels][input_channels] of FP32 elements
-
-    Returns
-    -------
-    C : Tensor
-        lhs 2D array out[batch_size][output_channels] of FP32 elements.
-    """
-    n = lhs.shape[0]
-    m = rhs.shape[0]
-    return _api.extern(
-        (n, m), [lhs, rhs],
-        lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.fully_connected_output",
-            ins[0], ins[1], outs[0], nthreads), name="C")
-
 
 class ConvolutionAlgorithm:
     AUTO = 0
@@ -204,43 +180,4 @@ def convolution_inference_weight_transform(
             "tvm.contrib.nnpack.convolution_inference_weight_transform",
             ins[0], outs[0], nthreads, algorithm), name="transform_kernel")
 
-def convolution_output(data, kernel, bias, padding, nthreads=1):
-    """Create an extern op to compute convolution of 4D tensor data and
-    4D tensor kernel and 1D tensor bias with nnpack.
-
-    Parameters
-    ----------
-    data : Tensor
-        data 4D tensor input[batch_size][input_channels][input_height]
-        [input_width] of FP32 elements.
-    kernel : Tensor
-        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
-        [kernel_width] of FP32 elements.
-    bias : Tensor
-        bias 1D array bias[output_channels][input_channels][kernel_height]
-        [kernel_width] of FP32 elements.
-    padding : list
-        padding A 4-dim list of [pad_top, pad_bottom, pad_left, pad_right],
-        which indicates the padding around the feature map.
-
-    Returns
-    -------
-    output : Tensor
-        output 4D tensor output[batch_size][output_channels][output_height]
-        [output_width] of FP32 elements.
-    """
-
-    assert isinstance(padding, list) and len(padding) == 4
-    batch, _, input_height, input_width = data.shape
-    output_channels, _, kernel_height, kernel_width = kernel.shape
-    output_height = (input_height + padding[0] + padding[1] - kernel_height) + 1
-    output_width = (input_width + padding[0] + padding[1] - kernel_width) + 1
-
-    return _api.extern(
-        (batch, output_channels, output_height, output_width), [data, kernel, bias],
-        lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.convolution_output", ins[0], ins[1], ins[2],
-            outs[0], padding[0], padding[1], padding[2], padding[3], nthreads), name="C")
-
-
 _init_api("tvm.contrib.nnpack")
diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc
index 8bcdd64281cc..e600360c67f1 100644
--- a/src/contrib/nnpack/convolution.cc
+++ b/src/contrib/nnpack/convolution.cc
@@ -215,64 +215,5 @@ TVM_REGISTER_GLOBAL(
           entry->threadpool, nullptr);
       CHECK_EQ(status, nnp_status_success);
     });
-
-
-TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* input  = args[0];
-    DLTensor* kernel = args[1];
-    DLTensor* bias   = args[2];
-    DLTensor* output = args[3];
-    uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
-    nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
-    NNPackConfig(args[8]);
-
-    CHECK_EQ(input->ndim, 4);
-    CHECK_EQ(kernel->ndim, 4);
-    CHECK_EQ(bias->ndim, 1);
-    CHECK_EQ(output->ndim, 4);
-
-    CHECK_EQ(input->shape[0], output->shape[0]);
-    size_t batch_size = input->shape[0];
-    CHECK_EQ(input->shape[1], kernel->shape[1]);
-    size_t input_channels = input->shape[1];
-    CHECK_EQ(output->shape[1], bias->shape[0]);
-    CHECK_EQ(output->shape[1], kernel->shape[0]);
-    size_t output_channels = output->shape[1];
-    nnp_size input_size{static_cast<size_t>(input->shape[2]),
-                        static_cast<size_t>(input->shape[3])};
-    nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
-                         static_cast<size_t>(kernel->shape[3])};
-
-    CHECK(input->strides == nullptr);
-    CHECK(kernel->strides == nullptr);
-    CHECK(bias->strides == nullptr);
-
-    CHECK(TypeMatch(input->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(output->dtype, kDLFloat, 32));
-
-    nnp_status status = nnp_convolution_output(nnp_convolution_algorithm_auto,
-                           batch_size,
-                           input_channels,
-                           output_channels,
-                           input_size,
-                           input_padding,
-                           kernel_size,
-                           static_cast<float*>(input->data),
-                           static_cast<float*>(kernel->data),
-                           static_cast<float*>(bias->data),
-                           static_cast<float*>(output->data),
-                           NULL,
-                           NULL,
-                           nnp_activation_identity,
-                           NULL,
-                           entry->threadpool,
-                           NULL);
-    CHECK_EQ(status, nnp_status_success);
-  });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc
index ad2569e1f2e1..80f981b29cf6 100644
--- a/src/contrib/nnpack/fully_connected.cc
+++ b/src/contrib/nnpack/fully_connected.cc
@@ -43,38 +43,5 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_inference")
                                   entry->threadpool);
   });
 
-
-TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_output")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    NNPackConfig(args[3]);
-
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK_EQ(B->shape[0], C->shape[1]);
-    CHECK_EQ(B->shape[1], A->shape[1]);
-    CHECK_EQ(A->shape[0], C->shape[0]);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
-
-    nnp_fully_connected_output(A->shape[0],
-                               B->shape[1],
-                               B->shape[0],
-                               static_cast<float*>(A->data),
-                               static_cast<float*>(B->data),
-                               static_cast<float*>(C->data),
-                               entry->threadpool,
-                               NULL);
-  });
-
 }  // namespace contrib
 }  // namespace tvm
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index a4b77a39af63..3ebea0e62ce3 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -3,38 +3,6 @@
 import scipy.signal
 from tvm.contrib import nnpack
 
-def test_fully_connected_output():
-    n = 1024
-    l = 128
-    m = 235
-    bias = tvm.var('bias', dtype=tvm.float32)
-    A = tvm.placeholder((n, l), name='A')
-    B = tvm.placeholder((m, l), name='B')
-    C = nnpack.fully_connected_output(A, B)
-    D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
-    s = tvm.create_schedule(D.op)
-
-    def verify(target="llvm"):
-        if not tvm.module.enabled(target):
-            print("skip because %s is not enabled..." % target)
-            return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_output", True):
-            print("skip because extern function is not available")
-            return
-        if not nnpack.is_available():
-            return
-
-        ctx = tvm.cpu(0)
-        f = tvm.build(s, [A, B, D, bias], target)
-        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), ctx)
-        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
-        bb = 10.0
-        f(a, b, d, bb)
-        tvm.testing.assert_allclose(
-            d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5)
-    verify()
-
 
 def test_fully_connected_inference():
     n = 1024
@@ -131,7 +99,7 @@ def verify(target="llvm",
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
+        if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference", True):
             print("skip because extern function is not available")
             return
         if not nnpack.is_available():
@@ -195,7 +163,7 @@ def verify(target="llvm",
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
+        if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference_without_weight_transform", True):
             print("skip because extern function is not available")
             return
         if not nnpack.is_available():
@@ -228,53 +196,6 @@ def verify(target="llvm",
         for with_bias in [True, False]:
             verify(algorithm=algorithm, with_bias=with_bias)
 
-def test_convolution_output():
-    BATCH = 32
-    IH = 48
-    IW = 48
-    IC = 16
-    OC = 16
-    K = 3
-    PAD = 1
-
-    OH = (IH + 2*PAD - K) + 1
-    OW = (IW + 2*PAD - K) + 1
-    dshape = (BATCH, IC, IH, IW)
-    kshape = (OC, IC, K, K)
-    bshape = (OC, )
-    oshape = (BATCH, OC, OH, OW)
-
-    data = tvm.placeholder(dshape, name='data')
-    kernel = tvm.placeholder(kshape, name='kernel')
-    bias = tvm.placeholder(bshape, name='bias')
-    output = nnpack.convolution_output(data, kernel, bias, [PAD, PAD, PAD, PAD])
-    s = tvm.create_schedule(output.op)
-
-    def verify(target="llvm"):
-        if not tvm.module.enabled(target):
-            print("skip because %s is not enabled..." % target)
-            return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not available")
-            return
-        if not nnpack.is_available():
-            return
-
-        ctx = tvm.cpu(0)
-        f = tvm.build(s, [data, kernel, bias, output], target)
-
-        na = np.random.uniform(size=dshape).astype(data.dtype)
-        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
-        nc = np.zeros(bshape, dtype=bias.dtype)
-        ta = tvm.nd.array(na, ctx)
-        tb = tvm.nd.array(nb, ctx)
-        tc = tvm.nd.array(nc, ctx)
-        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
-        f(ta, tb, tc, td)
-        nd = np_conv(na, nb, PAD)
-        tvm.testing.assert_allclose(
-            td.asnumpy(), nd, rtol=1e-5)
-    verify()
 
 if __name__ == "__main__":
     import nose

From 8afd9b546bb5a02ff28953a4d77eb9f249b9e5f5 Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Thu, 6 Dec 2018 11:41:00 +0800
Subject: [PATCH 465/529] [typo] fucntion ==> function (#2239)

fucntion ==> function
---
 include/tvm/runtime/c_backend_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index f55748e38289..4adc9e2790fe 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -118,7 +118,7 @@ TVM_DLL int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv);
 
 
 /*!
- * \brief Simple static initialization fucntion.
+ * \brief Simple static initialization function.
  *  Run f once and set handle to be not null.
  *  This function is mainly used for test purpose.
  *

From 2003619dee75c4b5cf8b14268ac30598973edd8a Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Thu, 6 Dec 2018 11:41:09 +0800
Subject: [PATCH 466/529] [typo] sin ==> in (#2238)

sin ==> in
---
 include/tvm/runtime/util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 5fd130737158..6ec168a250b6 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -16,7 +16,7 @@ namespace runtime {
  * \param t The type
  * \param code The type code.
  * \param bits The number of bits to be matched.
- * \param lanes The number of lanes sin the type.
+ * \param lanes The number of lanes in the type.
  */
 inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) {
   return t.code == code && t.bits == bits && t.lanes == lanes;

From 35cdddfa539596aeb6a6571716784ca0f12b033a Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Thu, 6 Dec 2018 11:41:53 +0800
Subject: [PATCH 467/529] fix dump ir (#2235)

---
 python/tvm/build_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 4068b1ce3a94..6117a963ae3a 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -5,8 +5,8 @@
 """
 from __future__ import absolute_import as _abs
 import warnings
-import types
 
+from ._ffi.function import Function
 from ._ffi.node import NodeBase, register_node
 from . import api
 from . import _api_internal
@@ -69,7 +69,7 @@ def recover():
             vset[k] = v
         for k, v in vset.items():
             self._recover_list.append(recover)
-            vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v
+            vset[k] = self.decorate(v) if isinstance(v, Function) else v
 
     def decorate_custompass(self, custom_pass):
         """decorate given list of custom passes, and return decorated passes"""

From 301f979d7341aff4ce7fdf9c49162d915ff93b88 Mon Sep 17 00:00:00 2001
From: Ruslan Baratov <ruslan_baratov@yahoo.com>
Date: Thu, 6 Dec 2018 14:30:30 +0000
Subject: [PATCH 468/529] Fix misprint (#2243)

---
 tutorials/cross_compilation_and_rpc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/cross_compilation_and_rpc.py b/tutorials/cross_compilation_and_rpc.py
index e0967d54b09c..6c23368f840a 100644
--- a/tutorials/cross_compilation_and_rpc.py
+++ b/tutorials/cross_compilation_and_rpc.py
@@ -211,7 +211,7 @@ def run_opencl():
     opencl_device_host = '10.77.1.145'
     opencl_device_port = 9090
 
-    # create scheule for the above "add one" compute decleration
+    # create schedule for the above "add one" compute declaration
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=32)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))

From 79b384652a07210e3ef2c5c3827d415873969838 Mon Sep 17 00:00:00 2001
From: Wu Zhao <FrozenGene@users.noreply.github.com>
Date: Thu, 6 Dec 2018 22:30:44 +0800
Subject: [PATCH 469/529] Add test case of argmax for detecting out of bound
 access (#2234)

---
 nnvm/tests/python/compiler/test_top_level4.py | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 46383e73657e..fc4e62fb7156 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -686,6 +686,28 @@ def test_where():
     y = np.random.uniform(size=shape).astype("float32")
     verify_where(condition, x, y)
 
+def test_argmax():
+    dshape = (204800, 2)
+    oshape = (1, 320, 640)
+
+    dtype = "float32"
+    x = sym.Variable("x", shape=dshape, dtype=dtype)
+    x = sym.reshape(x, shape=(1, 320, 640, 2))
+    x = sym.transpose(x, axes=(0, 3, 1, 2))
+    y = sym.argmax(x, axis=1)
+    target_str = "llvm"
+    target = tvm.target.create(target_str)
+    ctx = tvm.context(target_str, 0)
+    with nnvm.compiler.build_config(opt_level=2):
+        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
+    m = graph_runtime.create(graph, lib, ctx)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    m.run(x=data)
+    np_reshape = np.reshape(data, (1, 320, 640, 2))
+    np_transpose = np.transpose(np_reshape, axes=(0, 3, 1, 2))
+    np_argmax = np.argmax(np_transpose, axis=1)
+    out = m.get_output(0)
+    np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5)
 
 if __name__ == "__main__":
     test_reshape()
@@ -707,4 +729,5 @@ def test_where():
     test_nms()
     test_slice_like()
     test_where()
+    test_argmax()
     print(nnvm.compiler.engine.dump())

From 46d755a33983c7a007217df3ad196000e817a4bf Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Fri, 7 Dec 2018 00:16:35 +0800
Subject: [PATCH 470/529] [typo] fucn => func (#2240)

---
 nnvm/src/pass/infer_shape_type.cc   | 2 +-
 python/tvm/relay/ty.py              | 2 +-
 python/tvm/relay/ty.pyi             | 2 +-
 rust/src/runtime/c_runtime_api.rs   | 2 +-
 src/api/api_test.cc                 | 2 +-
 src/relay/backend/compile_engine.cc | 2 +-
 src/runtime/cuda/cuda_module.cc     | 2 +-
 src/runtime/metal/metal_module.mm   | 2 +-
 src/runtime/rocm/rocm_module.cc     | 2 +-
 src/runtime/vulkan/vulkan_module.cc | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc
index cc4916ce0b9f..0f322f12e9c4 100644
--- a/nnvm/src/pass/infer_shape_type.cc
+++ b/nnvm/src/pass/infer_shape_type.cc
@@ -215,7 +215,7 @@ NNVM_REGISTER_PASS(InferShape)
 .set_change_graph(false)
 .provide_graph_attr("shape");
 
-// inference fucntion for same type
+// inference function for same type
 inline bool SameType(const NodeAttrs& attrs,
                      std::vector<int> *iattr,
                      std::vector<int> *oattr) {
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
index 7ea63e6200bf..96dde5acb4df 100644
--- a/python/tvm/relay/ty.py
+++ b/python/tvm/relay/ty.py
@@ -141,7 +141,7 @@ class FuncType(Type):
 
     This is the type assigned to functions in Relay. They consist of
     a list of type parameters which enable the definition of generic
-    fucntions, a set of type constraints which we omit for the time
+    functions, a set of type constraints which we omit for the time
     being, a sequence of argument types, and a return type.
 
     We informally write them as:
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
index 933814853f3e..c4d5df7ac06c 100644
--- a/python/tvm/relay/ty.pyi
+++ b/python/tvm/relay/ty.pyi
@@ -121,7 +121,7 @@ class FuncType(Type):
 
     This is the type assigned to functions in Relay. They consist of
     a list of type parameters which enable the definition of generic
-    fucntions, a set of type constraints which we omit for the time
+    functions, a set of type constraints which we omit for the time
     being, a sequence of argument types, and a return type.
 
     We informally write them as:
diff --git a/rust/src/runtime/c_runtime_api.rs b/rust/src/runtime/c_runtime_api.rs
index 62cfa0d15451..6facf9ca274f 100644
--- a/rust/src/runtime/c_runtime_api.rs
+++ b/rust/src/runtime/c_runtime_api.rs
@@ -750,7 +750,7 @@ extern "C" {
   ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-  /// \brief Simple static initialization fucntion.
+  /// \brief Simple static initialization function.
   /// Run f once and set handle to be not null.
   /// This function is mainly used for test purpose.
   ///
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
index 181036acf82f..2c637a28f01a 100644
--- a/src/api/api_test.cc
+++ b/src/api/api_test.cc
@@ -49,7 +49,7 @@ TVM_REGISTER_API("_context_test")
     *ret = ctx;
   });
 
-// internal fucntion used for debug and testing purposes
+// internal function used for debug and testing purposes
 TVM_REGISTER_API("_ndarray_use_count")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     runtime::NDArray nd = args[0];
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 17a5b60b322e..b8938bd34804 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -221,7 +221,7 @@ class ScheduleGetter :
 
 class CompileEngineImpl : public CompileEngineNode {
  public:
-  // Lower the fucntion.
+  // Lower the function.
   CachedFunc Lower(const CCacheKey& key)  {
     return LowerInternal(key)->cached_func;
   }
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 4984517b16c6..f818a78345bb 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -141,7 +141,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class CUDAWrappedFunc {
  public:
   // initialize the CUDA function.
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index c538957ca561..cf470b6c8a34 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -163,7 +163,7 @@ void SaveToBinary(dmlc::Stream* stream) final {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class MetalWrappedFunc {
  public:
   // initialize the METAL function.
diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc
index da3b04f66c49..0607e9938225 100644
--- a/src/runtime/rocm/rocm_module.cc
+++ b/src/runtime/rocm/rocm_module.cc
@@ -123,7 +123,7 @@ class ROCMModuleNode : public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class ROCMWrappedFunc {
  public:
   // initialize the ROCM function.
diff --git a/src/runtime/vulkan/vulkan_module.cc b/src/runtime/vulkan/vulkan_module.cc
index 134c5fa45ba4..4afe8cc782ce 100644
--- a/src/runtime/vulkan/vulkan_module.cc
+++ b/src/runtime/vulkan/vulkan_module.cc
@@ -223,7 +223,7 @@ class VulkanModuleNode final :public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class VulkanWrappedFunc {
  public:
   // initialize the VULKAN function.

From 467f3c6fac6a544fad151f2c615885c3d44b6801 Mon Sep 17 00:00:00 2001
From: Zhebin Jin <zhebin.jzb@alibaba-inc.com>
Date: Fri, 7 Dec 2018 01:51:05 +0800
Subject: [PATCH 471/529] [FRONTEND][TENSORFLOW]Add Split and realdiv op
 support (#2123)

* Add Split and realdiv op support

* Fix the pad calculation in the case of dilated convolution
---
 nnvm/python/nnvm/frontend/tensorflow.py       | 50 ++++++++----
 .../frontend/tensorflow/test_forward.py       | 79 +++++++++++++++++++
 2 files changed, 114 insertions(+), 15 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 26e59dc7e830..c8db662152e9 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -215,7 +215,7 @@ def _impl(inputs, attr, params):
                 attr['channels'] = input_shape[3] * depth_mult
 
             if 'dilations' in attr:
-                attr['dilations'] = (attr['dilations'][0], attr['dilations'][1])
+                attr['dilations'] = (attr['dilations'][1], attr['dilations'][2])
             attr['strides'] = (attr['strides'][1], attr['strides'][2])
         elif attr['data_format'] == 'NCHW':
             depth_mult, _, kernel_h, kernel_w = weights_shape
@@ -252,8 +252,12 @@ def _impl(inputs, attr, params):
                 in_h = input_shape[2]
                 in_w = input_shape[3]
 
-            pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
+            dilation_h = attr['dilations'][0]
+            dilation_w = attr['dilations'][1]
+            dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+            dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+            pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
+            pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
 
             if attr['data_format'] == 'NHWC':
                 inputs[0] = _sym.pad(data=inputs[0],
@@ -783,6 +787,15 @@ def _impl(inputs, attr, params):
         )(inputs, attr)
     return _impl
 
+def _split():
+    def _impl(inputs, attr, params):
+        axis = params.pop(inputs[0].list_output_names()[0])
+        return AttrCvt(
+            op_name="split", ignores=['T'],
+            transforms={'num_split': 'indices_or_sections'},
+            extras={'axis': axis.asnumpy()[0]})(inputs[1], attr)
+    return _impl
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -813,6 +826,7 @@ def _impl(inputs, attr, params):
     'Add'                               : _elemwise('add'),
     'Sub'                               : _elemwise('sub'),
     'Mul'                               : _elemwise('mul'),
+    'RealDiv'                           : _elemwise('div'),
     'Maximum'                           : _elemwise('max'),
     'Minimum'                           : _elemwise('min'),
     'Sum'                               : _sum(),
@@ -849,6 +863,7 @@ def _impl(inputs, attr, params):
     'GreaterEqual'                      : _broadcast('greater_equal'),
     'Equal'                             : _broadcast('equal'),
     'NotEqual'                          : _broadcast('not_equal'),
+    'Split'                             : _split(),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
@@ -1144,21 +1159,26 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
                 # Pass the target layout
                 attr["_target_layout"] = layout
 
-                #ToDo: Some of the tensorflow operators internaly maintain
-                #execution layers and its output name will the layer number along with
-                #graph node name.eg: Node name:- 'Model/RNN/cell_0/RnnCell', but the
-                #output name will be 'Model/RNN/cell_0/RnnCell:0'. In this case,
-                #the digit has to be ignored.
-                if ":" in node.input[0]:
-                    in_name, _ = node.input[0].split(':')
-                    node.input[0] = in_name
-
                 # Fill shapes for all inputs in a list
                 inputs = []
                 for i in node.input:
-                    if i in self._nodes:
-                        inputs.append(self._nodes[i])
-                        input_shapes[self._nodes[i]] = self._output_shapes[i]
+                    #ToDo: Some of the tensorflow operators internaly maintain
+                    #execution layers and its output name will the layer number along with
+                    #graph node name.eg: Node name:- 'Model/RNN/cell_0/RnnCell', but the
+                    #output name will be 'Model/RNN/cell_0/RnnCell:0'. In this case,
+                    #the digit has to be ignored.
+                    tensor_name = i.split(':')
+                    node_name = tensor_name[0]
+                    if node_name in self._nodes:
+                        in_sym = self._nodes[node_name]
+                        if len(in_sym.list_output_names()) > 1:
+                            tensor_slot = int(tensor_name[1]) if len(tensor_name) > 1 else 0
+                            in_sym = in_sym[tensor_slot]
+                            input_shape = (self._output_shapes[node_name])[tensor_slot]
+                        else:
+                            input_shape = self._output_shapes[node_name][0]
+                        inputs.append(in_sym)
+                        input_shapes[in_sym] = [input_shape]
                 attr['_input_shapes'] = input_shapes
 
                 inputs = self._fix_extranodes(node.op, attr, inputs)
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index c98748c0fc03..219ceb5bd379 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -502,6 +502,83 @@ def test_forward_gather():
     _test_gather((4,3,5,6), (1,4), [[2,1,0,0]], 0, 'float32')
 
 
+#######################################################################
+# Split
+# -----
+
+def _test_split(in_shape, axis, num_split, dtype):
+    """ One iteration of a Split """
+
+    with tf.Graph().as_default():
+        in_data = tf.placeholder(dtype, in_shape, name="in_data")
+        tf.split(in_data, num_split, axis)
+        np_data = np.random.uniform(size=in_shape).astype(dtype)
+        compare_tf_with_tvm(np_data, 'in_data:0', 'split:0')
+
+def test_forward_split():
+    '''test split layer'''
+    # rank 1
+    _test_split((3,), 0, 1, 'float32')
+    _test_split((3,), 0, 3, 'float32')
+    _test_split((6,), 0, 3, 'float32')
+    # rank 2
+    _test_split((6, 2), 0, 3, 'float32')
+    _test_split((2, 6), 1, 3, 'float32')
+    # rank 3
+    _test_split((6, 2, 4), 0, 3, 'float32')
+    _test_split((2, 6, 4), 1, 3, 'float32')
+    _test_split((2, 4, 6), 2, 3, 'float32')
+    # rank 4
+    _test_split((6, 1, 3, 5), 0, 3, 'float32')
+    _test_split((1, 6, 3, 5), 1, 3, 'float32')
+    _test_split((1, 3, 6, 5), 2, 3, 'float32')
+    _test_split((1, 3, 5, 6), 3, 3, 'float32')
+    # split along negative axis
+    _test_split((6, 1, 3, 5), -4, 3, 'float32')
+    _test_split((1, 6, 3, 5), -3, 3, 'float32')
+    _test_split((1, 3, 6, 5), -2, 3, 'float32')
+    _test_split((1, 3, 5, 6), -1, 3, 'float32')
+
+
+#######################################################################
+# Split followed by concat
+# ------------------------
+
+def _test_split_concat(in_shape, axis, num_split, dtype):
+    """ One iteration of a split_concat pair"""
+
+    with tf.Graph().as_default():
+        in_data = tf.placeholder(dtype, in_shape, name="in_data")
+        splitted = tf.split(in_data, num_split, axis)
+        tf.concat(splitted, axis)
+        np_data = np.random.uniform(size=in_shape).astype(dtype)
+        compare_tf_with_tvm(np_data, 'in_data:0', 'concat:0')
+
+def test_forward_split_concat():
+    '''test split followed by concat layers'''
+    # rank 1
+    _test_split_concat((3,), 0, 1, 'float32')
+    _test_split_concat((3,), 0, 3, 'float32')
+    _test_split_concat((6,), 0, 3, 'float32')
+    # rank 2
+    _test_split_concat((6, 2), 0, 3, 'float32')
+    _test_split_concat((2, 6), 1, 3, 'float32')
+    # rank 3
+    _test_split_concat((6, 2, 4), 0, 3, 'float32')
+    _test_split_concat((2, 6, 4), 1, 3, 'float32')
+    _test_split_concat((2, 4, 6), 2, 3, 'float32')
+    # rank 4
+    _test_split((6, 1, 3, 5), 0, 3, 'float32')
+    _test_split((1, 6, 3, 5), 1, 3, 'float32')
+    _test_split((1, 3, 6, 5), 2, 3, 'float32')
+    _test_split((1, 3, 5, 6), 3, 3, 'float32')
+    # split along negative axis
+    _test_split((6, 1, 3, 5), -4, 3, 'float32')
+    _test_split((1, 6, 3, 5), -3, 3, 'float32')
+    _test_split((1, 3, 6, 5), -2, 3, 'float32')
+    _test_split((1, 3, 5, 6), -1, 3, 'float32')
+
+
 #######################################################################
 # Multi Input to graph
 # --------------------
@@ -1061,6 +1138,8 @@ def test_forward_rel_ops():
     test_forward_pad()
     test_forward_gather()
     test_forward_stridedslice()
+    test_forward_split()
+    test_forward_split_concat()
 
     # Activations
     test_forward_sigmoid()

From 0ae7aa554d31772dfe732222f16001538a92673e Mon Sep 17 00:00:00 2001
From: Chang Lan <changlan9@gmail.com>
Date: Thu, 6 Dec 2018 12:33:00 -0800
Subject: [PATCH 472/529] Use unsafe_get in nnvm (#2247)

---
 3rdparty/dmlc-core        | 2 +-
 nnvm/include/nnvm/base.h  | 3 +++
 nnvm/include/nnvm/graph.h | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 4d49691f1a9d..519d013a213c 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8
+Subproject commit 519d013a213c0c447a971f51219473ef564d2348
diff --git a/nnvm/include/nnvm/base.h b/nnvm/include/nnvm/base.h
index 449bd2f4626e..39ff70093bed 100644
--- a/nnvm/include/nnvm/base.h
+++ b/nnvm/include/nnvm/base.h
@@ -25,6 +25,9 @@ using dmlc::array_view;
 /*!\brief getter function of any type */
 using dmlc::get;
 
+/*!\brief "unsafe" getter function of any type */
+using dmlc::unsafe_get;
+
 }  // namespace nnvm
 
 // describe op registration point
diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h
index 3f8a2a3642b1..93612ccb9ece 100644
--- a/nnvm/include/nnvm/graph.h
+++ b/nnvm/include/nnvm/graph.h
@@ -229,7 +229,7 @@ inline const T& Graph::GetAttr(const std::string& attr_name) const {
   auto it = attrs.find(attr_name);
   CHECK(it != attrs.end())
       << "Cannot find attribute " << attr_name << " in the graph";
-  return nnvm::get<T>(*it->second);
+  return nnvm::unsafe_get<T>(*it->second);
 }
 
 inline bool Graph::HasAttr(const std::string& attr_name) const {

From 98a6160f3279c44b9624e26f654fc484a44672a8 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Fri, 7 Dec 2018 06:31:53 -0800
Subject: [PATCH 473/529] [COMMUNITY] @masahi -> Committer (#2252)

---
 CONTRIBUTORS.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 185bf329f6d7..828ebc6e8dde 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -14,13 +14,14 @@ We do encourage everyone to work anything they are interested in.
 - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
 - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
-- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
+- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
+- [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
 - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
 - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
-- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers

From 97c1606560280b9298a4d5a7649ff1f9d84158bb Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Fri, 7 Dec 2018 06:32:13 -0800
Subject: [PATCH 474/529] GetChar() in base64.h should return int, not char
 (#2255)

---
 src/common/base64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/base64.h b/src/common/base64.h
index 31b02d3ca2a3..3f530e10a7e5 100644
--- a/src/common/base64.h
+++ b/src/common/base64.h
@@ -58,10 +58,10 @@ class StreamBufferReader {
   /*!
    * \return allows quick read using get char
    */
-  char GetChar() {
+  int GetChar() {
     while (true) {
       if (read_ptr_ < read_len_) {
-        return buffer_[read_ptr_++];
+        return static_cast<int>(buffer_[read_ptr_++]);
       } else {
         read_len_ = stream_->Read(&buffer_[0], buffer_.length());
         if (read_len_ == 0) return EOF;

From 9d9c2831780d7e9d1c28dd0a0a09441fc46ab121 Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Fri, 7 Dec 2018 22:34:14 +0800
Subject: [PATCH 475/529] add c backend to CreateTarget (#2256)

---
 src/codegen/build_module.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 0659a07f2520..859fdb2bc86c 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -58,7 +58,7 @@ Target CreateTarget(const std::string& target_name,
   }
   t->device_type = kDLCPU;
   t->thread_warp_size = 1;
-  if (target_name == "llvm") {
+  if (target_name == "c" || target_name == "llvm") {
     t->keys_array.push_back(ir::StringImm::make("cpu"));
   } else if (target_name == "cuda" || target_name == "nvptx") {
     t->device_type = kDLGPU;

From f0b03833ac5d2bc89731e47c379804dc95e78258 Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Fri, 7 Dec 2018 17:43:28 +0300
Subject: [PATCH 476/529] Fix missing sigmoid intrinsic in C++ (#2231)

---
 python/tvm/intrin.py       |  3 ---
 src/codegen/intrin_rule.cc | 10 ++++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py
index 3207b6112b1d..cd9a108c546a 100644
--- a/python/tvm/intrin.py
+++ b/python/tvm/intrin.py
@@ -492,6 +492,3 @@ def _rule_float_direct(op):
 register_intrin_rule("opencl", "exp", _rule_float_direct, override=True)
 # default pattern for exp
 register_intrin_rule("default", "exp", _rule_float_suffix, override=True)
-
-# default pattern for sigmoid
-register_intrin_rule("default", "sigmoid", lambda op: 1.0 / (1.0 + exp(-op.args[0])))
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc
index 822d515fb8a5..f326fceb6ee8 100644
--- a/src/codegen/intrin_rule.cc
+++ b/src/codegen/intrin_rule.cc
@@ -24,6 +24,16 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sqrt")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.pow")
 .set_body(DispatchExtern<FloatSuffix>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sigmoid")
+.set_body([](const TVMArgs& args, TVMRetValue* rv){
+    Expr e = args[0];
+    const Call* call = e.as<Call>();
+    CHECK(call != nullptr);
+
+    auto one = make_const(call->args[0].type(), 1);
+    *rv = one / (one + exp(-call->args[0]));
+  });
+
 }  // namespace intrin
 }  // namespace codegen
 }  // namespace tvm

From 3fd0ce497e470b6a5c06d4542daf2c2da755be18 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Fri, 7 Dec 2018 20:32:39 -0800
Subject: [PATCH 477/529] allows constant param in op construct (#2257)

---
 python/tvm/hybrid/parser.py                 | 45 ++++++++++++---------
 python/tvm/hybrid/util.py                   | 30 +++++++-------
 python/tvm/hybrid/var_decl.py               | 11 ++++-
 tests/python/unittest/test_hybrid_script.py | 33 +++++++++++++--
 4 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index a16f5abd4349..ee550ab623cb 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -144,14 +144,14 @@ def visit_Expr(self, node):
 
     def visit_Name(self, node):
         _id = node.id
-        if _id in self._args.keys() and isinstance(self._args[_id], _expr.Var):
+        if _id in self._args.keys() and isinstance(self._args[_id], (_expr.Var, _expr.ConstExpr)):
             return self._args[_id]
         elif _id in self.loops_above.keys():
             return self.loops_above[_id]
         _internal_assert(_id not in self._args.keys(), \
-                "This id %s should be handled in visit_Subscript!" % _id)
+                         "This id %s should be handled in visit_Subscript!" % _id)
         _internal_assert(_id in self.usage.keys(), \
-                "This id %s is expected to be a defined variable!" % _id)
+                         "This id %s is expected to be a defined variable!" % _id)
         # Buffer
         if _id in self.alloc_buffers.keys():
             _buf, _ = self.alloc_buffers[_id]
@@ -166,6 +166,15 @@ def visit_Num(self, node):
         return _api.const(node.n)
 
 
+    def visit_AugAssign(self, node):
+        lhs = self.visit(node.target)
+        rhs = self.visit(node.value)
+        rhs = HybridParser._binop_maker[type(node.op)](lhs, rhs)
+        _internal_assert(isinstance(lhs, _expr.Call), \
+                         "The LHS of an AugAssign is supposed to be a call!")
+        return _make.Provide(lhs.func, 0, rhs, lhs.args)
+
+
     def visit_Assign(self, node):
         _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
         lhs = node.targets[0]
@@ -177,7 +186,7 @@ def visit_Assign(self, node):
             lhs_ = lhs
             lhs = lhs.id
             _internal_assert(lhs not in self.loops_above.keys(), \
-                    "Loop variable cannot be overwritten!")
+                             "Loop variable cannot be overwritten!")
             decl, _, rw = self.usage[lhs]
             if decl == lhs_:
                 _internal_assert(lhs not in self.var_consts.keys(), \
@@ -227,16 +236,16 @@ def visit_Subscript(self, node):
             return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, 0)
 
         _internal_assert(isinstance(node.value, ast.Attribute), \
-                "Only variable and attribute's subscript supported so far")
+                         "Only variable and attribute's subscript supported so far")
         _internal_assert(isinstance(node.value.value, ast.Name), \
-            "The root of array access is expect to be a id!")
+                         "The root of array access is expect to be a id!")
         _internal_assert(node.value.attr == "shape", \
-            "Attribute access so far only 'shape' is supported!")
+                         "Attribute access so far only 'shape' is supported!")
         _internal_assert(len(args) == 1, "For 'shape' access the argument should be only one!")
         args = args[0]
         #TODO: maybe support non-constant value later?
         _internal_assert(isinstance(args, (_expr.IntImm, _expr.UIntImm)), \
-            "So far only constant shape access supported!")
+                         "So far only constant shape access supported!")
         buf = self._get_buffer_from_id(node.value.value.id)
         return buf.shape[args.value]
 
@@ -294,7 +303,7 @@ def visit_BinOp(self, node):
     def visit_Call(self, node):
         # Yet, no function pointer supported
         _internal_assert(isinstance(node.func, ast.Name), \
-            "Only id-function function call is supported so far!")
+                         "Only id-function function call is supported so far!")
         func_id = node.func.id
         n = len(node.args)
         if func_id in LOOP_INTRIN.keys() and func_id != 'bind':
@@ -311,7 +320,7 @@ def visit_Call(self, node):
         elif func_id == 'bind':
             _internal_assert(n == 2, "A loop bind should only have 2 arguments!")
             _internal_assert(isinstance(node.args[0], ast.Str), \
-                "A loop bind's first argument should be a string!")
+                             "A loop bind's first argument should be a string!")
             _vn = node.args[0].s
             iter_var = thread_axis(node.args[0].s)
             low, ext = _api.const(0, dtype='int32'), self.visit(node.args[1])
@@ -321,11 +330,11 @@ def visit_Call(self, node):
             return getattr(intrin, func_id)(*[self.visit(arg) for arg in node.args])
         elif func_id in ['allocate', 'output_tensor']:
             _internal_assert(isinstance(node.args[0], ast.Tuple), \
-                "allocate's first argument should be a tuple of shape!")
+                             "allocate's first argument should be a tuple of shape!")
             shape = tuple(self.visit(i) for i in node.args[0].elts)
             if func_id == 'output_tensor':
                 _internal_assert(not self.loops_above, \
-                        "Are you sure to allocate a output buffer multiple times?")
+                                 "Are you sure to allocate a output buffer multiple times?")
             for i in shape:
                 _internal_assert(isinstance(i, _expr.Expr), "The shape should be an expression")
             if n > 1:
@@ -333,18 +342,18 @@ def visit_Call(self, node):
                     dtype = node.args[1].s
                 else:
                     _internal_assert(isinstance(node.args[1], ast.Attribute), \
-                            "Unable to evaluate to get data type")
+                                     "Unable to evaluate to get data type")
                     to_eval = node.args[1]
                     _internal_assert(isinstance(to_eval.value, ast.Name), \
-                            "Unable to evaluate the attribute to get data type")
+                                     "Unable to evaluate the attribute to get data type")
                     _internal_assert(to_eval.attr == 'dtype', \
-                            "Only dtype attribute is supported so far")
+                                     "Only dtype attribute is supported so far")
                     dtype = self._get_buffer_from_id(to_eval.value.id).dtype
             else:
                 dtype = 'float32'
             if n > 2:
                 _internal_assert(isinstance(node.args[2], ast.Str), \
-                        "The data scope should be an string")
+                                 "The data scope should be an string")
                 _internal_assert(func_id != 'output_tensor', "Output tensor cannot specify scope")
                 scope = node.args[2].s
             else:
@@ -361,7 +370,7 @@ def visit_Call(self, node):
     def visit_For(self, node):
         iter_var, low, ext, for_type = self.visit(node.iter)
         _internal_assert(isinstance(node.target, ast.Name), \
-                "The loop iterator should be a variable!")
+                         "The loop iterator should be a variable!")
         _name = node.target.id
         if iter_var is None:
             _internal_assert(for_type is not None, "The loop bind function parse error!")
@@ -389,7 +398,7 @@ def visit_Return(self, node):
             ids.append(node.value.id)
         else:
             _internal_assert(isinstance(node.value, ast.Tuple), \
-                    "You should return either a single tensor or a tuple")
+                             "You should return either a single tensor or a tuple")
             for i in node.value.elts:
                 _internal_assert(isinstance(i, ast.Name), "What do you return?")
                 ids.append(i.id)
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index e38f466381ff..78106838f13e 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -15,9 +15,14 @@
 
 #pylint: disable=invalid-name
 np_arg_types = tuple(list(numeric_types) + [numpy.ndarray])
-tvm_arg_types = (Tensor, _expr.Var)
+tvm_arg_types = (Tensor, _expr.Var, _expr.ConstExpr)
 halide_imm_types = (_expr.IntImm, _expr.FloatImm, _expr.UIntImm)
 
+def _internal_assert(cond, err):
+    """Simplify the code segment like if not XXX then raise an error"""
+    if not cond:
+        raise ValueError(err)
+
 
 # Useful constants. In avoid of runtime dependences, we use function calls to return them.
 def make_nop():
@@ -50,14 +55,16 @@ def _is_tvm_arg_types(args):
     If neither is true, raise a value error."""
     if isinstance(args[0], tvm_arg_types):
         for elem in args[1:]:
-            if not isinstance(elem, tvm_arg_types):
-                raise ValueError("Expect a Var or Tensor instance but % get!" % str(type(elem)))
+            _internal_assert(isinstance(elem, tvm_arg_types),
+                             "Expecting a Var, Tensor or ConstExpr instance but %s get!" \
+                             % str(type(elem)))
         return True
-    if not isinstance(args[0], np_arg_types):
-        raise ValueError("Expect a numpy type but % get!" % str(type(args[0])))
+
+    _internal_assert(isinstance(args[0], np_arg_types), \
+                     "Expect a numpy type but %s get!" % str(type(args[0])))
     for elem in args[1:]:
-        if not isinstance(elem, np_arg_types):
-            raise ValueError("Expect a numpy type but % get!" % str(type(elem)))
+        _internal_assert(isinstance(elem, np_arg_types), \
+                         "Expect a numpy type but %s get!" % str(type(elem)))
     return False
 
 
@@ -79,12 +86,3 @@ def _restore_runtime(func, intersect):
         _globals.pop(elem)
     for k, v in intersect:
         _globals[k] = v
-
-def _internal_assert(cond, err):
-    """Simplify the code segment like if not XXX then raise an error"""
-    if not cond:
-        raise ValueError(err)
-
-# Almost the same functionality as the one above, but in this case,
-# the error is caused by users inproper usage.
-_user_assert = _internal_assert
diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/var_decl.py
index 586ef95461ea..27df87874377 100644
--- a/python/tvm/hybrid/var_decl.py
+++ b/python/tvm/hybrid/var_decl.py
@@ -15,6 +15,7 @@ def __init__(self, args):
         self.scope_level = []
         self._args = {}
         self.args = args
+        self.aug_assign_ = False
 
 
     def visit_FunctionDef(self, node):
@@ -48,6 +49,12 @@ def visit_Call(self, node):
             self.visit(elem)
 
 
+    def visit_AugAssign(self, node):
+        self.aug_assign_ = True
+        self.generic_visit(node)
+        self.aug_assign_ = False
+
+
     def visit_Name(self, node):
         # If it is from the argument list or loop variable, we do not worry about it!
         if node.id in self._args.keys():
@@ -61,7 +68,9 @@ def visit_Name(self, node):
 
         if node.id not in self.status.keys():
             _internal_assert(isinstance(node.ctx, ast.Store), \
-                    'Undeclared variable %s' % node.id)
+                             'Undeclared variable %s' % node.id)
+            if self.aug_assign_:
+                raise ValueError('"First store" cannot be an AugAssign')
             self.status[node.id] = (node, self.scope_level[-1], set())
         else:
             decl, loop, usage = self.status[node.id]
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 9156e40f949f..3304039d7400 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -115,7 +115,7 @@ def fanout(n, a):
         for i in range(a.shape[0] - 3):
             sigma = 0.0
             for j in range(3):
-                sigma = sigma + a[i + j]
+                sigma += a[i + j]
             sigma = sigma / three
             b[i] = sigma
         return b
@@ -246,7 +246,7 @@ def test_bind():
     def vec_add(a, b):
         c = output_tensor((1000, ), dtype='float32')
         for tx in bind('threadIdx.x', 1000):
-            c[tx] = b[tx] + c[tx]
+            c[tx] = a[tx] + b[tx]
         return c
 
     a = tvm.placeholder((1000, ), dtype='float32', name='a')
@@ -308,7 +308,7 @@ def blur(a):
                 s = 0.0
                 for di in range(3):
                     for dj in range(3):
-                        s = s + a[i-di, j-dj]
+                        s += a[i-di, j-dj]
                 b[i-2, j-2] = s / 9.0
         return b
 
@@ -419,6 +419,32 @@ def downstream(a):
     module(tvm_a, tvm_c)
     tvm.testing.assert_allclose(tvm_c.asnumpy(), ref, 1e-5, 1e-5)
 
+def test_const_param():
+    @tvm.hybrid.script
+    def add_something(a, b):
+        c = output_tensor((11, ), 'int32')
+        for i in range(11):
+            c[i] = a[i] + b
+        return c
+
+    a = tvm.placeholder((11, ), dtype='int32', name='a')
+    b = tvm.const(11, 'int32')
+    c = add_something(a, b)
+    sch = tvm.create_schedule(c.op)
+    module = tvm.build(sch, [a, c], 'llvm')
+    assert(module)
+
+    np_a = numpy.arange(11).astype('int32')
+    np_b = 11
+    np_c = numpy.zeros((11, )).astype('int32')
+
+    nd_a = tvm.ndarray.array(np_a)
+    nd_c = tvm.ndarray.array(numpy.zeros((11, )).astype('int32'))
+    module(nd_a, nd_c)
+    ref = add_something(np_a, 11)
+
+    tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
+
 
 if __name__ == "__main__":
     test_outer_product()
@@ -432,5 +458,6 @@ def downstream(a):
     #test_inplace()
     test_upstream()
     test_downstream()
+    test_const_param()
 
 

From e8a56941f18f25041c1606e8a27baf2b21e43dfb Mon Sep 17 00:00:00 2001
From: Salem Derisavi <derisavi@users.noreply.github.com>
Date: Sat, 8 Dec 2018 11:44:17 -0500
Subject: [PATCH 478/529] Generate predicates for non-root iteration variables
 as well (#2258)

---
 src/schedule/message_passing.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc
index 6c185d6f8637..dff2895cd42d 100644
--- a/src/schedule/message_passing.cc
+++ b/src/schedule/message_passing.cc
@@ -475,16 +475,20 @@ std::vector<Expr> MakeBoundCheck(
     iset_dmap[kv.first->var.get()] = IntSet::range(kv.second);
   }
 
-  for (IterVar iv : stage->op->root_iter_vars()) {
+  for (const IterVar& iv : stage->all_iter_vars) {
     if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
-    Range dom = dom_map.at(iv);
     if (bound_state.at(iv)) {
+      Range dom = dom_map.at(iv);
       Expr value = ComputeExpr<Sub>(value_map.at(iv), dom->min);
       Expr vmax = EvalSet(value, iset_dmap).max();
       if (vmax.type() != value.type() || !can_prove(vmax < dom->extent)) {
         preds.emplace_back(value < dom->extent);
       }
     }
+  }
+  for (const IterVar& iv : stage->op->root_iter_vars()) {
+    if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
+    Range dom = dom_map.at(iv);
     CHECK(iv->dom.defined());
     if (!skip_ivar_domain && !iv->dom.same_as(dom)) {
       Expr value = ComputeExpr<Sub>(value_map.at(iv), iv->dom->min);

From 033cd4723fae0e6fd2e4c1ebcc676dee597a43f5 Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Sat, 8 Dec 2018 13:07:02 -0800
Subject: [PATCH 479/529] [COMMUNITY] @ajtulloch -> Reviewer (#2236)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 828ebc6e8dde..788121ba82e1 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -55,6 +55,7 @@ We do encourage everyone to work anything they are interested in.
 - [Eddie Yan](https://github.com/eqy): @eqy
 - [Joshua Z. Zhang](https://github.com/zhreshold): @zhreshold
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy
+- [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)

From 130c7d52bbbb4f02be82baf9bde1e11da23273fd Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sun, 9 Dec 2018 11:16:00 +0530
Subject: [PATCH 480/529] [RUNTIME][GOLANG] TVM runtime for golang v0.1 (#1470)

---
 golang/Makefile                           |  64 +++
 golang/README.md                          | 107 ++++
 golang/sample/Makefile                    |  17 +
 golang/sample/complex.go                  | 171 +++++++
 golang/sample/deploy.py                   |  40 ++
 golang/sample/pack_func_closure_arg.go    |  57 +++
 golang/sample/pack_func_closure_return.go |  57 +++
 golang/sample/pack_func_convert.go        |  44 ++
 golang/sample/pack_func_handle_arg.go     |  60 +++
 golang/sample/pack_func_register.go       |  63 +++
 golang/sample/simple.go                   |  72 +++
 golang/src/array_test.go                  | 596 ++++++++++++++++++++++
 golang/src/bytearray.go                   |  72 +++
 golang/src/bytearray_test.go              |  32 ++
 golang/src/context.go                     |  89 ++++
 golang/src/error.go                       |  31 ++
 golang/src/error_test.go                  |  28 +
 golang/src/function.go                    | 365 +++++++++++++
 golang/src/function_test.go               | 331 ++++++++++++
 golang/src/gotvm.cc                       | 195 +++++++
 golang/src/gotvm.go                       |  24 +
 golang/src/gotvm.h                        |  42 ++
 golang/src/gotvm_test.go                  |  30 ++
 golang/src/module.go                      | 121 +++++
 golang/src/module_test.go                 |  93 ++++
 golang/src/ndarray.go                     | 329 ++++++++++++
 golang/src/tvm_runtime_pack.cc            |  49 ++
 golang/src/type.go                        |  72 +++
 golang/src/util.go                        |  24 +
 golang/src/value.go                       | 360 +++++++++++++
 golang/src/value_test.go                  | 237 +++++++++
 31 files changed, 3872 insertions(+)
 create mode 100644 golang/Makefile
 create mode 100644 golang/README.md
 create mode 100644 golang/sample/Makefile
 create mode 100644 golang/sample/complex.go
 create mode 100644 golang/sample/deploy.py
 create mode 100644 golang/sample/pack_func_closure_arg.go
 create mode 100644 golang/sample/pack_func_closure_return.go
 create mode 100644 golang/sample/pack_func_convert.go
 create mode 100644 golang/sample/pack_func_handle_arg.go
 create mode 100644 golang/sample/pack_func_register.go
 create mode 100644 golang/sample/simple.go
 create mode 100644 golang/src/array_test.go
 create mode 100644 golang/src/bytearray.go
 create mode 100644 golang/src/bytearray_test.go
 create mode 100644 golang/src/context.go
 create mode 100644 golang/src/error.go
 create mode 100644 golang/src/error_test.go
 create mode 100644 golang/src/function.go
 create mode 100644 golang/src/function_test.go
 create mode 100644 golang/src/gotvm.cc
 create mode 100644 golang/src/gotvm.go
 create mode 100644 golang/src/gotvm.h
 create mode 100644 golang/src/gotvm_test.go
 create mode 100644 golang/src/module.go
 create mode 100644 golang/src/module_test.go
 create mode 100644 golang/src/ndarray.go
 create mode 100644 golang/src/tvm_runtime_pack.cc
 create mode 100644 golang/src/type.go
 create mode 100644 golang/src/util.go
 create mode 100644 golang/src/value.go
 create mode 100644 golang/src/value_test.go

diff --git a/golang/Makefile b/golang/Makefile
new file mode 100644
index 000000000000..54019740c87a
--- /dev/null
+++ b/golang/Makefile
@@ -0,0 +1,64 @@
+.PHONY: clean all
+
+TVM_BASE   = $(CURDIR)/../
+TARGET     = gotvm
+LIBS       = -lm -ldl
+NATIVE_SRC = tvm_runtime_pack.cc
+
+GOPATH=$(CURDIR)/gopath
+GOPATHDIR=${GOPATH}/src/${TARGET}/
+CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/"
+CGO_CXXFLAGS="-std=c++11"
+CGO_CFLAGS="-I${TVM_BASE}"
+CGO_LDFLAGS="-ldl -lm"
+
+all:
+	@mkdir gopath 2>/dev/null || true
+	@mkdir gopath/src 2>/dev/null || true
+	@mkdir gopath/src/$(TARGET) 2>/dev/null || true
+	@cp src/$(TARGET).cc gopath/src/$(TARGET)
+	@cp src/$(TARGET).h gopath/src/$(TARGET)
+	@cp src/$(NATIVE_SRC) gopath/src/$(TARGET)
+	@cp src/*.go gopath/src/$(TARGET)
+	@export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CXXFLAGS=$(CGO_CXXFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) && go clean -cache \
+	&& golint && go build -o $(TARGET).a \
+	&& go install)
+	@find . -name gotvm.a
+	@#mkdir gopath/doc 2>/dev/null || true
+	@#godoc -html -goroot gopath/ gotvm | grep -v "for documentation on the gotvm command" > gopath/doc/gotvm.html
+	@#echo "Run 'godoc -http=:6060  -goroot=./gopath' for documentation"
+
+samples: all
+	cp gopath/pkg/linux_amd64/gotvm.a sample/ -rfa
+	make -C sample
+
+tests: all
+	@(cd sample; python3 deploy.py)
+	@export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CXXFLAGS=$(CGO_CXXFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) \
+	&& cp ../../../sample/deploy.so . \
+	&& go test -v)
+
+clean:
+	@if [ -d $(GOPATHDIR) ] ; then \
+	export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) && go clean -cache); fi
+	@rm -rf gopath
+	@make -C sample clean
+
+lint:
+	@(cd src; golint)
+	@python3 ${TVM_BASE}/dmlc-core/scripts/lint.py gotvm cpp src/*.cc
+	@python3 ${TVM_BASE}/dmlc-core/scripts/lint.py gotvm cpp src/*.h
diff --git a/golang/README.md b/golang/README.md
new file mode 100644
index 000000000000..9c152dd7365c
--- /dev/null
+++ b/golang/README.md
@@ -0,0 +1,107 @@
+# gotvm - Golang Frontend for TVM Runtime
+
+This folder contain golang interface for TVM runtime. It brings TVM runtime to Golang.
+
+- It enable c runtime api of tvm exposed to golang.
+- It enables module loading (lib, graph and params) and inference operations.
+
+## Installation
+
+### Requirements
+
+- go compiler (https://golang.org/) version 0.10 or above.
+
+### Modules
+
+- src
+  Module that generates golang package corresponding to the c runtime api exposed from tvm source tree.
+  This process build golang package _gotvm.a_
+
+- samples
+  Sample golang reference application to inference through gotvm package.
+
+### Build
+
+Once the Requirements are installed
+
+To build _gotvm_ package
+
+```bash
+make
+```
+
+To build and run internal tests
+
+```bash
+make tests
+```
+
+To build sample apps.
+
+```bash
+make samples
+```
+
+## Run
+
+To Demonstrates sample TVM module compilation using python and deploy via golang.
+```bash
+./simple
+``` 
+
+To deploy a realtime module with lib, graph and param.
+```bash
+./complex
+```
+
+To demonstrate go function closure conversion to packed function handle.
+
+```bash
+./pack_func_convert
+```
+
+To demonstrate a packed function handle given as an argument.
+
+```bash
+pack_func_handle_arg
+```
+
+To register go function with runtime as a global function.
+
+```bash
+pack_func_register
+```
+
+To demonstrate function closure passed as argument to a function call.
+
+```bash
+./pack_func_closure_arg
+```
+
+To demonstrate function closure returned from a packed function.
+
+```bash
+./pack_func_closure_return
+```
+
+## Documentation
+gotvm.go is documented with sufficient information about gotvm package.
+A html version documentation can be accessed by running below command after building runtime.
+
+```bash
+godoc -http=:6060  -goroot=./gopath
+```
+After above command try http://127.0.0.1:6060 from any browser.
+
+Also please refer to the sample applications under sample folder.
+
+## Docker
+Docker setup may need below additions for dependencies and environment preparation.
+
+Please refer ```docker/install/ubuntu_install_golang.sh``` for the packages dependencies.
+
+go compiler 1.10 on ubuntu doesn't install on standard path, hence an explicit export may be needed as shown below.
+
+```bash
+export PATH="/usr/lib/go-1.10/bin:$PATH"```
+```
diff --git a/golang/sample/Makefile b/golang/sample/Makefile
new file mode 100644
index 000000000000..8ebea49da42f
--- /dev/null
+++ b/golang/sample/Makefile
@@ -0,0 +1,17 @@
+.PHONY: clean all
+
+SOURCES=$(wildcard *.go)
+EXECUTABLE=$(patsubst %.go, %, $(SOURCES))
+
+all: $(EXECUTABLE)
+	@golint
+	@python3 deploy.py
+
+%: %.o
+	@go tool link -linkmode external -extld "g++" -extldflags "-ldl" -o $@ $<
+
+%.o: %.go
+	@go tool compile -pack -o $@ $<
+
+clean:
+	@rm -f $(EXECUTABLE) *.so *.o *.a
diff --git a/golang/sample/complex.go b/golang/sample/complex.go
new file mode 100644
index 000000000000..7a8d0044375c
--- /dev/null
+++ b/golang/sample/complex.go
@@ -0,0 +1,171 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application deployment over tvm.
+ * \file complex.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "io/ioutil"
+    "math/rand"
+    "./gotvm"
+    "runtime"
+)
+
+// NNVM compiled model paths.
+const (
+    modLib    = "./mobilenet.so"
+    modJSON   = "./mobilenet.json"
+    modParams = "./mobilenet.params"
+)
+
+// main
+func main() {
+    defer runtime.GC()
+    // Welcome
+    fmt.Printf("TVM Version   : v%v\n", gotvm.TVMVersion)
+    fmt.Printf("DLPACK Version: v%v\n\n", gotvm.DLPackVersion)
+
+    // Query global functions available
+    funcNames, err := gotvm.FuncListGlobalNames()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Global Functions:%v\n", funcNames)
+
+    // Import tvm module (so)
+    modp, err := gotvm.LoadModuleFromFile(modLib)
+    if err != nil {
+        fmt.Print(err)
+        fmt.Printf("Please copy tvm compiled modules here and update the sample.go accordingly.\n")
+        fmt.Printf("You may need to update modLib, modJSON, modParams, tshapeIn, tshapeOut\n")
+        return
+    }
+    fmt.Printf("Module Imported:%p\n", modp)
+    bytes, err := ioutil.ReadFile(modJSON)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    jsonStr := string(bytes)
+
+    // Load module on tvm runtime - call tvm.graph_runtime.create
+    funp, err := gotvm.GetGlobalFunction("tvm.graph_runtime.create")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Calling tvm.graph_runtime.create\n")
+    // Call function
+    graphrt, err := funp.Invoke(jsonStr, modp, (int64)(gotvm.KDLCPU), (int64)(0))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    graphmod := graphrt.AsModule()
+    fmt.Printf("Graph runtime Created\n")
+
+    // Array allocation attributes
+    tshapeIn  := []int64{1, 224, 224, 3}
+    tshapeOut := []int64{1, 1001}
+
+    // Allocate input Array
+    inX, err := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Allocate output Array
+    out, err := gotvm.Empty(tshapeOut)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Input and Output Arrays allocated\n")
+
+    // Get module function from graph runtime : load_params
+    // Read params
+    bytes, err = ioutil.ReadFile(modParams)
+    if err != nil {
+        fmt.Print(err)
+    }
+
+    // Load Params
+    funp, err = graphmod.GetFunction("load_params")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Func load_params:%p\n", funp)
+
+    // Call function
+    _, err = funp.Invoke(bytes)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Module params loaded\n")
+
+    // Set some data in input Array
+    inSlice := make([]float32, (244 * 244 * 3))
+    rand.Seed(10)
+    rand.Shuffle(len(inSlice), func(i, j int) {inSlice[i],
+                                               inSlice[j] = rand.Float32(),
+                                               rand.Float32() })
+    inX.CopyFrom(inSlice)
+
+    // Set Input
+    funp, err = graphmod.GetFunction("set_input")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke("input", inX)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    fmt.Printf("Module input is set\n")
+
+    // Run
+    funp, err = graphmod.GetFunction("run")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Module Executed \n")
+
+    // Call runtime function get_output
+    funp, err = graphmod.GetFunction("get_output")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke(int64(0), out)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Got Module Output \n")
+
+    // Print results
+    outIntf, _ := out.AsSlice()
+    outSlice := outIntf.([]float32)
+    fmt.Printf("Result:%v\n", outSlice[:10])
+}
diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py
new file mode 100644
index 000000000000..065638299bc6
--- /dev/null
+++ b/golang/sample/deploy.py
@@ -0,0 +1,40 @@
+"""
+Get Started with TVM Go
+=======================
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+import numpy as np
+
+# Global declarations of environment.
+
+tgt_host="llvm"
+tgt="llvm"
+
+######################################################################
+# Describe the Computation
+# ------------------------
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+######################################################################
+# Schedule the Computation
+# ------------------------
+s = tvm.create_schedule(C.op)
+
+######################################################################
+# Compilation
+# -----------
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+######################################################################
+# Save Compiled Module
+# --------------------
+from tvm.contrib import cc
+from tvm.contrib import util
+
+fadd.save("deploy.o")
+cc.create_shared("deploy.so", ["deploy.o"])
diff --git a/golang/sample/pack_func_closure_arg.go b/golang/sample/pack_func_closure_arg.go
new file mode 100644
index 000000000000..b31113160586
--- /dev/null
+++ b/golang/sample/pack_func_closure_arg.go
@@ -0,0 +1,57 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate go-closure given to a packed function argument.
+ * \file pack_func_closure_arg.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+
+// sampleFunctionArg receives a Packed Function handle and calls it.
+func sampleFunctionArg(args ...*gotvm.Value) (retVal interface{}, err error) {
+    // Reveive Packed Function Handle
+    pfunc := args[0].AsFunction()
+    // Call Packed Function
+    retVal, err = pfunc.Invoke(args[1].AsInt64(), args[2].AsInt64())
+    return
+}
+
+// main
+func main() {
+    // Not passing a function name implicitely
+    // picks the name from reflection as "main.sampleDunctionArg"
+    gotvm.RegisterFunction(sampleFunctionArg);
+    fmt.Printf("Registered: sampleFunctionArg\n")
+
+    // Get registered global function.
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionArg")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("GetGlobalFunction: main.sampleFunctionArg - Success\n")
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*gotvm.Value) (retVal interface{}, err error) {
+        for _, v := range args {
+            fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+        }
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke(funccall, 30, 50)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Invoked sampleFunctionArg with function closure arg : Result:%v\n", result.AsInt64())
+}
diff --git a/golang/sample/pack_func_closure_return.go b/golang/sample/pack_func_closure_return.go
new file mode 100644
index 000000000000..98de8e2e5146
--- /dev/null
+++ b/golang/sample/pack_func_closure_return.go
@@ -0,0 +1,57 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate go-closure returned from a callback function.
+ * \file pack_func_closure_return.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+func sampleFunctionCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    funccall := func (cargs ...*gotvm.Value) (fret interface{}, ferr error) {
+        for _, v := range cargs {
+            fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+        }
+        val1 := cargs[0].AsInt64()
+        val2 := cargs[1].AsInt64()
+        fret = int64(val1+val2)
+        return
+    }
+    retVal = funccall
+    return
+}
+
+// main
+func main() {
+    // Not passing a function name implicitely
+    // picks the name from reflection as "main.sampleDunctionCb"
+    gotvm.RegisterFunction(sampleFunctionCb);
+    fmt.Printf("Registered: sampleFunctionCb\n")
+
+    // Get registered global function
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionCb")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("GetGlobalFunction: main.sampleFunctionCb - Success\n")
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Invoked main.sampleFunctionCb via Function handle\n")
+
+    pfunc := result.AsFunction()
+    fmt.Printf("Function Handle received via Packed Function call:%T - %v \n", pfunc, pfunc)
+
+    pfuncRet, err := pfunc.Invoke(30, 40)
+    fmt.Printf("Invoked closure inside sampleFunctionCb result:%v\n", pfuncRet.AsInt64())
+}
diff --git a/golang/sample/pack_func_convert.go b/golang/sample/pack_func_convert.go
new file mode 100644
index 000000000000..6748d67fe75f
--- /dev/null
+++ b/golang/sample/pack_func_convert.go
@@ -0,0 +1,44 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate function conversion to packed function.
+ * \file pack_func_convert.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// main
+func main() {
+    // Welcome
+
+    // Simple convert to a packed function
+    fhandle, err := gotvm.ConvertFunction(sampleCb)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Converted function\n")
+
+    retVal, err := fhandle.Invoke(10, 20)
+    fmt.Printf("Invoke Completed\n")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Result:%v\n", retVal.AsInt64())
+}
diff --git a/golang/sample/pack_func_handle_arg.go b/golang/sample/pack_func_handle_arg.go
new file mode 100644
index 000000000000..ad1313f93f5f
--- /dev/null
+++ b/golang/sample/pack_func_handle_arg.go
@@ -0,0 +1,60 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate converted packed
+ * function handle passed to another packed function.
+ * \file pack_func_handle_arg.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// sampleFunctionArg receives a Packed Function handle and calls it.
+func sampleFunctionArg(args ...*gotvm.Value) (retVal interface{}, err error) {
+    // Reveive Packed Function Handle
+    pfunc := args[0].AsFunction()
+
+    // Call Packed Function
+    retVal, err = pfunc.Invoke(args[1], args[2])
+    return
+}
+
+// main
+func main() {
+    // Simple convert to a packed function
+    fhandle, err := gotvm.ConvertFunction(sampleCb)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    gotvm.RegisterFunction(sampleFunctionArg);
+    fmt.Printf("Registered: sampleFunctionArg\n")
+
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionArg")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    retVal, err := funp.Invoke(fhandle, 10, 20)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Result:%v\n", retVal.AsInt64())
+}
diff --git a/golang/sample/pack_func_register.go b/golang/sample/pack_func_register.go
new file mode 100644
index 000000000000..5da67e00c16c
--- /dev/null
+++ b/golang/sample/pack_func_register.go
@@ -0,0 +1,63 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate function register into TVM global functions.
+ * \file pack_func_register.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+    "strings"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// main
+func main() {
+    // Register sampleCb with TVM packed function system and call and check Global Function List.
+    gotvm.RegisterFunction(sampleCb, "sampleCb");
+    // Query global functions available
+    funcNames, err := gotvm.FuncListGlobalNames()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    found := 0
+    for ii := range (funcNames) {
+        if strings.Compare(funcNames[ii], "sampleCb") == 0 {
+            found = 1
+        }
+    }
+    if found == 0 {
+        fmt.Printf("Function registerd but, not listed\n")
+        return
+    }
+
+
+    // Get "sampleCb" and verify the call.
+    funp, err := gotvm.GetGlobalFunction("sampleCb")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke((int64)(10), (int64)(20))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("sampleCb result: %v\n", result.AsInt64())
+}
diff --git a/golang/sample/simple.go b/golang/sample/simple.go
new file mode 100644
index 000000000000..ada3963662de
--- /dev/null
+++ b/golang/sample/simple.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application deployment over tvm.
+ * \file simple.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "runtime"
+    "./gotvm"
+    "math/rand"
+)
+
+// NNVM compiled model paths.
+const (
+    modLib    = "./deploy.so"
+)
+
+// main
+func main() {
+    // Welcome
+    defer runtime.GC()
+    fmt.Printf("TVM Version   : v%v\n", gotvm.TVMVersion)
+    fmt.Printf("DLPACK Version: v%v\n\n", gotvm.DLPackVersion)
+
+    // Import tvm module (so)
+    modp, _ := gotvm.LoadModuleFromFile(modLib)
+    fmt.Printf("Module Imported\n")
+
+
+    // Allocate Array for inputs and outputs.
+    // Allocation by explicit type and context.
+    tshapeIn  := []int64{4}
+    inX, _ := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0))
+
+    // Default allocation on CPU
+    inY, _ := gotvm.Empty(tshapeIn, "float32")
+
+    // Default allocation to type "float32" and on CPU
+    out, _ := gotvm.Empty(tshapeIn)
+    fmt.Printf("Input and Output Arrays allocated\n")
+
+    // Fill Input Data : inX , inY
+    inXSlice := make([]float32, 4)
+    inYSlice := make([]float32, 4)
+    for i := range inXSlice {
+        inXSlice[i] = rand.Float32()
+        inYSlice[i] = rand.Float32()
+    }
+
+
+    // Copy the data on target memory through runtime CopyFrom api.
+    inX.CopyFrom(inXSlice)
+    inY.CopyFrom(inYSlice)
+    fmt.Printf("X: %v\n", inXSlice)
+    fmt.Printf("Y: %v\n", inYSlice)
+
+    // Get function "myadd"
+    funp, _ := modp.GetFunction("myadd")
+
+    // Call function
+    funp.Invoke(inX, inY, out)
+    fmt.Printf("Module function myadd executed\n")
+
+    // Get the output tensor as an interface holding a slice through runtime CopyTo api.
+    outSlice, _ := out.AsSlice()
+
+    // Print results
+    fmt.Printf("Result:%v\n", outSlice.([]float32))
+}
diff --git a/golang/src/array_test.go b/golang/src/array_test.go
new file mode 100644
index 000000000000..6917dd14e373
--- /dev/null
+++ b/golang/src/array_test.go
@@ -0,0 +1,596 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file array_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "unsafe"
+    "math/rand"
+)
+
+// Create an array and check size.
+func TestArrayCreateSize(t *testing.T) {
+    _, err := Empty([]int64{4})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{})
+    if err == nil {
+        t.Error("Expected err for empty Array created, but didn't got !!")
+        return
+    }
+}
+
+// Check array creation via various different arguments.
+func TestArrayCreateArgs(t *testing.T) {
+    _, err := Empty([]int64{4, 2}, "float32", CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, CPU(0), "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
+
+// Create an array and check the NDim.
+func TestArrayNDim(t *testing.T) {
+    arr, err := Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if 3 != arr.GetNdim() {
+        t.Errorf("GetNdim failed Expected: 3 Got :%v\n", arr.GetNdim())
+        return
+    }
+}
+
+// Create an array and check Shape.
+func TestArrayShape(t *testing.T) {
+    arr, err := Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    shape := arr.GetShape()
+    if len(shape) != 3 {
+        t.Errorf("Shape slice expected: 3 Got :%v\n", len(shape))
+        return
+    }
+
+    if shape[0] != 4 || shape[1] != 5 || shape[2] != 6 {
+        t.Errorf("Shape values expected {4, 5, 6} Got : %v\n", shape);
+        return
+    }
+}
+
+// Create an array and check created Context.
+func TestArrayCtx(t *testing.T) {
+    // TODO: Could some test cases for other targets
+    arr, err := Empty([]int64{4}, CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ctx := arr.GetCtx()
+    if ctx.DeviceType != KDLCPU {
+        t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType)
+        return
+    }
+    if ctx.DeviceID != 0 {
+        t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID)
+        return
+    }
+
+    arr, err = Empty([]int64{4}, CPU(2))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ctx = arr.GetCtx()
+    if ctx.DeviceType != KDLCPU {
+        t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType)
+        return
+    }
+    if ctx.DeviceID != 2 {
+        t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID)
+        return
+    }
+}
+
+// Create array of different dtypes and check dtypes.
+func TestArrayDType(t *testing.T) {
+    for _, dtype := range  []string{"int8", "int16", "int32", "int64",
+                                    "uint8", "uint16", "uint32", "uint64",
+                                    "float32", "float64"} {
+        arr, err := Empty([]int64{4}, dtype)
+        if err != nil {
+            t.Error(err.Error())
+            return
+        }
+
+        if dtype != arr.GetDType() {
+            t.Errorf("Dtype expected: %v Got :%v\n", dtype, arr.GetDType())
+            return
+        }
+    }
+}
+
+// Copy Int8 data to created Array and verify.
+func TestArrayCopySliceInt8(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int8")
+
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen)
+    rand.Read(bdata)
+    data := (*[1<<31]int8)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int8:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+
+    dataRet := ret.([]int8)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int16 data to created Array and verify.
+func TestArrayCopySliceInt16(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int16")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*2)
+    rand.Read(bdata)
+    data := (*[1<<31]int16)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    switch ret.(type) {
+        case []int16:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+
+    dataRet := ret.([]int16)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int32 data to created Array and verify.
+func TestArrayCopySliceInt32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*4)
+    rand.Read(bdata)
+    data := (*[1<<31]int32)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]int32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int64 data to created Array and verify.
+func TestArrayCopySliceInt64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*8)
+    rand.Read(bdata)
+    data := (*[1<<31]int64)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]int64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt8 data to created Array and verify.
+func TestArrayCopySliceUInt8(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint8")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen)
+    rand.Read(bdata)
+    data := (*[1<<31]uint8)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint8:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint8)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt16 data to created Array and verify.
+func TestArrayCopySliceUInt16(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint16")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*2)
+    rand.Read(bdata)
+    data := (*[1<<31]uint16)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint16:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint16)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt32 data to created Array and verify.
+func TestArrayCopySliceUInt32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*4)
+    rand.Read(bdata)
+    data := (*[1<<31]uint32)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt64 data to created Array and verify.
+func TestArrayCopySliceUInt64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*8)
+    rand.Read(bdata)
+    data := (*[1<<31]uint64)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Float32 data to created Array and verify.
+func TestArrayCopySliceFloat32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    data := make([]float32, dlen)
+
+    for i := range data {
+        data[i] = rand.Float32()
+    }
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []float32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]float32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v \nGot :%v \n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Float64 data to created Array and verify.
+func TestArrayCopySliceFloat64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "float64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    data := make([]float64, dlen)
+
+    for i := range data {
+        data[i] = rand.Float64()
+    }
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []float64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]float64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
diff --git a/golang/src/bytearray.go b/golang/src/bytearray.go
new file mode 100644
index 000000000000..e40a630223dc
--- /dev/null
+++ b/golang/src/bytearray.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMByteArray interface.
+ * \file bytearray.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// ByteArray type wraps the TVMByteArray of C runtime API.
+// 
+// This can be used to hold raw data like params of a model.
+type ByteArray uintptr
+
+// nativeCPtr returns the type freed unitptr for ByteArray.
+func (tbytearray ByteArray) nativeCPtr() (retVal uintptr) {
+	retVal = (uintptr)(tbytearray)
+    return
+}
+
+// SetData is used to intialize ByteArray from a golang string object.
+//
+// This method initialize both data and data size of the underlaying object.
+// This function handles freeing old data object if any before allocating new.
+//
+// `val` is the golang string object from which the ByteArray is initialized.
+func (tbytearray ByteArray) setData(val string) {
+    bufPtr := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+    if bufPtr == (*_Ctype_char)(C.NULL) {
+        C.free(unsafe.Pointer(bufPtr))
+    }
+
+    ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data = C.CString(val)
+    ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).size = C.ulong(len(val))
+}
+
+// getData returns the golang byte slice corresponding to the ByteArray.
+func (tbytearray ByteArray) getData() (retVal []byte) {
+	val := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+	blen := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).size
+	retVal = C.GoBytes(unsafe.Pointer(val), C.int(blen))
+    return
+}
+
+// newByteArray initilizes the native TVMByteArray object with given byte slice
+//
+//`val` is the golang byte array used to initialize.
+//
+// returns newly created ByteArray.
+func newByteArray(val []byte) (retVal ByteArray) {
+    handle := ByteArray(C.malloc(C.sizeof_TVMByteArray))
+    ((*C.TVMByteArray)(unsafe.Pointer(handle))).data = (*_Ctype_char)(C.NULL)
+    ((*C.TVMByteArray)(unsafe.Pointer(handle))).size = 0
+    handle.setData(string(val))
+    retVal = handle
+    return
+}
+
+// deleteTVMByteArray releases the allocated native object of ByteArray.
+//
+// This delete handles freeing of underlaying native data object too.
+func (tbytearray ByteArray) deleteTVMByteArray() {
+    bufPtr := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+    C.free(unsafe.Pointer(bufPtr))
+	C.free(unsafe.Pointer(tbytearray.nativeCPtr()))
+}
diff --git a/golang/src/bytearray_test.go b/golang/src/bytearray_test.go
new file mode 100644
index 000000000000..f49e75ee2fa6
--- /dev/null
+++ b/golang/src/bytearray_test.go
@@ -0,0 +1,32 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file bytearray_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "math/rand"
+)
+
+// Check ByteArray creation from byte slice and verify the data.
+func TestByteArrayGet(t *testing.T) {
+    data := make([]byte, 1024)
+    rand.Read(data)
+
+    barr := newByteArray(data)
+    dataRet := barr.getData()
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v at : %v\n", data[i], dataRet[i], i)
+            return
+        }
+    }
+}
diff --git a/golang/src/context.go b/golang/src/context.go
new file mode 100644
index 000000000000..8a3b613ea6b9
--- /dev/null
+++ b/golang/src/context.go
@@ -0,0 +1,89 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMContext interface
+ * \file context.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+// KDLCPU is golang enum correspond to TVM device type kDLCPU.
+var KDLCPU                  = int32(C.kDLCPU)
+// KDLGPU is golang enum correspond to TVM device type kDLGPU.
+var KDLGPU                  = int32(C.kDLGPU)
+// KDLCPUPinned is golang enum correspond to TVM device type kDLCPUPinned.
+var KDLCPUPinned            = int32(C.kDLCPUPinned)
+// KDLOpenCL is golang enum correspond to TVM device type kDLOpenCL.
+var KDLOpenCL               = int32(C.kDLOpenCL)
+// KDLMetal is golang enum correspond to TVM device type kDLMetal.
+var KDLMetal                = int32(C.kDLMetal)
+// KDLVPI is golang enum correspond to TVM device type kDLVPI.
+var KDLVPI                  = int32(C.kDLVPI)
+// KDLROCM is golang enum correspond to TVM device type kDLROCM.
+var KDLROCM                 = int32(C.kDLROCM)
+// KDLSDAccel is golang enum correspond to TVM device type kDLSDAccel.
+var KDLSDAccel              = int32(C.kDLSDAccel)
+// KDLVulkan is golang enum correspond to TVM device type kDLVulkan.
+var KDLVulkan               = int32(C.kDLVulkan)
+// KOpenGL is golang enum correspond to TVM device type kOpenGL.
+var KOpenGL                 = int32(C.kOpenGL)
+// KExtDev is golang enum correspond to TVM device type kDLExtDev.
+var KExtDev                 = int32(C.kDLExtDev)
+
+// Context dtype corresponding to TVMContext aka DLContext
+type Context struct {
+    DeviceType int32
+    DeviceID    int32
+}
+
+// CPU returns the Context object for CPU target on given index
+func CPU(index int32) Context {
+    return Context{KDLCPU, index}
+}
+
+// GPU returns the Context object for GPU target on given index
+func GPU(index int32) Context {
+    return Context{KDLGPU, index}
+}
+
+// CPUPinned returns the Context object for CPUPinned target on given index
+func CPUPinned(index int32) Context {
+    return Context{KDLCPUPinned, index}
+}
+
+// OpenCL returns the Context object for OpenCL target on given index
+func OpenCL(index int32) Context {
+    return Context{KDLOpenCL, index}
+}
+
+// Metal returns the Context object for Metal target on given index
+func Metal(index int32) Context {
+    return Context{KDLMetal, index}
+}
+
+// VPI returns the Context object for VPI target on given index
+func VPI(index int32) Context {
+    return Context{KDLVPI, index}
+}
+
+// ROCM returns the Context object for ROCM target on given index
+func ROCM(index int32) Context {
+    return Context{KDLROCM, index}
+}
+
+// SDAccel returns the Context object for SDAccel target on given index
+func SDAccel(index int32) Context {
+    return Context{KDLSDAccel, index}
+}
+
+// Vulkan returns the Context object for Vulkan target on given index
+func Vulkan(index int32) Context {
+    return Context{KDLVulkan, index}
+}
+
+// OpenGL returns the Context object for OpenGL target on given index
+func OpenGL(index int32) Context {
+    return Context{KOpenGL, index}
+}
diff --git a/golang/src/error.go b/golang/src/error.go
new file mode 100644
index 000000000000..00a24652953c
--- /dev/null
+++ b/golang/src/error.go
@@ -0,0 +1,31 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for error related API interface.
+ * \file error.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// getTVMLastError returns the detailed error string for any api called in TVM runtime.
+//
+// This is useful when any api returns non zero value.
+//
+// Returns golang string for the corresponding native error message.
+func getTVMLastError() (retVal string) {
+    errStr := C.TVMGetLastError()
+    retVal = C.GoString(errStr)
+    return
+}
+
+func setTVMLastError(errStr string) {
+    cstr := C.CString(errStr)
+    C.TVMAPISetLastError(cstr)
+    C.free(unsafe.Pointer(cstr))
+}
diff --git a/golang/src/error_test.go b/golang/src/error_test.go
new file mode 100644
index 000000000000..2a8c345b424b
--- /dev/null
+++ b/golang/src/error_test.go
@@ -0,0 +1,28 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file error_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "strings"
+)
+
+// Check err receiving from TVM global function.
+func TestErrorTest(t *testing.T) {
+    _, err := LoadModuleFromFile("dummy.so")
+    if err == nil {
+        t.Error("Expected an error, but not received\n")
+        return
+    }
+
+    errStr := err.Error()
+    if !(strings.Contains(errStr, string("cannot open shared object"))) {
+        t.Error("Ah! TVM didn't report an error\n")
+    }
+}
+
diff --git a/golang/src/function.go b/golang/src/function.go
new file mode 100644
index 000000000000..fa1c53a5917f
--- /dev/null
+++ b/golang/src/function.go
@@ -0,0 +1,365 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMFunction interface.
+ * \file function.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+    "encoding/binary"
+    "errors"
+    "runtime"
+    "reflect"
+    "fmt"
+)
+
+// Function type in golang hold pointer for the TVMFunction handle.
+type Function uintptr
+
+// nativeCPtr returns type freed uintptr for the Function.
+func (tvmfunction Function) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(tvmfunction)
+    return
+}
+
+// Invoke calls the TVM packed function referred by the handle with given arguments.
+func (tvmfunction *Function) Invoke(args ...interface{}) (retVal *Value, err error) {
+    funccall := func (fargs ...interface{}) (*Value, error) {
+        return callNativeFunction(tvmfunction, fargs)
+    }
+    // Check is any args are contain any ValueArray
+    // Possible is it's a args forward from one packed function to another.
+    valueArrayFound := false
+    for ii := range args {
+        switch args[ii].(type) {
+            case []*Value:
+                valueArrayFound = true
+        }
+    }
+
+    if !valueArrayFound {
+        return funccall(args...)
+    }
+    if len(args) != 1 {
+        err = fmt.Errorf("Not supported if packed function args are a mix of []Value and other types")
+        return
+    }
+
+    valArray := args[0].([]*Value)
+    if len(valArray) > 0 {
+        newArgs := make([]interface{}, len(valArray))
+        for ii := range valArray {
+            newVal := newTVMValue()
+            newVal.moveFrom(valArray[ii])
+            newArgs[ii] = newVal
+        }
+
+        return funccall(newArgs...)
+    }
+    return funccall()
+}
+
+// FuncListGlobalNames is used to query global callable packed function names from TVM.
+//
+// returns slice of string holding function names and error if any.
+func FuncListGlobalNames() (retVal []string, err error) {
+    var str string
+    ret := (int32)(C._TVMFuncListGlobalNames(unsafe.Pointer((&str))))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    str = goStringFromNative(*(*string)(unsafe.Pointer(&str)))
+    bin := binary.LittleEndian
+    size := bin.Uint64([]byte(str[:8]))
+    str = str[8:]
+    retVal = make([]string, size)
+    for i := range retVal {
+        len := bin.Uint64([]byte(str[:8]))
+        str = str[8:]
+        retVal[i] = str[:len]
+        str = str[len:]
+    }
+    return
+}
+
+// GetGlobalFunction is to get handle to the given global function name.
+//
+// `funcname` is the name of global packed function.
+//
+// returns a function closure with signature
+//         func (args ...interface{}) (interface{}, error) and  error if any.
+//
+// The closure function can be used to call Function with arguments directly.
+//
+// Variadic arguments can be any type which can be embed into Value.
+func GetGlobalFunction(funcname string) (retVal *Function, err error) {
+    var funp uintptr
+
+    cfuncname := C.CString(funcname)
+    ret := (int32)(C.TVMFuncGetGlobal(cfuncname,
+                                      (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+    C.free(unsafe.Pointer(cfuncname))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// callNativeFunction is routine which calls gotvm native wrapper with given arguments.
+//
+// `handle` is the handle for Function.
+//
+// `args` are the variadic arguments to the Function.
+//
+// returns the interface for the return value from TVM if any and error if any.
+func callNativeFunction(handle *Function, args []interface{}) (retVal *Value, err error) {
+    argsIn := make([]*Value, len(args))
+    var typeCodes []int32
+    if len(args) != 0 {
+        typeCodes = make([]int32, len(args))
+    } else {
+        typeCodes = make([]int32, 1)
+    }
+
+    for ii := range args {
+        argsIn[ii] = newTVMValue()
+        if typeCodes[ii], err = argsIn[ii].setValue(args[ii]); err != nil {
+            return
+        }
+    }
+
+    retVal = newTVMValue()
+    argsOut := []*Value{retVal}
+    retTypeCode := KNull
+    err = nativeTVMFuncCall(handle, argsIn, typeCodes, argsOut, &retTypeCode)
+    if err != nil {
+        retVal = nil
+        return
+    }
+    retVal.isLocal = false
+    retVal.dtype = retTypeCode
+    return
+}
+
+// nativeTVMFuncFree free the function handle allocated in TVM runtime.
+//
+// `funp` is the Function handle to be freed.
+func nativeTVMFuncFree(funp *Function) (retVal int32) {
+    retVal = (int32) (C.TVMFuncFree(C.TVMFunctionHandle(funp.nativeCPtr())))
+    return
+}
+
+// nativeToGoSlice converts native TVMValue array to Golang slice of TVMValue
+//
+//
+func nativeToGoSlice(nargValues (*C.void), argValues []*Value, typeCodes []int32) {
+    for ii := range argValues {
+        C._TVMValueNativeGet(unsafe.Pointer(argValues[ii].nativeCPtr()),
+                             unsafe.Pointer(nargValues),
+                             C.int(int32(ii)))
+        argValues[ii].dtype = typeCodes[ii]
+    }
+}
+
+// nativeFromGoSlice converts golang slice of TVMValue to native TVMValue array.
+//
+//
+func nativeFromGoSlice(argValues []*Value) (nptr (*C.void)) {
+    nargValues := ((uintptr)(C.malloc(C.ulong(C.sizeof_TVMValue * len(argValues)))))
+    for ii := range argValues {
+        C._TVMValueNativeSet(unsafe.Pointer(nargValues),
+                             unsafe.Pointer(argValues[ii].nativeCPtr()),
+                             C.int(int32(ii)))
+    }
+    nptr = (*C.void)(unsafe.Pointer(nargValues))
+    return
+}
+
+// nativeTVMFuncCall executes the function with given arguments
+//
+// `funp` Function handle to the packed function.
+//
+// `argValues` is the slice of Value which are arguments to the packed function.
+//
+// `typeCodes` is the alice of argument type codes corresponding to argValues.
+//
+// `retValues` is return argument which is slice of return values from the packed function.
+//
+// `retTypeCode` is int32 holding type codes for retValue
+//
+// Returns err indicating native error if any.
+func nativeTVMFuncCall(funp *Function, argValues []*Value, typeCodes []int32,
+                 retValues []*Value, retTypeCode *int32) (err error) {
+    nargValues := nativeFromGoSlice(argValues)
+    nretValues := nativeFromGoSlice(retValues)
+	result := (int32)(C.TVMFuncCall(_Ctype_TVMFunctionHandle(*funp),
+                                    (*_Ctype_TVMValue)(unsafe.Pointer(nargValues)),
+                                    (*_Ctype_int)(unsafe.Pointer(&(typeCodes[0]))),
+                                    C.int(len(argValues)),
+                                    (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
+                                    (*_Ctype_int)(unsafe.Pointer(retTypeCode))))
+    nativeToGoSlice(nargValues, argValues, typeCodes)
+    nativeToGoSlice(nretValues, retValues, (*[1<<31] int32)(unsafe.Pointer(retTypeCode))[:1:1])
+    C.free(unsafe.Pointer(nargValues))
+    C.free(unsafe.Pointer(nretValues))
+
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+    return
+}
+
+// goCallBack is a structure holding the go callback function pointer.
+// This wrapping is necessary as cgo doesn't support
+// passing golang functions type conversion to native.
+type goCallBack struct {
+    cb func (args ...*Value) (interface{}, error)
+}
+
+//export goTVMCallback
+func goTVMCallback(args C.native_voidp, typeCodes C.native_voidp, numArgs int32,
+                   retArg C.native_voidp, resourceHandle C.native_voidp) (ret int32){
+    fcb := (*goCallBack)(resourceHandle)
+    // Make Value Sice from native TVMValue pointer.
+    argValues := make([]*Value, numArgs)
+
+    for ii := range argValues {
+        argValues[ii] = newTVMValue()
+        argValues[ii].isLocal = false
+    }
+
+    // Prepare arguments for golang callback function
+    nativeToGoSlice((*C.void)(unsafe.Pointer(args)), argValues,
+                    (*[1<<31] int32)(unsafe.Pointer(typeCodes))[:numArgs:numArgs])
+    cbargs := argValues
+
+    // Execute the callback
+    retVal, err := fcb.cb(cbargs...)
+    if err != nil {
+        errStr := err.Error()
+        setTVMLastError(errStr)
+        return -1
+    }
+
+    // It's possible a packed function directly return 
+    // the return value of another packed function.
+    //
+    // Inside a packed func :
+    //      ```return pfunc.Invoke(args)```
+    //
+    // In this case pfunc returns nil which is 
+    // returned as an interface holding nil *Value.
+    // Which becomes a valid retVal holding nil *Value.
+    isRetNull := false
+    switch retVal.(type) {
+        case *Value:
+            pRet := retVal.(*Value)
+            if pRet == nil {
+                isRetNull = true
+            }
+    }
+
+    // Handle return value from callback function
+    if retVal != nil && !isRetNull {
+        var retTypeCode int32
+        retValues := []*Value{newTVMValue()}
+
+        retTypeCode, err = retValues[0].setValue(retVal)
+        if err != nil {
+            errStr := err.Error()
+            setTVMLastError(errStr)
+            return -1
+        }
+        nretValues := nativeFromGoSlice(retValues)
+
+        // Handle KStr, KBytes: Local finalizers shouldn't try freeing them.
+        retValues[0].isLocal = false
+
+        apiRet := (int32) (C.TVMCFuncSetReturn(_Ctype_TVMRetValueHandle(retArg),
+                                               (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
+                                               (*_Ctype_int)(unsafe.Pointer(&retTypeCode)), 1))
+        C.free(unsafe.Pointer(nretValues))
+        if apiRet != 0 {
+            errStr := string("TVMCFuncSetReturn failed ")
+            setTVMLastError(errStr)
+        }
+    }
+    return
+}
+
+// ConvertFunction converts given golang function to TVM packed function.
+//
+// `args[0]` function pointer for a type ```func (args ...interface{}) (interface{})```
+//
+// Returns Function handle and err if any.
+func ConvertFunction(args ...interface{}) (retVal *Function, err error) {
+    function := args[0].(func (args ...*Value) (interface{}, error))
+    fcb := &goCallBack{cb:function}
+    var funp uintptr
+
+    result := (int32) (C._ConvertFunction(unsafe.Pointer(fcb),
+                                          unsafe.Pointer(&funp)))
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// RegisterFunction registers the golang func in TVM runtime global space.
+//
+// `args[0]` function pointer for a type ```func (args ...interface{}) (interface{})```
+//
+// `args[1]` Optional argument of function name with which it will be registered.
+//           If not passed we use function name from reflection.
+//
+// Returns err indicating native error if any.
+func RegisterFunction(args ...interface{}) (err error) {
+    fhandle, err := ConvertFunction(args...)
+    if err != nil {
+        return
+    }
+
+    funcname := runtime.FuncForPC(reflect.ValueOf(args[0]).Pointer()).Name()
+    if len(args) > 1 {
+        funcname = args[1].(string)
+    }
+
+    cfuncname := C.CString(funcname)
+    result := (int32) (C.TVMFuncRegisterGlobal(cfuncname,
+                                               _Ctype_TVMFunctionHandle(*fhandle),
+                                               0)); // Override = False
+    C.free(unsafe.Pointer(cfuncname))
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+    // Clear the finalizer as we don't need to control it anymore.
+    runtime.SetFinalizer(fhandle, nil)
+    return
+}
diff --git a/golang/src/function_test.go b/golang/src/function_test.go
new file mode 100644
index 000000000000..d53822837220
--- /dev/null
+++ b/golang/src/function_test.go
@@ -0,0 +1,331 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file function_test.go
+ */
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+    "math/rand"
+    "strings"
+    "fmt"
+)
+
+// Check global function list API
+func TestFunctionGlobals(t *testing.T) {
+    funcNames, err := FuncListGlobalNames()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if len(funcNames) < 1 {
+        t.Errorf("Global Function names received:%v\n", funcNames)
+    }
+}
+
+// Check GetFunction API
+func TestFunctionGlobalGet(t *testing.T) {
+    funp, err := GetGlobalFunction("tvm.graph_runtime.create")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(funp).Kind() != reflect.Ptr {
+        t.Error("Function type mis matched\n")
+        return
+    }
+}
+
+func TestFunctionModuleGet(t *testing.T) {
+    modp, err := LoadModuleFromFile("./deploy.so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    funp, err := modp.GetFunction("myadd")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(funp).Kind() != reflect.Ptr {
+        t.Error("Function type mis matched\n")
+        return
+    }
+
+    dlen := int64(1024)
+    shape := []int64{dlen}
+    inX, _ := Empty(shape)
+    inY, _ := Empty(shape)
+    out, _ := Empty(shape)
+    dataX := make([]float32, (dlen))
+    dataY := make([]float32, (dlen))
+    outExpected :=  make([]float32, (dlen))
+
+    for i := range dataX {
+        dataX[i] = rand.Float32()
+        dataY[i] = rand.Float32()
+        outExpected[i] = dataX[i] + dataY[i]
+    }
+
+    inX.CopyFrom(dataX)
+    inY.CopyFrom(dataY)
+
+    funp.Invoke(inX, inY, out)
+    outi, _ := out.AsSlice()
+    outSlice := outi.([]float32)
+    if len(outSlice) != len(outExpected) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(outExpected), len(outSlice))
+            return
+    }
+    for i := range outSlice {
+        if outExpected[i] != outSlice[i] {
+            t.Errorf("Data expected: %v Got :%v at index %v\n", outExpected[i], outSlice[i], i)
+            return
+        }
+    }
+}
+
+// Check FunctionConvert API
+func TestFunctionConvert(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    retVal, err := fhandle.Invoke(10, 20)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsInt64() != int64(30) {
+        t.Errorf("Expected result :30 got:%v\n", retVal.AsInt64())
+        return
+    }
+}
+
+func TestFunctionError(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        err = fmt.Errorf("Sample Error XYZABC");
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = fhandle.Invoke()
+    if err == nil {
+        t.Error("Expected error but didn't received\n")
+        return
+    }
+
+    if  !strings.Contains(err.Error(), string("Sample Error XYZABC")) {
+        t.Errorf("Expected Error should contain :\"Sample Error XYZABC\" got :%v\n", err.Error())
+    }
+}
+
+// Check FunctionRegister
+func TestFunctionRegister(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    RegisterFunction(sampleCb, "TestFunctionRegister.sampleCb");
+    // Query global functions available
+    funcNames, err := FuncListGlobalNames()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    found := 0
+    for ii := range (funcNames) {
+        if strings.Compare(funcNames[ii], "TestFunctionRegister.sampleCb") == 0 {
+            found = 1
+        }
+    }
+    if found == 0 {
+        t.Error("Registered function not found in global function list.")
+        return
+    }
+
+    // Get "sampleCb" and verify the call.
+    funp, err := GetGlobalFunction("TestFunctionRegister.sampleCb")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke((int64)(10), (int64)(20))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if result.AsInt64() != int64(30) {
+        t.Errorf("Expected result :30 got:%v\n", result.AsInt64())
+        return
+    }
+}
+
+// Check packed function receiving go-closure as argument.
+func TestFunctionClosureArg(t *testing.T) {
+    // sampleFunctionArg receives a Packed Function handle and calls it.
+    sampleFunctionArg := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+
+        // Call Packed Function by Value
+        ret, err := pfunc.Invoke(args[1], args[2])
+        if err != nil {
+            return
+        }
+
+        // Call Packed Function with extracted values
+        ret1, err := pfunc.Invoke(args[1].AsInt64(), args[2].AsInt64())
+        if err != nil {
+            return
+        }
+        if ret1.AsInt64() != ret.AsInt64() {
+            err = fmt.Errorf("Invoke with int64 didn't match with Value\n")
+            return
+        }
+        retVal = ret
+        return
+    }
+
+    RegisterFunction(sampleFunctionArg, "TestFunctionClosureArg.sampleFunctionArg");
+    funp, err := GetGlobalFunction("TestFunctionClosureArg.sampleFunctionArg")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke(funccall, 30, 50)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if result.AsInt64() != int64(80) {
+        t.Errorf("Expected result :80 got:%v\n", result.AsInt64())
+        return
+    }
+}
+
+// Check packed function returning a go-closure.
+func TestFunctionClosureReturn(t *testing.T) {
+    // sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+    sampleFunctionCb := func (args ...*Value) (retVal interface{}, err error) {
+        funccall := func (cargs ...*Value) (fret interface{}, ferr error) {
+            val1 := cargs[0].AsInt64()
+            val2 := cargs[1].AsInt64()
+            fret = int64(val1+val2)
+            return
+        }
+        retVal = funccall
+        return
+    }
+
+    RegisterFunction(sampleFunctionCb, "TestFunctionClosureReturn.sampleFunctionCb");
+    funp, err := GetGlobalFunction("TestFunctionClosureReturn.sampleFunctionCb")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    pfunc := result.AsFunction()
+    pfuncRet, err := pfunc.Invoke(30, 40)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if pfuncRet.AsInt64() != int64(70) {
+        t.Errorf("Expected result :70 got:%v\n", pfuncRet.AsInt64())
+        return
+    }
+}
+
+// Check packed function with no arguments and no return values.
+func TestFunctionNoArgsReturns(t *testing.T) {
+    sampleFunction := func (args ...*Value) (retVal interface{}, err error) {
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleFunction)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = fhandle.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
+
+// Check packed function returning a go-closure with no arg and returns.
+func TestFunctionNoArgsReturns2(t *testing.T) {
+    // sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+    sampleFunctionCb := func (args ...*Value) (retVal interface{}, err error) {
+        funccall := func (cargs ...*Value) (fret interface{}, ferr error) {
+            return
+        }
+        retVal = funccall
+        return
+    }
+
+    funp, err := ConvertFunction(sampleFunctionCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    pfunc := result.AsFunction()
+    _, err = pfunc.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
diff --git a/golang/src/gotvm.cc b/golang/src/gotvm.cc
new file mode 100644
index 000000000000..cf84e670df79
--- /dev/null
+++ b/golang/src/gotvm.cc
@@ -0,0 +1,195 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm native interface definition
+ * \file gotvm.cxx
+ */
+
+// Standard includes
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+
+// golang string compatible definition
+typedef struct { char *p; int n; } _gostring_;
+#include <string>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TVM runtime C interface
+#include <tvm/runtime/c_runtime_api.h>
+#include <dlpack/dlpack.h>
+
+/*!
+ * \brief Convert native char array to _gostring_ structure.
+ * _gostring_ structure represents the same memory footprint as golang string object.
+ *
+ * \param p is char pointer to a char array.
+ * \param l is the size of the char array. this method exclusively need length as
+ * its possible to have a bytearray in a string.
+ *
+ * \return _gostring_ object corresponding to native char array.
+ * Caller is responsible to free the memory block allocated here.
+ */
+static _gostring_ _native_to_gostring(const char *p, size_t l) {
+  _gostring_ ret;
+  ret.p = reinterpret_cast<char*>(malloc(l));
+  if (NULL == ret.p) {
+    ret.n = 0;
+    return ret;
+  }
+  memcpy(ret.p, p, l);
+  ret.n = l;
+  return ret;
+}
+
+/*!
+ * \brief embeds a 64bit uint value inside a string to serialize the data.
+ *
+ * \param s is string object.
+ * \param off is the offset in the string object.
+ * \param v is the uint64_t value which need to embed into given string.
+ */
+static void putuint64(std::string *s, size_t off, uint64_t v) {
+    for (int i = 0; i < 8; i++) {
+        (*s)[off + i] = (v >> (i * 8)) & 0xff;
+    }
+}
+
+// TVM runtime C interface wrappers
+
+/*!
+ * \brief Native interface to query TVM_VERSION in golang string format.
+ *
+ * \return char pointer to TVM-VERSION
+ */
+const char* _TVM_VERSION(void) {
+  const char *version = TVM_VERSION;
+  return version;
+}
+
+/*!
+ * \brief Native interface for getting TVMGlobal function list.
+ *
+ * \param names return by argument to return the function names.
+ * We wrap all strings into single string joined by (len+string)
+ * which is unpacked and processed in golang.
+ *
+ * \return c_runtime_api return status.
+ */
+int _TVMFuncListGlobalNames(_gostring_* names) {
+  int names_size;
+  char **names_array;
+  int result;
+
+  result = TVMFuncListGlobalNames(&names_size, (char const ***)&names_array);
+  if (result) {
+    return result;
+  }
+
+  size_t tot = 8;
+  for (int ii = 0; ii < names_size ; ++ii) {
+    tot += 8 + strlen(names_array[ii]);
+  }
+
+  std::string str;
+  str.resize(tot);
+  putuint64(&str, 0, names_size);
+  size_t off = 8;
+  for (int64_t ii = 0; ii < names_size ; ++ii) {
+    putuint64(&str, off, strlen(names_array[ii]));
+    off += 8;
+    str.replace(off, strlen(names_array[ii]), names_array[ii]);
+    off += strlen(names_array[ii]);
+  }
+  *names = _native_to_gostring(str.data(), str.size());
+  if (str.size() != names->n) {
+    TVMAPISetLastError("malloc failed during _native_to_gostring");
+    result = 1;
+  }
+  return result;
+}
+
+// Helpers for TVMValue
+
+/*!
+ * \brief Native helper to copy TVMValue from golang slice to native array.
+ * this helper is need as underlying momory for golang slice is not continueous.
+ *
+ * \param to_ptr is the native pointer of TVMValue array.
+ * \param from_ptr pointer to TVMValue in golang slice.
+ * \param array index in native array.
+ */
+void _TVMValueNativeSet(void* to_ptr, void* from_ptr, int ind) {
+  TVMValue *from_p = reinterpret_cast<TVMValue*>(from_ptr);
+  TVMValue *to_p = reinterpret_cast<TVMValue*>(to_ptr);
+  memcpy(to_p+ind, from_p, sizeof(TVMValue));
+}
+
+/*!
+ * \brief Native helper to copy TVMValue from golang slice to native array.
+ * this helper is need as underlying momory for golang slice is not continueous.
+ *
+ * \param to_ptr pointer to TVMValue in golang slice.
+ * \param from_ptr is the native pointer of TVMValue array.
+ * \param array index in native array.
+ */
+void _TVMValueNativeGet(void* to_ptr, void* from_ptr, int ind) {
+  TVMValue *from_p = reinterpret_cast<TVMValue*>(from_ptr);
+  TVMValue *to_p = reinterpret_cast<TVMValue*>(to_ptr);
+  memcpy(to_p, from_p+ind, sizeof(TVMValue));
+}
+
+extern int goTVMCallback(void*, void*, int, void*, void*);
+
+/*!
+ * \brief _TVMCallback is the TVM runtime callback function for PackedFunction system.
+ *
+ * \param args is an array of TVMValue
+ * \param type_codes is an array of int
+ * \param num_args is int representing number of in arguments
+ * \param ret is the return value handle to set the packed function return.
+ * \param resource_handle is the golang private data pointer.
+ *
+ * \returns the error status as TVM_DLL
+ */
+int _TVMCallback(TVMValue* args,
+                 int* type_codes,
+                 int num_args,
+                 TVMRetValueHandle ret,
+                 void* resource_handle) {
+    return goTVMCallback(args, type_codes, num_args, ret, resource_handle);
+}
+
+/*!
+ * _TVMPackedCFuncFinalizer is finalizer for packed function system.
+ *
+ */
+void _TVMPackedCFuncFinalizer(void* resource_handle) {
+    return;
+}
+
+/*!
+ * /brief _ConvertFunction creates a packed function for with given resource handle.
+ *
+ * /param fptr is the pointer to golang resource handle.
+ * /param *fhandle is the return argument holding packed function.
+ *
+ * /return is an int indicating the return status.
+ */
+int _ConvertFunction(void* fptr, TVMFunctionHandle *fhandle) {
+  int ret = TVMFuncCreateFromCFunc(_TVMCallback,
+                                   fptr,
+                                   _TVMPackedCFuncFinalizer,
+                                   fhandle);
+  return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/golang/src/gotvm.go b/golang/src/gotvm.go
new file mode 100644
index 000000000000..3f7aac93d769
--- /dev/null
+++ b/golang/src/gotvm.go
@@ -0,0 +1,24 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file gotvm.go
+ */
+
+
+// Package gotvm is TVM runtime interface definition for golang.
+//
+// Application need to import this package to access the c_runtime_api exposed by TVM.
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+// DLPackVersion is the dlpack version of tvm runtime.
+var DLPackVersion           = int(C.DLPACK_VERSION)
+// TVMVersion is the TVM runtime version.
+var TVMVersion              = getTVMVersion()
+
+func getTVMVersion() (retStr string) {
+    retStr = C.GoString(C._TVM_VERSION())
+    return
+}
diff --git a/golang/src/gotvm.h b/golang/src/gotvm.h
new file mode 100644
index 000000000000..e4487a362cca
--- /dev/null
+++ b/golang/src/gotvm.h
@@ -0,0 +1,42 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm native interface declaration.
+ * \file gotvm.h
+ *
+ * These declarations are in cgo interface definition while calling API
+ * across golang and native C boundaries.
+ */
+
+#ifndef GOTVM_GOTVM_H_
+#define GOTVM_GOTVM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <dlpack/dlpack.h>
+
+// Some type definitions for golang "C"
+typedef void* native_voidp;
+
+// Version
+extern char* _TVM_VERSION(void);
+
+// Wrappers : For incompatible cgo API.
+// To handle array of strings wrapped into __gostring__
+extern int _TVMFuncListGlobalNames(void*);
+// To handle TVMValue slice to/from native sequential TVMValue array.
+extern void _TVMValueNativeSet(void* to, void* from, int index);
+extern void _TVMValueNativeGet(void* to, void* from, int index);
+
+// Callbacks
+extern int _ConvertFunction(void* fptr, void* funp);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // GOTVM_GOTVM_H_
diff --git a/golang/src/gotvm_test.go b/golang/src/gotvm_test.go
new file mode 100644
index 000000000000..5058de400ba7
--- /dev/null
+++ b/golang/src/gotvm_test.go
@@ -0,0 +1,30 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file gotvm_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+)
+
+// Check TVMVersion API
+func TestTVMVersion(t *testing.T) {
+    if len(TVMVersion) == 0 {
+        t.Error("TVMVersion not set\n")
+    }
+    if reflect.TypeOf(TVMVersion).Kind() != reflect.String {
+        t.Error("TVMVersion type mismatch\n")
+    }
+}
+
+// Check DLPackVersion API
+func TestDLPackVersion(t *testing.T) {
+    if reflect.TypeOf(DLPackVersion).Kind() != reflect.Int {
+        t.Error("TVMVersion type mismatch\n")
+    }
+}
diff --git a/golang/src/module.go b/golang/src/module.go
new file mode 100644
index 000000000000..422cb6be20ff
--- /dev/null
+++ b/golang/src/module.go
@@ -0,0 +1,121 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMModule interface.
+ * \file module.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "errors"
+    "runtime"
+    "unsafe"
+)
+
+// Module type in golang hold pointer for the TVMModule handle.
+//
+// Module initialization happen through TVMModLoadFromFile api in TVM runtime.
+type Module uintptr
+
+// nativeCPtr returns type freed uintptr for the Module.
+func (tvmmodule *Module) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(*tvmmodule)
+    return
+}
+
+// LoadModuleFromFile loads the given module in TVM runtime.
+//
+// `modpath` is the path to tvm module.
+//
+// `args` is an optional arguments of ["dll", "dylib", "dso", "so"] with default value "so"
+//
+// returns pointer to Module and err or if any.
+func LoadModuleFromFile(modpath string, args ...interface{}) (retVal *Module, err error) {
+    modtype := "so"
+    if len(args) > 0 {
+       modtype  = args[0].(string)
+    }
+    var modp uintptr
+
+    cmodpath := C.CString(modpath)
+    cmodtype := C.CString(modtype)
+
+    ret := (int32)(C.TVMModLoadFromFile(cmodpath,
+                                        cmodtype,
+                                        (*_Ctype_TVMModuleHandle)(unsafe.Pointer(&modp))))
+
+    C.free(unsafe.Pointer(cmodpath))
+    C.free(unsafe.Pointer(cmodtype))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Module)
+    *handle = Module(modp)
+    finalizer := func(mhandle *Module) {
+        nativeTVMModFree(mhandle)
+        mhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// nativeTVMModFree free the module handle allocated in TVM runtime.
+//
+// `modp` is the Module handle to be freed.
+func nativeTVMModFree(modp *Module) (retVal int32) {
+    retVal = (int32) (C.TVMModFree(C.TVMModuleHandle(modp.nativeCPtr())))
+    return
+}
+
+// GetFunction returns the function pointer from the module for given function name.
+//
+// `tvmmodule` is handle for Module
+//
+// `funcname` function name in module.
+//
+// `args` variadic args of `queryImport`
+//
+// returns function closure with signature
+//         func (args ...interface{}) (interface{}, error) and error if any.
+//
+// The closure function can be used to call Function with arguments directly.
+//
+// Variadic arguments can be any type which can be embed into Value.
+func (tvmmodule *Module) GetFunction (
+      funcname string, args ...interface{}) (
+      retVal *Function, err error){
+    queryImports := int32(1)
+    if len(args) > 0 {
+        queryImports = int32(args[1].(int))
+    }
+
+    var funp uintptr
+    cfuncname := C.CString(funcname)
+    ret := (int32)(C.TVMModGetFunction((_Ctype_TVMModuleHandle)(*tvmmodule),
+                                       cfuncname,
+                                       C.int(queryImports),
+                                       (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+    C.free(unsafe.Pointer(cfuncname))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
diff --git a/golang/src/module_test.go b/golang/src/module_test.go
new file mode 100644
index 000000000000..fac094438e96
--- /dev/null
+++ b/golang/src/module_test.go
@@ -0,0 +1,93 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file module_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+)
+
+// Check module loading - dll
+func TestModuleTestLoad1(t *testing.T) {
+    // dll
+    mod, err := LoadModuleFromFile("./deploy.so", "dll")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - dylib
+func TestModuleTestLoad2(t *testing.T) {
+    // dylib
+    mod, err := LoadModuleFromFile("./deploy.so", "dylib")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+func TestModuleTestLoad3(t *testing.T) {
+    // dso
+    mod, err := LoadModuleFromFile("./deploy.so", "dso")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - so
+func TestModuleTestLoad4(t *testing.T) {
+    // so
+    mod, err := LoadModuleFromFile("./deploy.so", "so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - default (so)
+func TestModuleTestLoad5(t *testing.T) {
+    // default type as so
+    mod, err := LoadModuleFromFile("./deploy.so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading err
+func TestModuleTestLoadErr(t *testing.T) {
+    // Unknown file should return error
+    _, err := LoadModuleFromFile("xyzabc.so")
+    if err == nil {
+        t.Error("Expected an error, but not received\n")
+        return
+    }
+}
+
diff --git a/golang/src/ndarray.go b/golang/src/ndarray.go
new file mode 100644
index 000000000000..ceae7e58c203
--- /dev/null
+++ b/golang/src/ndarray.go
@@ -0,0 +1,329 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMArray aka DLTensor
+ * \file ndarray.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+    "fmt"
+    "errors"
+    "runtime"
+    "reflect"
+)
+
+// Array type in golang hold pointer for the TVMArray object from dlpack.
+//
+// Array initialization happen through Empty api
+type Array uintptr
+
+// nativeCPtr returns type freed uintptr for the Array.
+func (parray Array) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(parray)
+    return
+}
+
+func (parray Array) nativeCopyFrom(data unsafe.Pointer, datalen int) (err error) {
+    ret := C.TVMArrayCopyFromBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+                                   data,
+                                   C.ulong(datalen))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+    }
+    return
+}
+
+// CopyFrom copies given golang data slice into Array.
+//
+// `val` is interface homding a slice of Array data type.
+//
+// returns err is any.
+// TOD: Use reflections for better handling
+func (parray Array) CopyFrom(val interface{}) (err error) {
+    var data unsafe.Pointer
+    var datalen int
+    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+
+    switch val.(type) {
+        case []int8:
+            sliceVal := val.([]int8)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int16:
+            sliceVal := val.([]int16)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int32:
+            sliceVal := val.([]int32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int64:
+            sliceVal := val.([]int64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint8:
+            sliceVal := val.([]uint8)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+         case []uint16:
+            sliceVal := val.([]uint16)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint32:
+            sliceVal := val.([]uint32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint64:
+            sliceVal := val.([]uint64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []float32:
+            sliceVal := val.([]float32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []float64:
+            sliceVal := val.([]float64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        default:
+            err = fmt.Errorf("Given type not supported : %v\n", reflect.TypeOf(val))
+            return
+    }
+    return
+}
+
+func (parray Array) nativeCopyTo (data unsafe.Pointer, datalen int) (err error){
+    ret := C.TVMArrayCopyToBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+                                  unsafe.Pointer(data),
+                                  C.ulong(datalen))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+    }
+   return
+}
+
+// AsSlice returns the unitptr of for the data inside Array.
+//
+// returns the slice of array inside Array and err of any.
+// TOD: Use reflections for better handling
+func (parray Array) AsSlice() (retVal interface{}, err error) {
+    shape := parray.GetShape()
+    size := int64(1)
+    var data unsafe.Pointer
+    var datalen int
+
+    for ii := range shape {
+        size *= shape[ii]
+    }
+    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+
+    switch parray.GetDType() {
+        case "int8":
+            sliceVal := make([]int8, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int16":
+            sliceVal := make([]int16, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int32":
+            sliceVal := make([]int32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int64":
+            sliceVal := make([]int64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint8":
+            sliceVal := make([]uint8, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint16":
+            sliceVal := make([]uint16, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint32":
+            sliceVal := make([]uint32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint64":
+            sliceVal := make([]uint64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "float32":
+            sliceVal := make([]float32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "float64":
+            sliceVal := make([]float64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        default:
+            err = fmt.Errorf("Given type not supported : %v\n", parray.GetDType())
+            return
+    }
+    return
+}
+
+// GetNdim returns the number of dimentions in Array
+func (parray Array) GetNdim() (retVal int32) {
+    retVal = int32(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ndim)
+    return
+}
+
+// GetShape returns the number of dimentions in Array
+func (parray Array) GetShape() (retVal []int64) {
+    shapePtr := (*C.int64_t)(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).shape)
+    ndim := parray.GetNdim()
+
+    shapeSlice := (*[1<<31] int64)(unsafe.Pointer(shapePtr))[:ndim:ndim]
+    retVal = make([]int64, ndim)
+    copy(retVal, shapeSlice)
+    return
+}
+
+// GetDType returns the number of dimentions in Array
+func (parray Array) GetDType() (retVal string) {
+    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+    retVal, _ = dtypeFromTVMType(*(*pTVMType)(unsafe.Pointer(&ret)))
+    return
+}
+
+// GetCtx returns the number of dimentions in Array
+func (parray Array) GetCtx() (retVal Context) {
+    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ctx
+    retVal = *(*Context)(unsafe.Pointer(&ret))
+    return
+}
+
+// nativeTVMArrayAlloc is used to allocate TVMArray from given attributes.
+//
+// `shape` is int64 slice holding shape of the Array to be created.
+//
+// `ndim` is the rank of the Array to be created.
+//
+// `dtypeCode`, `dtypeBits` and `dtypeLanes` describe the data type in Array.
+//
+// `deviceType` indicates the device on whose memory the Array to allocated.
+//
+// `deviceID` indicates device index if multiple devices of same type present.
+//
+// return argument holding native pointer to newly created Array and error is any.
+func nativeTVMArrayAlloc(shape []int64, ndim int32,
+                   dtypeCode int32, dtypeBits int32, dtypeLanes int32,
+                   deviceType int32, deviceID int32) (retVal uintptr, err error) {
+    ret := (int32)(C.TVMArrayAlloc((*_Ctype_long)(&(shape[0])),
+                                   C.int(ndim),
+                                   C.int(dtypeCode),
+                                   C.int(dtypeBits),
+                                   C.int(dtypeLanes),
+                                   C.int(deviceType),
+                                   C.int(deviceID),
+                                   (*_Ctype_TVMArrayHandle)(unsafe.Pointer(&retVal))))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+    return
+}
+
+// Empty is used to allocate TVM empty array of given epecification.
+//
+// `shape` is int64 slice holding shape of the Array
+//
+// `args` is variadic args for
+//
+//        `args[0]` is string for data type. Default value is 'float32'
+//
+//        `args[1]` is Context. Default value is '{KDLCPU, 0}'
+//
+// returns pointer to Array on successful execution and error if any.
+func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
+    typeName := "float32"
+    ctx := Context{KDLCPU, 0}
+
+    if len(shape) < 1 {
+        err = fmt.Errorf("Invalid shape for Array creation: %v\n", len(shape))
+        return
+    }
+
+    for i, val := range args {
+        switch val.(type) {
+            case string:
+                typeName = args[i].(string)
+            case Context:
+                ctx = args[i].(Context)
+            default:
+                err = fmt.Errorf("Invalid Optional Argument Type: %T\n", val)
+                return
+        }
+    }
+
+    tvmType, err := dtypeToTVMType(typeName)
+    if err != nil {
+        return
+    }
+    ndim := int32(len(shape))
+    newArray, err := nativeTVMArrayAlloc(shape, ndim, int32(tvmType.code),
+                                    int32(tvmType.bits), int32(tvmType.lanes),
+                                    ctx.DeviceType, ctx.DeviceID)
+    if err != nil {
+        return
+    }
+    handle := new(Array)
+    *handle = Array(newArray)
+
+    finalizer := func (ahandle *Array) {
+        nativeTVMArrayFree(*ahandle)
+        ahandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    parray = handle
+    return
+}
+
+// nativeTVMArrayFree is used to release the Array.
+//
+// `parray` is the Array handle.
+//
+// `ret` indicates the status of this api execution.
+func nativeTVMArrayFree(parray Array) (retVal int32) {
+    retVal = (int32)(C.TVMArrayFree((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr()))))
+    return
+}
diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc
new file mode 100644
index 000000000000..718a79eb7445
--- /dev/null
+++ b/golang/src/tvm_runtime_pack.cc
@@ -0,0 +1,49 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief This is an all in one TVM runtime file.
+ * \file tvm_runtime_pack.cc
+ */
+#include "src/runtime/c_runtime_api.cc"
+#include "src/runtime/cpu_device_api.cc"
+#include "src/runtime/workspace_pool.cc"
+#include "src/runtime/module_util.cc"
+#include "src/runtime/module.cc"
+#include "src/runtime/registry.cc"
+#include "src/runtime/file_util.cc"
+#include "src/runtime/threading_backend.cc"
+#include "src/runtime/thread_pool.cc"
+#include "src/runtime/ndarray.cc"
+
+// NOTE: all the files after this are optional modules
+// that you can include remove, depending on how much feature you use.
+
+// Likely we only need to enable one of the following
+// If you use Module::Load, use dso_module
+// For system packed library, use system_lib_module
+#include "src/runtime/dso_module.cc"
+#include "src/runtime/system_lib_module.cc"
+
+// Graph runtime
+#include "src/runtime/graph/graph_runtime.cc"
+
+// Uncomment the following lines to enable RPC
+// #include "../../src/runtime/rpc/rpc_session.cc"
+// #include "../../src/runtime/rpc/rpc_event_impl.cc"
+// #include "../../src/runtime/rpc/rpc_server_env.cc"
+
+// These macros enables the device API when uncommented.
+#define TVM_CUDA_RUNTIME 1
+#define TVM_METAL_RUNTIME 1
+#define TVM_OPENCL_RUNTIME 1
+
+// Uncomment the following lines to enable Metal
+// #include "../../src/runtime/metal/metal_device_api.mm"
+// #include "../../src/runtime/metal/metal_module.mm"
+
+// Uncomment the following lines to enable CUDA
+// #include "../../src/runtime/cuda/cuda_device_api.cc"
+// #include "../../src/runtime/cuda/cuda_module.cc"
+
+// Uncomment the following lines to enable OpenCL
+// #include "../../src/runtime/opencl/opencl_device_api.cc"
+// #include "../../src/runtime/opencl/opencl_module.cc"
diff --git a/golang/src/type.go b/golang/src/type.go
new file mode 100644
index 000000000000..27364295bf8b
--- /dev/null
+++ b/golang/src/type.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package for TVMType interface
+ * \file type.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "fmt"
+)
+
+// pTVMType corresponding to data types.
+type pTVMType struct {
+    code uint8
+    bits uint8
+    lanes uint16
+}
+
+// data type to pTVMType mapping
+var dtypeMap = map[string] pTVMType {
+    "int8": pTVMType{0, 8, 1},
+    "int16": pTVMType{0, 16, 1},
+    "int32": pTVMType{0, 32, 1},
+    "int64": pTVMType{0, 64, 1},
+    "uint8": pTVMType{1, 8, 1},
+    "uint16": pTVMType{1, 16, 1},
+    "uint32": pTVMType{1, 32, 1},
+    "uint64": pTVMType{1, 64, 1},
+    "float32": pTVMType{2, 32, 1},
+    "float64": pTVMType{2, 64, 1},
+}
+
+// dtypeFromTVMType return the pTVMType corresponding to given dtype
+//
+// `dtype` string for the given data type.
+func dtypeFromTVMType(tvmtype pTVMType) (retVal string, err error) {
+    for k, v := range dtypeMap {
+        if v.code == tvmtype.code && v.bits == tvmtype.bits && v.lanes == tvmtype.lanes {
+            retVal = k
+            return
+        }
+    }
+
+    err = fmt.Errorf("Cannot map TVMType:%v to dtype", tvmtype)
+    return
+}
+
+// dtypeToTVMType return the pTVMType corresponding to given dtype
+//
+// `dtype` string for the given data type.
+func dtypeToTVMType(args ...interface{}) (tvmtype pTVMType, err error) {
+    dtype := args[0].(string)
+    lanes := 1
+
+    if len(args) == 2 {
+        lanes = args[1].(int)
+    }
+
+    for k, v := range dtypeMap {
+        if k == dtype {
+            tvmtype = v
+            tvmtype.lanes = uint16(lanes)
+            return
+        }
+    }
+    err = fmt.Errorf("Cannot map dtype:%v to TVMType", dtype)
+    return
+}
diff --git a/golang/src/util.go b/golang/src/util.go
new file mode 100644
index 000000000000..aa5a6016c97f
--- /dev/null
+++ b/golang/src/util.go
@@ -0,0 +1,24 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for common utilities
+ * \file util.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// Native string map for go string
+type nativeGoString struct { p uintptr; n int32 }
+
+func goStringFromNative (s string) (retStr string) {
+    p := *(*nativeGoString)(unsafe.Pointer(&s))
+    retStr = string((*[0x7fffffff]byte)(unsafe.Pointer(p.p))[:p.n])
+    C.free(unsafe.Pointer(p.p))
+    return
+}
diff --git a/golang/src/value.go b/golang/src/value.go
new file mode 100644
index 000000000000..2a953560f237
--- /dev/null
+++ b/golang/src/value.go
@@ -0,0 +1,360 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMValue interface
+ * \file value.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "fmt"
+    "runtime"
+    "unsafe"
+)
+
+// KHandle is golang type code for TVM enum kHandle.
+var KHandle                 = int32(C.kHandle)
+// KNull is golang type code for TVM kNull.
+var KNull                   = int32(C.kNull)
+// KTVMType is golang type code for TVM kTVMType.
+var KTVMType                = int32(C.kTVMType)
+// KTVMContext is golang type code for TVM kTVMContext.
+var KTVMContext             = int32(C.kTVMContext)
+// KArrayHandle is golang type code for TVM kArrayHandle.
+var KArrayHandle            = int32(C.kArrayHandle)
+// KNodeHandle is golang type code for TVM kNodeHandle.
+var KNodeHandle             = int32(C.kNodeHandle)
+// KModuleHandle is gonag type code for TVM kModuleHandle.
+var KModuleHandle           = int32(C.kModuleHandle)
+// KFuncHandle is gonalg type code for TVM kFuncHandle.
+var KFuncHandle             = int32(C.kFuncHandle)
+// KStr is golang type code for TVM kStr.
+var KStr                    = int32(C.kStr)
+// KBytes is golang type code for TVM kBytes.
+var KBytes                  = int32(C.kBytes)
+// KNDArrayContainer is golang typecode for kNDArrayContainer.
+var KNDArrayContainer       = int32(C.kNDArrayContainer)
+// KExtBegin is golang enum corresponding to TVM kExtBegin.
+var KExtBegin               = int32(C.kExtBegin)
+// KNNVMFirst is golang enum corresponding to TVM kNNVMFirst.
+var KNNVMFirst              = int32(C.kNNVMFirst)
+// KNNVMLast is golang enum corresponding to TVM kNNVMLast.
+var KNNVMLast               = int32(C.kNNVMLast)
+// KExtReserveEnd is golang enum corresponding to TVM kExtReserveEnd.
+var KExtReserveEnd          = int32(C.kExtReserveEnd)
+// KExtEnd is golang enum corresponding to TVM kExtEnd.
+var KExtEnd                 = int32(C.kExtEnd)
+// KDLInt is golang type code for TVM kDLInt.
+var KDLInt                  = int32(C.kDLInt)
+// KDLUInt is golang type code for TVM kDLUInt.
+var KDLUInt                 = int32(C.kDLUInt)
+// KDLFloat is golang type code for TVM kDLFloat.
+var KDLFloat                = int32(C.kDLFloat)
+
+// Value Typemap for union exposed by TVM runtime API.
+//
+// gotvm maps it to a uintptr and then dynamically allocates memory by newTVMValue method.
+type Value struct {
+    nptr  uintptr
+    dtype int32
+    isLocal bool
+}
+
+// AsInt64 returns the int64 value inside the Value.
+func (tvmval *Value)  AsInt64() (retVal int64) {
+    retVal = tvmval.getVInt64()
+    return
+}
+
+// AsFloat64 returns the Float64 value inside the Value.
+func (tvmval *Value)  AsFloat64() (retVal float64) {
+    retVal = tvmval.getVFloat64()
+    return
+}
+
+// AsModule returns the Module inside the Value.
+func (tvmval *Value)  AsModule() (retVal *Module) {
+    mhandle := tvmval.getVMHandle()
+    retVal = &mhandle
+    return
+}
+
+// AsFunction returns the Function inside the Value.
+func (tvmval *Value)  AsFunction() (retVal *Function) {
+    fhandle := tvmval.getVFHandle()
+    retVal = &fhandle
+
+    return
+}
+
+// AsBytes returns the byte slice value inside the Value.
+func (tvmval *Value)  AsBytes() (retVal []byte) {
+    retVal = tvmval.getVBHandle().getData()
+    return
+}
+
+// AsStr returns the golang string in the Value.
+func (tvmval *Value) AsStr() (retVal string) {
+    str := tvmval.getVStr()
+    retVal = str
+    return
+}
+
+// nativeCPtr return the unitptr corresponding to Value type.
+func (tvmval *Value) nativeCPtr() (ret uintptr) {
+    ret = (uintptr)(tvmval.nptr)
+    return
+}
+
+// moveFrom copies the tvmval from other Value object.
+func (tvmval *Value) moveFrom(fromval *Value) () {
+    C.memcpy(unsafe.Pointer(tvmval.nativeCPtr()),
+             unsafe.Pointer(fromval.nativeCPtr()),
+             C.sizeof_TVMValue)
+
+    // Move the dtype too.
+    tvmval.dtype = fromval.dtype
+    fromval.dtype = KNull
+    return
+}
+
+// setVInt64 initializes the Value object with given int64 value.
+//
+// `val` is the int64 value to initialize the Value
+func (tvmval *Value) setVInt64(val int64) {
+    valp := (*C.int64_t)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.int64_t(val)
+    tvmval.dtype = KDLInt
+    return
+}
+
+
+// getVInt64 returns the int64 value inside the Value.
+func (tvmval *Value) getVInt64() (retVal int64) {
+    valp := (*C.int64_t)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = int64(*valp)
+    return
+}
+
+// setVFloat64 initializes the Value object with given float64 value.
+//
+// `val` is the float64 value to initialize the Value.
+func (tvmval *Value) setVFloat64(val float64) {
+    valp := (*C.double)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.double(val)
+    tvmval.dtype = KDLFloat
+    return
+}
+
+// getVFloat64 returns the float64 value inside Value.
+func (tvmval *Value) getVFloat64() (retVal float64) {
+    valp := (*C.double)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = float64(*valp)
+    return
+}
+
+// setVHandle initializes the handle inside the Value.
+//
+// Can be used to store any uintptr type object like
+// module handle, function handle and any object's nativeCPtr.
+//
+// `val` is the uintptr type of given handle.
+func (tvmval *Value) setVHandle(val uintptr) {
+    valp := (**C.void)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = (*C.void)(unsafe.Pointer(val))
+}
+
+// getVHandle returns the uintptr handle
+func (tvmval *Value) getVHandle() (retVal uintptr) {
+    valp := (**C.void)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = uintptr(unsafe.Pointer(*valp))
+    return
+}
+
+// setVStr intializes the Value with given golang string object.
+//
+// `val` is the golang string object used to initialize the Value.
+func (tvmval *Value) setVStr(val string) {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.CString(val)
+    tvmval.dtype = KStr
+    return
+}
+
+
+// getVStr returns the golang string for the native string inside Value.
+func (tvmval *Value) getVStr() (retVal string) {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = C.GoString(*valp)
+    return
+}
+
+// unSetVStr release the memory allocated in setVStr
+func (tvmval *Value) unSetVStr() {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+	C.free(unsafe.Pointer(*valp))
+    tvmval.dtype = KNull
+}
+
+// setVAHandle is used to set Array handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead too.
+// This is a wrapper to accept Array directly.
+func (tvmval *Value) setVAHandle(ptvmarray Array) {
+    tvmval.setVHandle(ptvmarray.nativeCPtr())
+    tvmval.dtype = KArrayHandle
+    return
+}
+
+// getVAHandle is used to get Array handle in Value.
+func (tvmval *Value) getVAHandle() (retVal Array) {
+	retVal = (Array)(tvmval.getVHandle())
+    return
+}
+
+// setVMHandle is used to set Module handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead too.
+// This is a wrapper to accept Module directly.
+func (tvmval *Value) setVMHandle(tvmmodule Module) {
+    tvmval.setVHandle(tvmmodule.nativeCPtr())
+    tvmval.dtype = KModuleHandle
+    return
+}
+
+// getVMHandle is used to get Module handle in Value.
+func (tvmval *Value) getVMHandle() (retVal Module) {
+	retVal = (Module)(tvmval.getVHandle())
+    return
+}
+
+// setVFHandle is used to set Function handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead.
+// This is a wrapper to accept Function directly.
+func (tvmval *Value) setVFHandle(tvmfunction Function) {
+    tvmval.setVHandle(tvmfunction.nativeCPtr())
+    tvmval.dtype = KFuncHandle
+    return
+}
+
+// getVFHandle is used to get Function handle in Value.
+func (tvmval *Value) getVFHandle() (retVal Function) {
+	retVal = (Function)(tvmval.getVHandle())
+    return
+}
+
+// setVBHandle is used to set ByteArray handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead.
+// This is a wrapper to accept ByteArray directly.
+func (tvmval *Value) setVBHandle(tbytearray ByteArray) {
+    tvmval.setVHandle(tbytearray.nativeCPtr())
+    tvmval.dtype = KBytes
+    return
+}
+
+// getVBHandle is used to get ByteArray handle in Value.
+func (tvmval *Value) getVBHandle() (retVal ByteArray) {
+	retVal = (ByteArray)(tvmval.getVHandle())
+    return
+}
+
+// setValue is used to set the given value in Value.
+//
+// `val` is value of types accepted by Value container or native union.
+func (tvmval *Value) setValue(val interface{}) (retVal int32, err error) {
+    retVal = KNull
+    switch val.(type) {
+        case string:
+            tvmval.setVStr(val.(string))
+        case uint8:
+            tvmval.setVInt64(int64(val.(uint8)))
+        case uint16:
+            tvmval.setVInt64(int64(val.(uint16)))
+        case uint32:
+            tvmval.setVInt64(int64(val.(uint32)))
+        case uint64:
+            tvmval.setVInt64(int64(val.(uint64)))
+        case int:
+            tvmval.setVInt64(int64(val.(int)))
+        case int8:
+            tvmval.setVInt64(int64(val.(int8)))
+        case int16:
+            tvmval.setVInt64(int64(val.(int16)))
+        case int32:
+            tvmval.setVInt64(int64(val.(int32)))
+        case int64:
+            tvmval.setVInt64(val.(int64))
+        case float32:
+            tvmval.setVFloat64(float64(val.(float32)))
+        case float64:
+            tvmval.setVFloat64(val.(float64))
+        case *Module:
+            tvmval.setVMHandle(*(val.(*Module)))
+        case *Function:
+            tvmval.setVFHandle(*(val.(*Function)))
+        case *ByteArray:
+            tvmval.setVBHandle(*(val.(*ByteArray)))
+        case []byte:
+            barray := newByteArray(val.([]byte))
+            tvmval.setVBHandle(barray)
+        case *Array:
+            tvmval.setVAHandle(*(val.(*Array)))
+        case func (args ...*Value) (interface{}, error):
+            fhandle, apierr := ConvertFunction(val)
+            if apierr != nil {
+                err = fmt.Errorf("Given value Type not defined for Value: %v : %T\n", val, val);
+                return
+            }
+            tvmval.setVFHandle(*fhandle)
+
+            // Clear the finalizer as we don't need to control it anymore.
+            runtime.SetFinalizer(fhandle, nil)
+        case *Value:
+            tvmval.moveFrom(val.(*Value))
+        case Value:
+            fromval := val.(Value)
+            tvmval.moveFrom(&fromval)
+        default:
+            err = fmt.Errorf("Given value Type not defined for Value: %v : %T\n", val, val);
+    }
+    retVal = tvmval.dtype
+    return
+}
+
+// newTVMValue initialize the TVMValue native object.
+//
+// This is intended to use as intermediate type between native and golang types.
+// Allocated from FuncCall or Callback to handle conversions.
+func newTVMValue() (retVal *Value) {
+    handle := new(Value)
+
+    handle.nptr = (uintptr(C.malloc(C.sizeof_TVMValue)))
+    handle.dtype = KNull
+    handle.isLocal = true
+    finalizer := func(vhandle *Value) {
+        vhandle.deleteTVMValue()
+        vhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// deleteTVMValue free the native Value object which is allocated in newTVMValue.
+func (tvmval Value) deleteTVMValue() {
+    if tvmval.isLocal == true {
+        if tvmval.dtype == KStr {
+            tvmval.unSetVStr()
+        }
+        if tvmval.dtype == KBytes {
+            tvmval.getVBHandle().deleteTVMByteArray()
+        }
+    }
+
+	C.free(unsafe.Pointer(tvmval.nativeCPtr()))
+}
diff --git a/golang/src/value_test.go b/golang/src/value_test.go
new file mode 100644
index 000000000000..251af82cb7b9
--- /dev/null
+++ b/golang/src/value_test.go
@@ -0,0 +1,237 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file value_test.go
+ */
+
+package gotvm
+
+import (
+    "testing"
+    "math/rand"
+    "strings"
+)
+
+// Check Int64 Value looping via packed function calling another packed function.
+func TestValueLoopInt64(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Int63()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if retVal.AsInt64() != result {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Int32 Value looping via packed function calling another packed function.
+func TestValueLoopInt32(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Int31()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsInt64() != int64(result) {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Float32 Value looping via packed function calling another packed function.
+func TestValueLoopFloat32(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Float32()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsFloat64() != float64(result) {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Float64 Value looping via packed function calling another packed function.
+func TestValueLoopFloat64(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Float64()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsFloat64() != result {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+func TestValueLoopString(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        argStr := args[1].AsStr()
+        // Call Packed Function by Value
+        return pfunc.Invoke(argStr)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal =  args[0].AsStr()
+        return
+    }
+
+    retVal, err := fhandle.Invoke(funccall, "TestString")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    vStr := retVal.AsStr()
+    if strings.Compare(vStr, string("TestString")) != 0  {
+        t.Errorf("Expected : %v got:%v\n", string("TestString"), vStr)
+        return
+    }
+}
+
+// Check []byte Value looping via packed function calling another packed function.
+func TestValueLoopByteSlice(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        argBytes := args[1].AsBytes()
+        // Call Packed Function by Value
+        return pfunc.Invoke(argBytes)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0].AsBytes()
+        return
+    }
+
+    result := make([]byte, 1024)
+    rand.Read(result)
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    received := retVal.AsBytes()
+    if len(result) != len(received) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(result), len(received))
+            return
+    }
+    for i := range result {
+        if result[i] != received[i] {
+            t.Errorf("Data expected: %v Got :%v at index %v\n", result[i], received[i], i)
+            return
+        }
+    }
+}

From aa0a7b50255df605f4d0bf79e57fe33013bbd6ab Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Sun, 9 Dec 2018 14:05:40 -0800
Subject: [PATCH 481/529] Improve CanonicalSimplify to handle Min, Max(#2248)
 (#2261)

Also enable Mul caching for more cases
---
 src/arithmetic/canonical.cc                  | 20 ++++++++++++++++++-
 tests/cpp/ir_simplify_test.cc                | 19 ++++++++++++++++++
 tests/python/unittest/test_arith_simplify.py | 21 ++++++++++++++++----
 3 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index 8f913ccd4350..2151ebf2adba 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -236,6 +236,24 @@ class Canonical::Internal : public IRMutator {
   bool EnableOpt(Type t) const {
     return (t.lanes() == 1 && (t.is_int() || t.is_uint()));
   }
+  // Max
+  Expr Mutate_(const Max* op, const Expr& e) final {
+    CacheEntry a = Produce(op->a);
+    CacheEntry b = Produce(op->b);
+    if (a.has_side_effect || b.has_side_effect) {
+      return Binary_(op, e, a.value, b.value);
+    }
+    return Binary(op, e);
+  }
+  // Min
+  Expr Mutate_(const Min* op, const Expr& e) final {
+    CacheEntry a = Produce(op->a);
+    CacheEntry b = Produce(op->b);
+    if (a.has_side_effect || b.has_side_effect) {
+      return Binary_(op, e, a.value, b.value);
+    }
+    return Binary(op, e);
+  }
   // Add
   Expr Mutate_(const Add* op, const Expr& e) final {
     if (!EnableOpt(op->type)) {
@@ -277,7 +295,7 @@ class Canonical::Internal : public IRMutator {
     } else if (is_const(b.value)) {
       return SumMulConst(a.AsSum(), b.value);
     } else {
-      return Binary_(op, e, a.value, b.value);
+      return Binary(op, e);
     }
   }
   // Variable
diff --git a/tests/cpp/ir_simplify_test.cc b/tests/cpp/ir_simplify_test.cc
index 0667dc27367c..8114bb51b771 100644
--- a/tests/cpp/ir_simplify_test.cc
+++ b/tests/cpp/ir_simplify_test.cc
@@ -1,5 +1,6 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
+#include <tvm/ir_pass.h>
 #include <tvm/tvm.h>
 #include <arithmetic/Simplify.h>
 
@@ -8,6 +9,24 @@ TEST(IRSIMPLIFY, Basic) {
   simplify_test();
 }
 
+TEST(IRSIMPLIFY, MinMax) {
+  auto x = tvm::var("x");
+  auto e1 = (tvm::max(x, 1) - tvm::max(x, 1)) ;
+  auto e1s = tvm::ir::CanonicalSimplify(e1);
+  CHECK(is_zero(e1s));
+
+  auto e2 = (x * tvm::min(x, 1)) - (x * tvm::min(x, 1));
+  auto e2s = tvm::ir::CanonicalSimplify(e2);
+  CHECK(is_zero(e2s));
+}
+
+TEST(IRSIMPLIFY, Mul) {
+  auto x = tvm::var("x");
+  auto e = (x * x) - (x * x) ;
+  auto es = tvm::ir::CanonicalSimplify(e);
+  CHECK(is_zero(es));
+}
+
 int main(int argc, char ** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::FLAGS_gtest_death_test_style = "threadsafe";
diff --git a/tests/python/unittest/test_arith_simplify.py b/tests/python/unittest/test_arith_simplify.py
index e9315eda3257..f6a78b6e3770 100644
--- a/tests/python/unittest/test_arith_simplify.py
+++ b/tests/python/unittest/test_arith_simplify.py
@@ -46,6 +46,21 @@ def test_simplify_mod():
         (j + n * 32) % 16, {j: tvm.Range(0, 6)})
     assert index == j
 
+def test_simplify_minmax():
+    x = tvm.var('x')
+    e1 = tvm.max(x, 1) - tvm.max(x, 1)
+    e1s = tvm.ir_pass.CanonicalSimplify(e1)
+    assert e1s.value == 0
+
+    e2 = tvm.min(x, 1) - tvm.min(x, 1)
+    e2s = tvm.ir_pass.CanonicalSimplify(e2)
+    assert e2s.value == 0
+
+def test_mul():
+    x = tvm.var('x')
+    e = x * x - x * x
+    es = tvm.ir_pass.CanonicalSimplify(e)
+    assert es.value == 0
 
 def test_modular():
     rx = tvm.var("rx")
@@ -62,11 +77,9 @@ def test_modular():
     assert tvm.ir_pass.CanonicalSimplify(z1 - (ry + y)).value == 0
     assert tvm.ir_pass.CanonicalSimplify(z2 - (rx + x)).value == 0
 
-
-
-
-
 if __name__ == "__main__":
     test_simplify_mod()
     test_modular()
     test_simplify()
+    test_mul()
+    test_simplify_minmax()
\ No newline at end of file

From 83b0f5a935dea0ed8a2f22def5035295958e67e6 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Mon, 10 Dec 2018 20:35:31 -0800
Subject: [PATCH 482/529] [Hybrid Script] Support logical and/or; support 0 < a
 < 5 clause (#2264)

---
 python/tvm/hybrid/parser.py                 | 34 ++++++++++++++++++---
 tests/python/unittest/test_hybrid_script.py | 24 +++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index ee550ab623cb..539115d8b6f4 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -8,6 +8,8 @@
 from .intrin import LOOP_INTRIN, MATH_INTRIN
 from .var_decl import determine_variable_usage
 from ..api import thread_axis
+from ..api import all as _all
+from ..api import any as _any
 from .. import expr as _expr
 from .. import make as _make
 from .. import intrin
@@ -47,6 +49,8 @@ class HybridParser(ast.NodeVisitor):
         ast.LtE   : operator.le,
         ast.Eq    : operator.eq,
         ast.NotEq : operator.ne,
+        ast.And   : _all,
+        ast.Or    : _any,
     }
 
 
@@ -282,11 +286,31 @@ def visit_IfExp(self, node):
 
 
     def visit_Compare(self, node):
-        lhs = self.visit(node.left)
-        _internal_assert(len(node.ops) == 1, "Only one compare op is supported!")
-        _internal_assert(len(node.comparators) == 1, "Only one comparator is supported!")
-        rhs = self.visit(node.comparators[0])
-        return HybridParser._binop_maker[type(node.ops[0])](lhs, rhs)
+        _internal_assert(len(node.ops) == len(node.comparators),
+                         "#compare ops != #comparators")
+        ops = [self.visit(node.left)]
+        ops += [self.visit(i) for i in node.comparators]
+        res = []
+        for i in range(len(node.ops)):
+            lhs = ops[i]
+            rhs = ops[i + 1]
+            res.append(HybridParser._binop_maker[type(node.ops[i])](lhs, rhs))
+        return _all(*res)
+
+
+    def visit_BoolOp(self, node):
+        n = len(node.values)
+        if n == 1:
+            _internal_assert(isinstance(node.op, ast.Not), \
+                             "Unary is supposed to be not!")
+            return operator.not_(self.visit(node.values[0]))
+        elif n == 2:
+            _internal_assert(isinstance(node.op, (ast.And, ast.Or)), \
+                             "Binary is supposed to be and/or!")
+            values = [self.visit(i) for i in node.values]
+            return HybridParser._binop_maker[type(node.op)](*values)
+        else:
+            raise ValueError("This Bool Op is not supported yet!")
 
 
     def visit_UnaryOp(self, node):
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 3304039d7400..c718fc66899a 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -237,6 +237,30 @@ def if_then_else(a):
 
     run_and_check(if_then_else, [a])
 
+    @script
+    def if_triple_condition(a):
+        b = output_tensor((10, ), 'int32')
+        for i in range(10):
+            if 0 <= i < 5:
+                b[i] = a[i]
+            else:
+                b[i] = a[i] + 1
+        return b
+
+    run_and_check(if_triple_condition, [a])
+
+    @script
+    def if_and(a):
+        b = output_tensor((10, ), 'int32')
+        for i in range(10):
+            if i >= 0 and i < 5:
+                b[i] = a[i]
+            else:
+                b[i] = a[i] + 1
+        return b
+
+    run_and_check(if_and, [a])
+
 
 def test_bind():
     if not tvm.gpu(0).exist:

From 3c11befee915f1b72cbdf4f5708a979d7144fdaa Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Mon, 10 Dec 2018 20:35:51 -0800
Subject: [PATCH 483/529] Fix serialization issue (#2263)

---
 src/lang/reflection.cc              | 27 +++------------------------
 src/relay/ir/op.cc                  |  2 +-
 tests/python/relay/test_ir_nodes.py | 21 +++++++++++++++++++++
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 5197645026eb..86a11a7e5b42 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -9,6 +9,7 @@
 #include <tvm/node/container.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
 #include <dmlc/json.h>
 #include <dmlc/memory_io.h>
 #include <string>
@@ -25,34 +26,12 @@ ::dmlc::Registry<NodeFactoryReg>* NodeFactoryReg::Registry() {
 }
 
 inline std::string Type2String(const Type& t) {
-  if (t.code()  ==Type::Handle) return "handle";
-  std::ostringstream os;
-  os << t;
-  return os.str();
+  return runtime::TVMType2String(Type2TVMType(t));
 }
 
 
 inline Type String2Type(std::string s) {
-  std::istringstream is(s);
-  halideir_type_code_t code = Type::Int;
-  if (s.substr(0, 3) == "int") {
-    code = Type::Int; s = s.substr(3);
-  } else if (s.substr(0, 4) == "uint") {
-    code = Type::UInt; s = s.substr(4);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s == "handle") {
-    return Handle();
-  } else {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  int bits = 32, lanes = 1;
-  if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  return Type(code, bits, lanes);
+  return TVMType2Type(runtime::String2TVMType(s));
 }
 
 
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
index d0ae57bb01e1..bc9955251a7e 100644
--- a/src/relay/ir/op.cc
+++ b/src/relay/ir/op.cc
@@ -140,7 +140,7 @@ TVM_REGISTER_API("relay.op._Register")
 
 NodePtr<Node> CreateOp(const std::string& name) {
   auto op = Op::Get(name);
-  CHECK(!op.defined()) << "Cannot find op \'" << name << '\'';
+  CHECK(op.defined()) << "Cannot find op \'" << name << '\'';
   return op.node_;
 }
 
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
index 2159dd02de95..e041acca0e0c 100644
--- a/tests/python/relay/test_ir_nodes.py
+++ b/tests/python/relay/test_ir_nodes.py
@@ -2,6 +2,7 @@
 import tvm
 from tvm import relay
 from tvm.expr import *
+from tvm.relay import op
 from tvm.relay.ir_pass import graph_equal
 
 
@@ -209,6 +210,24 @@ def test_tuple_get_item():
     check_json_roundtrip(get)
 
 
+def test_op():
+    add = op.op.get("add")
+    check_json_roundtrip(add)
+
+
+def test_conv2d_attrs():
+    data = relay.var('data', shape=(1, 3, 224, 224))
+    param = relay.var('param', shape=(64, 3, 7, 7))
+    out = op.nn.conv2d(
+        data,
+        param,
+        strides=(2, 2),
+        padding=(3, 3),
+        channels=64,
+        kernel_size=(7, 7))
+    check_json_roundtrip(out)
+
+
 if __name__ == "__main__":
     test_bad_constructor()
     test_span()
@@ -226,3 +245,5 @@ def test_tuple_get_item():
     test_let()
     test_if()
     test_tuple_get_item()
+    test_op()
+    test_conv2d_attrs()

From f1279dc7fd0bc0ba4b10454ba27d7fa60baf5533 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <apivovarov@gmail.com>
Date: Mon, 10 Dec 2018 22:02:58 -0800
Subject: [PATCH 484/529] Allow long type values in shape list (#1806)

* Allow long type values in shape list

* Update build_module.py
---
 nnvm/python/nnvm/compiler/build_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index 6fab4460b427..b04d49478830 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -251,8 +251,8 @@ def build(graph, target=None, shape=None, dtype="float32",
         if not isinstance(shape, dict):
             raise TypeError("require shape to be dict")
         for value in shape.values():
-            if not all(isinstance(x, int) for x in value):
-                raise TypeError("shape value must be int iterator")
+            if not all(isinstance(x, tvm._ffi.base.integer_types) for x in value):
+                raise TypeError("shape value must be Integer types iterator")
 
         cfg = BuildConfig.current
         graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)

From a518a05105cc7874be6971b66f6fa1a597b6a66b Mon Sep 17 00:00:00 2001
From: Ruslan Baratov <ruslan_baratov@yahoo.com>
Date: Tue, 11 Dec 2018 17:42:21 +0000
Subject: [PATCH 485/529] Fix misprint (#2272)

---
 python/tvm/contrib/mps.py    | 2 +-
 python/tvm/contrib/nnpack.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/contrib/mps.py b/python/tvm/contrib/mps.py
index 43b3b9fb48db..86532f72153c 100644
--- a/python/tvm/contrib/mps.py
+++ b/python/tvm/contrib/mps.py
@@ -1,4 +1,4 @@
-"""External function interface to MPS libraroes."""
+"""External function interface to MPS libraries."""
 from __future__ import absolute_import as _abs
 from .. import api as _api
 from .. import intrin as _intrin
diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py
index 98367b4ef04e..9fd0e7ed2cba 100644
--- a/python/tvm/contrib/nnpack.py
+++ b/python/tvm/contrib/nnpack.py
@@ -1,4 +1,4 @@
-"""External function interface to NNPACK libraroes."""
+"""External function interface to NNPACK libraries."""
 from __future__ import absolute_import as _abs
 
 from .. import api as _api

From bbf441be4391e9e1df9dff279faca6ff18558518 Mon Sep 17 00:00:00 2001
From: Sean <wonderxboy@gmail.com>
Date: Tue, 11 Dec 2018 11:42:47 -0600
Subject: [PATCH 486/529] correct mistake in muladd function logic (#2269)

Doesn't make sense to have %1 = mul(%x, %y) computed but never use the result %1
---
 docs/dev/relay_intro.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst
index d3c83590cbb8..dde900a502c1 100644
--- a/docs/dev/relay_intro.rst
+++ b/docs/dev/relay_intro.rst
@@ -54,7 +54,7 @@ shows an example of a function calling another function.
 
    def @muladd(%x, %y, %z) {
      %1 = mul(%x, %y)
-     %2 = add(%x, %z)
+     %2 = add(%1, %z)
      %2
    }
    def @myfunc(%x) {

From 4444e75497c7bb41fc220a65aebea028e90cfb41 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Tue, 11 Dec 2018 21:52:21 -0800
Subject: [PATCH 487/529] [DOC]Remove non-existent parameter doc (#2277)

---
 src/relay/pass/fuse_ops.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 21660decf2fa..79ea3e22b139 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -536,7 +536,6 @@ class GraphPartitioner {
    * \brief Commit fusion operation.
    * \param src The source node.
    * \param sink The termination node.
-   * \tparam group the group to be committed.
    * \note sink must be a post-dominator of src.
    */
   void CommitFuse(IndexedForwardGraph::Node* src,

From 45981a8fd3a16ec96c69f0a3add556ce84c4e70c Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 12 Dec 2018 11:23:10 +0530
Subject: [PATCH 488/529] Testcases of onnx (#2274)

---
 nnvm/python/nnvm/frontend/onnx.py             |   6 +-
 .../python/frontend/onnx/test_forward.py      | 168 +++++++++++++++++-
 2 files changed, 166 insertions(+), 8 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index 92033a31da60..ad0acc31a521 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -346,9 +346,9 @@ class ThresholdedRelu(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get('alpha', 0.0))
-        return _sym.relu(inputs[0] - alpha)
-
+        alpha = float(attr.get('alpha', 1.0))
+        alpha_tensor = _sym.full_like(inputs[0], fill_value=float(alpha))
+        return _sym.elemwise_mul(inputs[0], _sym.greater(inputs[0], alpha_tensor))
 
 class ImageScaler(OnnxOpConverter):
 
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 022dc4a0fd7b..82b5d319f92f 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -10,7 +10,7 @@
 from model_zoo import super_resolution, squeezenet1_1, lenet, resnet18_1_0
 from onnx import helper, TensorProto
 
-def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtype='float32'):
+def get_tvm_output(graph_def, input_data, target, ctx, output_shape=None, output_dtype='float32'):
     """ Generic function to execute and get tvm output"""
 
     sym, params = nnvm.frontend.from_onnx(graph_def)
@@ -47,12 +47,12 @@ def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtyp
     # get outputs
     if isinstance(output_shape, list) and isinstance(output_dtype, list):
         tvm_output_list = []
-        for i, s in enumerate(output_shape):
-            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+        for i, _ in enumerate(output_shape):
+            tvm_output = m.get_output(i)
             tvm_output_list.append(tvm_output.asnumpy())
         return tvm_output_list
     else:
-        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        tvm_output = m.get_output(0)
         return tvm_output.asnumpy()
 
 def get_caffe2_output(model, x, dtype='float32'):
@@ -273,7 +273,7 @@ def test_slice():
     _test_slice_iteration(x, x[:, 0:-1], (0), (-1), (1))
 
 def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
-    indata = np.random.uniform(size=(2, 4, 5, 6)).astype(dtype)
+    indata = np.random.uniform(-1, 1, size=inshape).astype(dtype)
     outdata = outfunc(indata, **npargs)
 
     y = helper.make_node(opname, ['in'], ['out'], **kwargs)
@@ -858,6 +858,154 @@ def test_split():
     verify_split([[1., 2., 3., 4.], [7., 8., 9., 10.]],
                  [[[1., 2.], [7., 8.]], [[3., 4.], [9., 10.]]], [2, 2], 1)
 
+def test_binary_ops():
+    in_shape = (1, 2, 3, 3)
+    dtype = "float32"
+    out_shape = in_shape
+
+    def verify_binary_ops(op, x, y, out_np, broadcast=None):
+        if broadcast is None:
+            z = helper.make_node(op, ['in1', 'in2'], ['out'])
+        else:
+            z = helper.make_node(op, ['in1', 'in2'], ['out'], broadcast=1)
+        graph = helper.make_graph([z],
+                                   '_test',
+                                  inputs = [helper.make_tensor_value_info("in1",
+                                                TensorProto.FLOAT, list(in_shape)),
+                                            helper.make_tensor_value_info("in2",
+                                                TensorProto.FLOAT, list(in_shape))],
+                                  outputs = [helper.make_tensor_value_info("out",
+                                                TensorProto.FLOAT, list(out_shape))])
+        model = helper.make_model(graph, producer_name='_test')
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(model, [x, y], target, ctx)
+            tvm.testing.assert_allclose(out_np, tvm_out)
+
+    x = np.random.uniform(size=in_shape).astype(dtype)
+    y = np.random.uniform(size=in_shape).astype(dtype)
+    z = np.random.uniform(size=(3,)).astype(dtype)
+    verify_binary_ops("Add",x, y, x + y, broadcast=None)
+    verify_binary_ops("Add", x, z,  x + z, broadcast=True)
+    verify_binary_ops("Sub", x, y, x - y, broadcast=None)
+    verify_binary_ops("Sub", x, z, x - z, broadcast=True)
+    verify_binary_ops("Mul",x, y, x * y, broadcast=None)
+    verify_binary_ops("Mul", x, z,  x * z, broadcast=True)
+    verify_binary_ops("Div", x, y, x / y, broadcast=None)
+    verify_binary_ops("Div", x, z, x / z, broadcast=True)
+    verify_binary_ops("Sum", x, y, x + y, broadcast=None)
+
+def test_single_ops():
+    in_shape = (1, 2, 3, 3)
+    dtype = "float32"
+    out_shape = in_shape
+
+    def verify_single_ops(op, x, out_np):
+        z = helper.make_node(op, ['in1'], ['out'])
+        graph = helper.make_graph([z],
+                                   '_test',
+                                  inputs = [helper.make_tensor_value_info("in1",
+                                                TensorProto.FLOAT, list(in_shape)),],
+                                  outputs = [helper.make_tensor_value_info("out",
+                                                TensorProto.FLOAT, list(out_shape))])
+        model = helper.make_model(graph, producer_name='_test')
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(model, [x], target, ctx)
+            tvm.testing.assert_allclose(out_np, tvm_out)
+
+    x = np.random.uniform(size=in_shape).astype(dtype)
+    verify_single_ops("Neg",x, -x)
+    verify_single_ops("Abs",x, np.abs(x))
+    verify_single_ops("Reciprocal",x, 1/x)
+    verify_single_ops("Sqrt",x, np.sqrt(x))
+    verify_single_ops("Relu",x, np.maximum(x, 0))
+    verify_single_ops("Exp",x, np.exp(x))
+    verify_single_ops("Log",x, np.log(x))
+    verify_single_ops("Log",x, np.log(x))
+    verify_single_ops("Tanh",x, np.tanh(x))
+    verify_single_ops("Sigmoid",x, 1 / (1 + np.exp(-x)))
+    verify_single_ops("Softsign",x, x / (1 + np.abs(x)))
+    verify_single_ops("SoftPlus",x, np.log(1 + np.exp(x)))
+
+def test_leaky_relu():
+    def leaky_relu_x(x, alpha):
+        return np.where(x >= 0, x, x * alpha)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              leaky_relu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'LeakyRelu',
+                              {'alpha': 0.25})
+
+def test_elu():
+    def elu_x(x, alpha):
+        return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              elu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'Elu',
+                              {'alpha': 0.25})
+
+def test_selu():
+    def selu_x(x, alpha, gamma):
+        return gamma * np.where(x > 0, x, alpha * (np.exp(x) - 1.0))
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              selu_x,
+                              {'alpha': 0.25, 'gamma': 0.3},
+                              'float32',
+                              'Selu',
+                              {'alpha': 0.25, 'gamma': 0.3})
+
+def test_ThresholdedRelu():
+    def ThresholdedRelu_x(x, alpha):
+        out_np = np.clip(x, alpha, np.inf)
+        out_np[out_np == alpha] = 0
+        return out_np
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ThresholdedRelu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'ThresholdedRelu',
+                              {'alpha': 0.25})
+
+def test_ScaledTanh():
+    def ScaledTanh_x(x, alpha, beta):
+        return alpha * np.tanh(beta * x)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ScaledTanh_x,
+                              {'alpha': 0.25, 'beta': 0.3},
+                              'float32',
+                              'ScaledTanh',
+                              {'alpha': 0.25, 'beta': 0.3})
+
+def test_ParametricSoftplus():
+    def ParametricSoftplus_x(x, alpha, beta):
+        return alpha * np.log(np.exp(beta * x) + 1)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ParametricSoftplus_x,
+                              {'alpha': 0.25, 'beta': 0.3},
+                              'float32',
+                              'ParametricSoftplus',
+                              {'alpha': 0.25, 'beta': 0.3})
+
+def test_Scale():
+    def Scale_x(x, scale):
+        return scale * x
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              Scale_x,
+                              {'scale': 0.25},
+                              'float32',
+                              'Scale',
+                              {'scale': 0.25})
+
+def test_LogSoftmax():
+    _test_onnx_op_elementwise((1, 4),
+                              topi.testing.log_softmax_python,
+                              {},
+                              'float32',
+                              'LogSoftmax',
+                              {'axis': 1})
+
 if __name__ == '__main__':
     # verify_super_resolution_example()
     # verify_squeezenet1_1()
@@ -889,3 +1037,13 @@ def test_split():
     test_reduce_sum()
     test_reduce_mean()
     test_split()
+    test_binary_ops()
+    test_single_ops()
+    test_leaky_relu()
+    test_elu()
+    test_selu()
+    test_ThresholdedRelu()
+    test_ScaledTanh()
+    test_ParametricSoftplus()
+    test_Scale()
+    test_LogSoftmax()

From 75b72c298fb6c89d3b7d54a782d1a5c1c1e75318 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfu.chen@icloud.com>
Date: Wed, 12 Dec 2018 13:54:39 +0800
Subject: [PATCH 489/529] Fix a issue when running with graph_runtime_debug in
 python (#2271)

* fix a issue when running with graph_runtime_debug in python;

* add support to `debug_get_output` in python;

* comply with the linter;
---
 python/tvm/contrib/debugger/debug_result.py  | 15 +++++++++++
 python/tvm/contrib/debugger/debug_runtime.py | 28 ++++++++++++++++++++
 python/tvm/contrib/graph_runtime.py          | 12 ++-------
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 5b563c86e6e4..101af6887c47 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -93,6 +93,21 @@ def get_graph_node_dtypes(self):
         """
         return self._dtype_list
 
+    def get_output_tensors(self):
+        """Dump the outputs to a temporary folder, the tensors are in numpy format
+        """
+        eid = 0
+        order = 0
+        output_tensors = {}
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                order += time[0]
+                key = node['name'] + "_" + str(j)
+                output_tensors[key] = self._output_tensor_list[eid]
+                eid += 1
+        return output_tensors
+
     def dump_output_tensor(self):
         """Dump the outputs to a temporary folder, the tensors are in numpy format
         """
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index 6642a8bdc822..d38ee6cf7982 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -173,6 +173,34 @@ def _run_debug(self):
             for j in range(num_outputs):
                 out_tensor = self._get_output_by_layer(i, j)
                 self.debug_datum._output_tensor_list.append(out_tensor)
+
+    def debug_get_output(self, node, out):
+        """Run graph upto node and get the output to out
+
+        Parameters
+        ----------
+        node : int / str
+            The node index or name
+
+        out : NDArray
+            The output array container
+        """
+        ret = None
+        if isinstance(node, str):
+            output_tensors = self.debug_datum.get_output_tensors()
+            try:
+                ret = output_tensors[node]
+            except:
+                node_list = output_tensors.keys()
+                raise RuntimeError("Node " + node + " not found, available nodes are: "
+                                   + str(node_list) + ".")
+        elif isinstance(node, int):
+            output_tensors = self.debug_datum._output_tensor_list
+            ret = output_tensors[node]
+        else:
+            raise RuntimeError("Require node index or name only.")
+        return ret
+
     def run(self, **input_dict):
         """Run forward execution of the graph with debug
 
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 1ba402e20e7e..0d62a04a5571 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -112,10 +112,6 @@ def __init__(self, module):
         self._get_output = module["get_output"]
         self._get_input = module["get_input"]
         self._get_num_outputs = module["get_num_outputs"]
-        try:
-            self._debug_get_output = module["debug_get_output"]
-        except AttributeError:
-            pass
         self._load_params = module["load_params"]
 
     def set_input(self, key=None, value=None, **params):
@@ -209,12 +205,8 @@ def debug_get_output(self, node, out):
         out : NDArray
             The output array container
         """
-        if hasattr(self, '_debug_get_output'):
-            self._debug_get_output(node, out)
-        else:
-            raise RuntimeError(
-                "Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
-        return out
+        raise NotImplementedError(
+            "Please use debugger.debug_runtime as graph_runtime instead.")
 
     def load_params(self, params_bytes):
         """Load parameters from serialized byte array of parameter dict.

From 06940eea71894b672ee1a9e8416aa9ea2860f486 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Wed, 12 Dec 2018 11:25:17 +0530
Subject: [PATCH 490/529] [FRONTEND][TENSORFLOW] Bugfix (#2267)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index c8db662152e9..9c1290bedb6b 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -1124,8 +1124,8 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
                         tensor_value.tensor_shape)]
             elif '_output_shapes' in attr:
                 self._output_shapes[node.name] = \
-                    [tensor_util.TensorShapeProtoToList(shape) \
-                    for shape in attr['_output_shapes']]
+                    [tensor_util.TensorShapeProtoToList(tshape) \
+                    for tshape in attr['_output_shapes']]
             elif shape:
                 # Keep the list indexable to avoid key error.
                 # Actual value will be filled after node creation.

From 55c9efd9effed75f2de6e1b721156c116394afa3 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 13 Dec 2018 05:42:49 +0800
Subject: [PATCH 491/529] [AUTOTVM] Use range in AnnotateSpace to fix JSON
 serialization (#2278)

---
 python/tvm/autotvm/task/space.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index 32bd66b6c12d..3fb02c6190cf 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -423,7 +423,7 @@ def __init__(self, axes, policy, **kwargs):
         elif policy == 'locate_cache':
             self.num_axis = len(axes)
             num_anchor = kwargs["num_anchor"]
-            self.anns = list(itertools.combinations(np.arange(self.num_axis), num_anchor))
+            self.anns = list(itertools.combinations(range(self.num_axis), num_anchor))
             self.entities = [AnnotateEntity(x) for x in self.anns]
         else:  # none, vec, unroll, try_vec, try_unroll, try_vec_unroll, ...
             anns = policy.replace('try', 'none').split('_')

From 07bc51c1bbd53725dbbf342ee1577aadbe14bb3a Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Fri, 14 Dec 2018 02:18:55 +0800
Subject: [PATCH 492/529] typo: Xlinx => Xilinx (#2283)

typo: Xlinx => Xilinx
---
 python/tvm/contrib/sdaccel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/sdaccel.py b/python/tvm/contrib/sdaccel.py
index 0f89911dbdad..6bc246ff1751 100644
--- a/python/tvm/contrib/sdaccel.py
+++ b/python/tvm/contrib/sdaccel.py
@@ -36,7 +36,7 @@ def compile_vhls(kernel_info, device_name):
         platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
 
     if platform is None:
-        raise RuntimeError("No Xlinx device specified.")
+        raise RuntimeError("No Xilinx device specified.")
 
     tmp_xo_files = []
     for funcname, code  in kernel_info:

From 7b06e3851553e2e05f251c15221a72a4dbf98589 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Thu, 13 Dec 2018 10:21:36 -0800
Subject: [PATCH 493/529] [BUGFIX] [Hybrid Script] fix in-correct value index
 in hybrid script (#2268)

---
 python/tvm/hybrid/parser.py                 | 31 ++++++-------
 tests/python/unittest/test_hybrid_script.py | 48 +++++++++++++++++++--
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index 539115d8b6f4..ba10dd8dde3c 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -35,20 +35,21 @@ class HybridParser(ast.NodeVisitor):
 
 
     _binop_maker = {
-        ast.Add   : operator.add,
-        ast.Sub   : operator.sub,
-        ast.Mult  : operator.mul,
-        ast.Div   : operator.div if sys.version_info[0] == 2 else operator.truediv,
-        ast.Mod   : operator.mod,
-        ast.BitOr : operator.or_,
-        ast.BitAnd: operator.and_,
-        ast.BitXor: operator.xor,
-        ast.Gt    : operator.gt,
-        ast.GtE   : operator.ge,
-        ast.Lt    : operator.lt,
-        ast.LtE   : operator.le,
-        ast.Eq    : operator.eq,
-        ast.NotEq : operator.ne,
+        ast.Add     : operator.add,
+        ast.Sub     : operator.sub,
+        ast.Mult    : operator.mul,
+        ast.Div     : operator.div if sys.version_info[0] == 2 else operator.truediv,
+        ast.FloorDiv: operator.div if sys.version_info[0] == 2 else operator.truediv,
+        ast.Mod     : operator.mod,
+        ast.BitOr   : operator.or_,
+        ast.BitAnd  : operator.and_,
+        ast.BitXor  : operator.xor,
+        ast.Gt      : operator.gt,
+        ast.GtE     : operator.ge,
+        ast.Lt      : operator.lt,
+        ast.LtE     : operator.le,
+        ast.Eq      : operator.eq,
+        ast.NotEq   : operator.ne,
         ast.And   : _all,
         ast.Or    : _any,
     }
@@ -237,7 +238,7 @@ def visit_Subscript(self, node):
         if isinstance(node.value, ast.Name):
             array = node.value.id
             _buf = self._get_buffer_from_id(array)
-            return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, 0)
+            return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, _buf.value_index)
 
         _internal_assert(isinstance(node.value, ast.Attribute), \
                          "Only variable and attribute's subscript supported so far")
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index c718fc66899a..7efbbe43ee21 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -1,4 +1,4 @@
-import tvm, inspect, sys, traceback, numpy, nose
+import tvm, inspect, sys, traceback, numpy, nose, types
 from tvm.hybrid import script
 from tvm.hybrid.intrin import HYBRID_GLOBALS
 
@@ -11,6 +11,10 @@ def tvm_val_2_py_val(val):
         return val.value
 
     ctx = tvm.context(target, 0)
+    op = None
+
+    outs = func(*args)
+    op = outs[0].op if isinstance(outs, list) else outs.op
 
     emu_args = []
     nd_args = []
@@ -24,8 +28,6 @@ def tvm_val_2_py_val(val):
             emu_args.append(tvm_val_2_py_val(i))
             nd_args.append(emu_args[-1])
 
-    outs = func(*args)
-    op = outs[0].op if isinstance(outs, list) else outs.op
     sch = tvm.create_schedule(op)
     module = tvm.build(sch, args + (outs if isinstance(outs, list) else [outs]), target=target)
     assert module
@@ -425,10 +427,12 @@ def downstream(a):
         for i in range(20):
             b[i] = a[i] * i
         return b
+
     
     a = tvm.placeholder((20, ), 'float32')
     b = downstream(a)
     c = tvm.compute((20, ), lambda x: b[x] + 1.0)
+
     sch = tvm.create_schedule(c.op)
     module = tvm.build(sch, [a, c])
     assert module
@@ -469,6 +473,40 @@ def add_something(a, b):
 
     tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
 
+def test_value_index():
+    @tvm.hybrid.script
+    def kernel_a(a):
+        b = output_tensor((16, ), 'int32')
+        c = output_tensor((4, 4), 'int32')
+        for i in range(16):
+            b[i] = a[i] + 2
+            c[i // 4, i % 4] = a[i] + 1
+        return b, c
+
+    @tvm.hybrid.script
+    def kernel_b(b, a):
+        c = output_tensor((4, 4), 'int32')
+        for i in range(4):
+            for j in range(4):
+                c[i, j] = a[i * 4 + j] * b[i, j]
+        return c
+
+    a = tvm.placeholder((16, ), 'int32')
+    b, c = kernel_a(a)
+    d = kernel_b(c, b)
+    sch = tvm.create_schedule(d.op)
+    module = tvm.build(sch, [a, d])
+    assert module
+
+    np_a = numpy.arange(16).astype('int32')
+    np_b, np_c = kernel_a(np_a)
+    ref = kernel_b(np_c, np_b)
+
+    res = tvm.ndarray.array(numpy.zeros((4, 4)).astype('int32'))
+    module(tvm.ndarray.array(np_a), res)
+    tvm.testing.assert_allclose(res.asnumpy(), ref)
+
+
 
 if __name__ == "__main__":
     test_outer_product()
@@ -479,9 +517,11 @@ def add_something(a, b):
     test_math_intrin()
     test_non_zero()
     test_allocate()
-    #test_inplace()
     test_upstream()
     test_downstream()
     test_const_param()
+    test_value_index()
+    # TODO:
+    # test_inplace()
 
 

From 85dca0f37ddb0e45e6a086ede94bc0454330660e Mon Sep 17 00:00:00 2001
From: Alexey Romanov <alexey.v.romanov@gmail.com>
Date: Thu, 13 Dec 2018 21:22:14 +0300
Subject: [PATCH 494/529] [FRONTEND][TENSORFLOW] Support Unstack and Split
 (#2105)

---
 nnvm/python/nnvm/frontend/tensorflow.py       | 106 +++++++++++++++---
 .../frontend/tensorflow/test_forward.py       |  93 +++++++--------
 2 files changed, 136 insertions(+), 63 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 9c1290bedb6b..10f23a49b5de 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -36,6 +36,7 @@ def __call__(self, inputs, attrs, *args):
         self._ignores.append('_node_name')
         self._ignores.append('is_training')
         self._ignores.append('_target_layout')
+        self._ignores.append('_input_0d_mismatch')
         # Retain the names
         try:
             attrs['name'] = attrs['_node_name']
@@ -319,8 +320,7 @@ def _impl(inputs, attr, params):
         dim_input = inputs.pop(1)
         axis = params[dim_input.list_output_names()[0]]
         params.pop(dim_input.list_output_names()[0])
-        return AttrCvt(op_name="expand_dims", ignores=['Tdim'],
-                       extras={'axis': axis.asnumpy()[0]})(inputs, attr)
+        return _expand_dims_0d_aware(inputs[0], attr, axis=axis.asnumpy()[0])
     return _impl
 
 def _resize_bilinear():
@@ -383,7 +383,7 @@ def _impl(inputs, attr, params):
 def _pack():
     def _impl(inputs, attr, params):
         axis = int(attr["axis"])
-        inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs]
+        inputs_reshaped = [_expand_dims_0d_aware(i, attr, axis=axis, num_newaxis=1) for i in inputs]
         return _sym.concatenate(*inputs_reshaped, axis=axis, name=attr["_node_name"])
 
     return _impl
@@ -787,15 +787,64 @@ def _impl(inputs, attr, params):
         )(inputs, attr)
     return _impl
 
-def _split():
+def _split(has_size_vector):
+    # TF documentation https://www.tensorflow.org/api_docs/python/tf/split
     def _impl(inputs, attr, params):
-        axis = params.pop(inputs[0].list_output_names()[0])
-        return AttrCvt(
-            op_name="split", ignores=['T'],
-            transforms={'num_split': 'indices_or_sections'},
-            extras={'axis': axis.asnumpy()[0]})(inputs[1], attr)
+        try:
+            # order and number of inputs are different:
+            # if has_size_vector:
+            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split-v
+            # else:
+            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split
+
+            # in addition, `axis` and `num_or_size_splits` can be tensors in TensorFlow,
+            # we can only support constants
+            if has_size_vector:
+                input_node_index = 0
+                input_axis_index = 2
+                size_splits_input_name = inputs[1].list_output_names()[0]
+                size_splits = params[size_splits_input_name].asnumpy()
+                section_beginnings = np.cumsum(size_splits)[:-1]
+                indices_or_sections = tuple(section_beginnings)
+            else:
+                input_node_index = 1
+                input_axis_index = 0
+                indices_or_sections = attr['num_split']
+            input_node = inputs[input_node_index]
+            axis_input_name = inputs[input_axis_index].list_output_names()[0]
+            axis_input_value = params[axis_input_name].asnumpy()[0]
+        except (IndexError, KeyError):
+            raise TypeError( \
+                "Unsupported argument for split: `axis` and `num_or_size_splits` " \
+                "should be constants")
+        return _sym.split(input_node,
+                          indices_or_sections=indices_or_sections,
+                          axis=axis_input_value)
     return _impl
 
+def _unpack():
+    def _impl(inputs, attr, params):
+        input_node = inputs[0]
+        axis = attr['axis']
+        input_shape = attr['_input_shapes'][input_node][0]
+        axis_length = input_shape[axis]
+        if axis_length < 0:
+            raise TypeError("Unstack with unknown axis length")
+        splitted = _sym.split(input_node,
+                              indices_or_sections=axis_length,
+                              axis=axis,
+                              name=attr.get('_node_name', 'unstack'))
+
+        return _sym.Group([_sym.squeeze(split_item, axis=axis) for split_item in splitted])
+    return _impl
+
+def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1):
+    if data in attr['_input_0d_mismatch']:
+        return data if num_newaxis == 1 else \
+            _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis-1)
+
+    return _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis)
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -863,7 +912,9 @@ def _impl(inputs, attr, params):
     'GreaterEqual'                      : _broadcast('greater_equal'),
     'Equal'                             : _broadcast('equal'),
     'NotEqual'                          : _broadcast('not_equal'),
-    'Split'                             : _split(),
+    'Split'                             : _split(False),
+    'SplitV'                            : _split(True),
+    'Unpack'                            : _unpack(),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
@@ -1059,6 +1110,7 @@ def __init__(self):
         self._output_shapes = {}
         self._num_param = 0
         self._num_rnn_layer = False
+        self._outputs_are_0d = {}
 
     def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
         """Construct nnvm nodes from tensorflow  graph definition - GraphDef.
@@ -1114,6 +1166,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
             # Operator name 'Const' is treated as a parameter to build NNVM params dict.
 
             input_shapes = {}
+            input_0d_mismatch = set()
             attr = self._parse_attr(node.attr)
 
             #Variable converted to Const will not have only value attr
@@ -1133,6 +1186,9 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
             else:
                 raise NotImplementedError( \
                     "Please freeze the graph with add_shapes=True")
+            self._outputs_are_0d[node.name] = [ \
+                not shape if isinstance(shape, list) else False \
+                for shape in self._output_shapes[node.name]]
 
             if node.op == "Placeholder":
                 self._nodes[node.name] = _sym.Variable(name=node.name,
@@ -1162,11 +1218,13 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
                 # Fill shapes for all inputs in a list
                 inputs = []
                 for i in node.input:
-                    #ToDo: Some of the tensorflow operators internaly maintain
-                    #execution layers and its output name will the layer number along with
-                    #graph node name.eg: Node name:- 'Model/RNN/cell_0/RnnCell', but the
-                    #output name will be 'Model/RNN/cell_0/RnnCell:0'. In this case,
-                    #the digit has to be ignored.
+                    # Some TensorFlow operators internally maintain execution layers
+                    # and their output name includes the layer number along with
+                    # graph node name. E.g. the node name is 'Model/RNN/cell_0/RnnCell', but the
+                    # output tensor name is 'Model/RNN/cell_0/RnnCell:0'. In this case,
+                    # the number has to be ignored for single-output nodes.
+                    # On the other hand, for multi-output nodes the number is the output index,
+                    # and the lack of the number implies 0.
                     tensor_name = i.split(':')
                     node_name = tensor_name[0]
                     if node_name in self._nodes:
@@ -1174,12 +1232,18 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
                         if len(in_sym.list_output_names()) > 1:
                             tensor_slot = int(tensor_name[1]) if len(tensor_name) > 1 else 0
                             in_sym = in_sym[tensor_slot]
-                            input_shape = (self._output_shapes[node_name])[tensor_slot]
+                            input_shape = self._output_shapes[node_name][tensor_slot]
                         else:
+                            tensor_slot = 0
                             input_shape = self._output_shapes[node_name][0]
                         inputs.append(in_sym)
                         input_shapes[in_sym] = [input_shape]
+                        # This means the node is 1d in NNVM and 0d in TF.
+                        # See `_expand_dims_0d_aware`.
+                        if self._outputs_are_0d[node_name][tensor_slot] and input_shape:
+                            input_0d_mismatch.add(in_sym)
                 attr['_input_shapes'] = input_shapes
+                attr['_input_0d_mismatch'] = input_0d_mismatch
 
                 inputs = self._fix_extranodes(node.op, attr, inputs)
                 op = self._convert_operator(node.op, inputs, attr, graph)
@@ -1207,7 +1271,13 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
         if outputs is None:
             out.append(final_op)
         else:
-            out = [self._nodes[out_name] for out_name in outputs]
+            for out_name in outputs:
+                if ":" in out_name:
+                    out_name, out_num = out_name.split(":")
+                    out_num = int(out_num)
+                    out.append(self._nodes[out_name][out_num])
+                else:
+                    out.append(self._nodes[out_name])
 
         #Add the RNN outputs also with 'head' nodes of the nnvm graph
         if self._num_rnn_layer:
@@ -1215,7 +1285,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
             out.append(out_rnn)
 
         if isinstance(out, list):
-            out = _sym.Group(out)
+            out = _sym.Group(out) if len(out) > 1 else out[0]
 
         return out, self._params
 
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 219ceb5bd379..ed3d0272b4fc 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -124,7 +124,8 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False,
             if no_gpu and device == 'cuda':
                 continue
 
-            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target=device)
+            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node,
+                                       num_output=len(out_node), target=device, out_names=out_name)
             # since the names from tensorflow and nnvm runs are not exactly same, 
             # first len(tf_output) will be compared
             for i in range(len(tf_output)):
@@ -506,14 +507,24 @@ def test_forward_gather():
 # Split
 # -----
 
-def _test_split(in_shape, axis, num_split, dtype):
+def _test_split(in_shape, axis, num_or_size_splits, dtype):
+    np_data = np.random.uniform(-5, 5, size=in_shape).astype(dtype)
+
     """ One iteration of a Split """
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, in_shape, name="in_data")
+    num_split = len(num_or_size_splits) if isinstance(num_or_size_splits, list) else num_or_size_splits
+    tf.split(in_data, num_or_size_splits, axis=axis)
 
-    with tf.Graph().as_default():
-        in_data = tf.placeholder(dtype, in_shape, name="in_data")
-        tf.split(in_data, num_split, axis)
-        np_data = np.random.uniform(size=in_shape).astype(dtype)
-        compare_tf_with_tvm(np_data, 'in_data:0', 'split:0')
+    compare_tf_with_tvm([np_data], ['in_data:0'], [f'split:{n}' for n in range(num_split)])
+
+    # and now test together with concat
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, in_shape, name="in_data")
+    splitted = tf.split(in_data, num_or_size_splits, axis=axis)
+    tf.concat(splitted, axis)
+
+    compare_tf_with_tvm([np_data], 'in_data:0', 'concat:0')
 
 def test_forward_split():
     '''test split layer'''
@@ -523,11 +534,11 @@ def test_forward_split():
     _test_split((6,), 0, 3, 'float32')
     # rank 2
     _test_split((6, 2), 0, 3, 'float32')
-    _test_split((2, 6), 1, 3, 'float32')
+    _test_split((2, 6), 1, 6, 'float32')
     # rank 3
-    _test_split((6, 2, 4), 0, 3, 'float32')
+    _test_split((6, 2, 4), 0, 2, 'int32')
     _test_split((2, 6, 4), 1, 3, 'float32')
-    _test_split((2, 4, 6), 2, 3, 'float32')
+    _test_split((2, 4, 6), 2, 1, 'float32')
     # rank 4
     _test_split((6, 1, 3, 5), 0, 3, 'float32')
     _test_split((1, 6, 3, 5), 1, 3, 'float32')
@@ -538,45 +549,37 @@ def test_forward_split():
     _test_split((1, 6, 3, 5), -3, 3, 'float32')
     _test_split((1, 3, 6, 5), -2, 3, 'float32')
     _test_split((1, 3, 5, 6), -1, 3, 'float32')
+    # size_splits list
+    _test_split((6,), 0, [1, 2, 3], 'int32')
+    _test_split((3, 6, 4), -2, [1, 4, 1], 'float32')
 
 
 #######################################################################
-# Split followed by concat
-# ------------------------
+# Unstack
+# -------
 
-def _test_split_concat(in_shape, axis, num_split, dtype):
-    """ One iteration of a split_concat pair"""
+def _test_unstack(ip_shape, axis, dtype):
+    np_data = np.random.uniform(-5, 5, size=ip_shape).astype(dtype)
 
-    with tf.Graph().as_default():
-        in_data = tf.placeholder(dtype, in_shape, name="in_data")
-        splitted = tf.split(in_data, num_split, axis)
-        tf.concat(splitted, axis)
-        np_data = np.random.uniform(size=in_shape).astype(dtype)
-        compare_tf_with_tvm(np_data, 'in_data:0', 'concat:0')
-
-def test_forward_split_concat():
-    '''test split followed by concat layers'''
-    # rank 1
-    _test_split_concat((3,), 0, 1, 'float32')
-    _test_split_concat((3,), 0, 3, 'float32')
-    _test_split_concat((6,), 0, 3, 'float32')
-    # rank 2
-    _test_split_concat((6, 2), 0, 3, 'float32')
-    _test_split_concat((2, 6), 1, 3, 'float32')
-    # rank 3
-    _test_split_concat((6, 2, 4), 0, 3, 'float32')
-    _test_split_concat((2, 6, 4), 1, 3, 'float32')
-    _test_split_concat((2, 4, 6), 2, 3, 'float32')
-    # rank 4
-    _test_split((6, 1, 3, 5), 0, 3, 'float32')
-    _test_split((1, 6, 3, 5), 1, 3, 'float32')
-    _test_split((1, 3, 6, 5), 2, 3, 'float32')
-    _test_split((1, 3, 5, 6), 3, 3, 'float32')
-    # split along negative axis
-    _test_split((6, 1, 3, 5), -4, 3, 'float32')
-    _test_split((1, 6, 3, 5), -3, 3, 'float32')
-    _test_split((1, 3, 6, 5), -2, 3, 'float32')
-    _test_split((1, 3, 5, 6), -1, 3, 'float32')
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, ip_shape, name="in_data")
+    tf.unstack(in_data, axis=axis)
+
+    compare_tf_with_tvm([np_data], ['in_data:0'], [f'unstack:{n}' for n in range(ip_shape[axis])])
+
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, ip_shape, name="in_data")
+    tf.stack(tf.unstack(in_data, axis=axis), axis=axis)
+
+    compare_tf_with_tvm([np_data], ['in_data:0'], 'stack:0')
+
+def test_forward_unstack():
+    '''test unstack layer'''
+    _test_unstack((6,), 0, 'int32')
+    _test_unstack((2,6), 1, 'float64')
+    # negative axis
+    _test_unstack((1,4), -1, 'int32')
+    _test_unstack((3,6,4), -2, 'float32')
 
 
 #######################################################################
@@ -1139,7 +1142,7 @@ def test_forward_rel_ops():
     test_forward_gather()
     test_forward_stridedslice()
     test_forward_split()
-    test_forward_split_concat()
+    test_forward_unstack()
 
     # Activations
     test_forward_sigmoid()

From 397241e60ed8b6004a660793596d2cc219c16e76 Mon Sep 17 00:00:00 2001
From: Wei Chen <ipondering.weic@gmail.com>
Date: Thu, 13 Dec 2018 20:21:21 -0800
Subject: [PATCH 495/529] [DOC]Update documentation (#2286)

---
 src/runtime/graph/graph_runtime.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index 919e7ee9afe8..d9e6ef18860a 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -351,11 +351,10 @@ class GraphRuntime : public ModuleNode {
   /*! \brief Setup the executors. */
   void SetupOpExecs();
   /*!
-   * \brief Create a executtion function given input.
+   * \brief Create an execution function given input.
    * \param attrs The node attributes.
    * \param args The arguments to the functor, including inputs and outputs.
    * \param num_inputs Number of inputs.
-   * \param dev_type The device type of the tvm_op.
    * \return The created executor.
    */
   std::function<void()> CreateTVMOp(const TVMOpParam& attrs,

From 4ad048a216c4a518fb23699e2fec79e98152c043 Mon Sep 17 00:00:00 2001
From: OKA Naoya <pn11@users.noreply.github.com>
Date: Sat, 15 Dec 2018 02:59:56 +0900
Subject: [PATCH 496/529] [DOC] fix installation doc (#2290)

---
 docs/install/from_source.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 10d86ebe6243..00cb96fe4e19 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -42,14 +42,14 @@ The minimal building requirements are
 - A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
 - CMake 3.5 or higher
 - We highly recommend to build with LLVM to enable all the features.
-- It is possible to build TVM without the LLVM dependency if we only want to use CUDA/OpenCL
-- If we want to use the NNVM compiler, then LLVM is required
+- It is possible to build TVM without the LLVM dependency if you only want to use CUDA/OpenCL
+- If you want to use the NNVM compiler, then LLVM is required
 
 We use cmake to build the library.
 The configuration of tvm can be modified by `config.cmake`.
 
 
-- First, check the cmake in your system, you do not have cmake
+- First, check the cmake in your system. If you do not have cmake,
   you can obtain the latest version from `official website <https://cmake.org/download/>`_
 - First create a build directory, copy the ``cmake/config.cmake`` to the directory.
 

From 946b5aa9ca6de685c5d0cabf9250631255f7644b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 14 Dec 2018 10:00:46 -0800
Subject: [PATCH 497/529] [RELAY] Fix alter_op_layout (#2289)

---
 python/tvm/relay/op/nn/_nn.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index f5f76e6af38a..8180d8b31044 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -107,6 +107,12 @@ def schedule_conv2d(attrs, outs, target):
                 return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
     raise ValueError("No compatible schedule")
 
+
+@reg.register_alter_op_layout("nn.conv2d")
+def alter_op_layout_conv2d(attrs, inputs, tinfos):
+    """Alternate the layout of conv2d"""
+    return None
+
 reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
 

From c287e0c85ef9ef1b6bcff813ab24746cec06445b Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Fri, 14 Dec 2018 13:32:15 -0800
Subject: [PATCH 498/529] [TOPI] NCHWc added input shape 4 condition, intel
 graphics conv2d schedule debugged for inception_v3 workloads (#2265)

---
 nnvm/python/nnvm/top/nn.py                |  9 ++-
 topi/python/topi/intel_graphics/conv2d.py | 83 ++++++++---------------
 2 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 74196c078798..a37a5d7e071e 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-argument
+# pylint: disable=invalid-name, unused-argument, missing-docstring, no-else-return
 """Definition of nn ops"""
 from __future__ import absolute_import
 
@@ -170,8 +170,11 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     out_layout = attrs.get_string("out_layout")
     out_dtype = attrs.get_string("out_dtype")
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
-    in_channel = in_channel_chunk * in_channel_block
+    if layout == "NCHW":
+        _, in_channel, _, _ = get_const_tuple(inputs[0].shape)
+    else:
+        _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+        in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index f6767b68afa1..d712e71410d7 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
 """conv2d schedule on Intel Graphics"""
 
 from __future__ import absolute_import as _abs
@@ -61,7 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 @conv2d_NCHWc.register(["intel_graphics"])
-def _decl_conv2d(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
+def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -126,8 +126,7 @@ def traverse(op):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack_NCHWc(s, op)
 
         scheduled_ops.append(op)
@@ -156,31 +155,30 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
@@ -202,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -224,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
     kernel_L = s.cache_read(kernel, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1
@@ -308,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
 
 @conv2d.register(["intel_graphics"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -368,8 +355,7 @@ def traverse(op):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack(s, op)
 
         scheduled_ops.append(op)
@@ -396,31 +382,30 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op):
     kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
 
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1

From 93ba79208be691e2cfd2481e655549b1161d7ef1 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Sat, 15 Dec 2018 23:29:45 +0530
Subject: [PATCH 499/529] [CI] Golang unit test trigger for Jenkins (#2266)

---
 Jenkinsfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 02f00e42e8fd..f0c11426a078 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -144,6 +144,7 @@ stage('Build') {
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_rust.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_golang.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_unittest.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_integration.sh"
         }

From f2e1bbfce204abd2dd4e7808db99f6704b33d5ad Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Mon, 17 Dec 2018 10:18:21 -0800
Subject: [PATCH 500/529] [RELAY] Support concatenate. (#2298)

---
 nnvm/python/nnvm/to_relay.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py
index 318ff1ee92dd..a168f4fd88d2 100644
--- a/nnvm/python/nnvm/to_relay.py
+++ b/nnvm/python/nnvm/to_relay.py
@@ -364,6 +364,11 @@ def _squeeze(children, attrs, odtype='float32'):
 
     return op.squeeze(children[0], axis)
 
+def _concatenate(children, attrs, odtype='float32'):
+    axis = attrs.get_int('axis', None)
+    return op.concatenate(children, axis)
+
+
 NNVM_OP_2_RELAY_OP = {
     'flatten': _nn_batch_flatten,
     'dense': _dense,
@@ -422,6 +427,7 @@ def _squeeze(children, attrs, odtype='float32'):
     'strided_slice': _strided_slice,
     'split': _split,
     'squeeze': _squeeze,
+    'concatenate': _concatenate,
 }
 
 
@@ -436,7 +442,7 @@ def to_relay(graph, shape_dict, dtype_dict, params):
     shape_dict : dict of str to shape
        The input shape.
 
-    dtype_dict : dict of str to shape
+    dtype_dict : dict of str to str/dtype
        The input shape.
 
     params : dict of str to array

From 0728920b52c5d096a724e14399a0cf1a791129f0 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Mon, 17 Dec 2018 16:18:43 -0800
Subject: [PATCH 501/529] [RELAY] Add broadcast_to operator (#2276)

---
 docs/langref/relay_op.rst             |  1 +
 python/tvm/relay/op/_transform.py     |  1 +
 python/tvm/relay/op/transform.py      | 18 +++++++++++
 src/relay/op/nn/pooling.cc            |  3 +-
 src/relay/op/tensor/transform.cc      | 46 +++++++++++++++++++++++++++
 tests/python/relay/test_op_level10.py | 18 +++++++++++
 6 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index f4a65023ee53..5c3ab8b1ffda 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -124,6 +124,7 @@ This level enables additional math and transform operators.
    tvm.relay.mean
    tvm.relay.prod
    tvm.relay.strided_slice
+   tvm.relay.broadcast_to
 
 
 **Level 5: Vision/Image Operators**
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index c1e71e9133ea..085a8ceed5d1 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -11,6 +11,7 @@
 
 
 _reg.register_schedule("collapse_sum_like", _schedule_reduce)
+_reg.register_schedule("broadcast_to", schedule_broadcast)
 _reg.register_schedule("broadcast_to_like", schedule_broadcast)
 _reg.register_schedule("expand_dims", schedule_broadcast)
 _reg.register_schedule("squeeze", schedule_injective)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index f536e75fd9b4..2791eaf7d9db 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -267,6 +267,24 @@ def where(condition, x, y):
     """
     return _make.where(condition, x, y)
 
+def broadcast_to(data, shape):
+    """Return an scalar value array with the same type, broadcast to
+    the provided shape.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    shape : shape
+        Provide the shape to broadcast to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.broadcast_to(data, shape)
 
 def broadcast_to_like(data, broadcast_type):
     """Return an scalar value array with the same shape and type as the input array.
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 6233e6d51776..6cf37668cab5 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -258,8 +258,7 @@ bool GlobalPool2DRel(const Array<Type>& types,
                      const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
-
-  CHECK(data != nullptr);
+  if (data == nullptr) { return false; }
   const auto dshape = data->shape;
   CHECK_NE(dshape.size(), 0);
   CHECK_GE(dshape.size(), 2U)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index fcf7f6fe3299..eb8b4f13fb3f 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1084,6 +1084,52 @@ RELAY_REGISTER_OP("collapse_sum_like")
 .set_attr<FTVMCompute>("FTVMCompute", CollapseSumLikeCompute)
 .set_attr<TOpPattern>("TOpPattern", kCommReduce);
 
+// BroadCastTo: <A, B> -> B where BroadCast(A, B) = B
+bool BroadCastToRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  auto ioattrs = attrs.as<InitOpAttrs>();
+  CHECK(ioattrs);
+  auto intt = types[0].as<TensorTypeNode>();
+  if (intt == nullptr) { return false; }
+  auto type = TensorTypeNode::make(ioattrs->shape, intt->dtype);
+  reporter->Assign(types[1], type);
+  return true;
+}
+
+Expr MakeBroadCastTo(Expr data, Array<IndexExpr> shape) {
+  static const Op& op = Op::Get("broadcast_to");
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+Array<Tensor> BroadCastToCompute(const Attrs& attrs,
+                                 const Array<Tensor>& inputs,
+                                 const Type& out_type,
+                                 const Target& target) {
+  auto ioattrs = attrs.as<InitOpAttrs>();
+  CHECK(ioattrs != nullptr);
+  return { topi::broadcast_to(inputs[0], ioattrs->shape) };
+}
+
+TVM_REGISTER_API("relay.op._make.broadcast_to")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeBroadCastTo, args, rv);
+  });
+
+RELAY_REGISTER_OP("broadcast_to")
+.describe(R"code(Broadcast the first input to match the shape argument.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(4)
+.add_type_rel("BroadCastTo", BroadCastToRel)
+.set_attr<FTVMCompute>("FTVMCompute", BroadCastToCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
 // BroadCastToLike: <A, B> -> B where BroadCast(A, B) = B
 bool BroadCastToLikeRel(const Array<Type>& types,
                         int num_inputs,
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 5d65691a2ad5..2c0ed73a7535 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -25,6 +25,24 @@ def test_collapse_sum_like():
             op_res = intrp.evaluate(func)(x, y)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
+def test_broadcast_to():
+    shape = (4, 1, 6)
+    shape_like = (3, 4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    z = relay.broadcast_to(x, shape=shape_like)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
+
+    func = relay.Function([x], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    ref_res = np.broadcast_to(x, shape_like)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
 def test_broadcast_to_like():
     shape = (4, 1, 6)
     shape_like = (3, 4, 5, 6)

From 1739cf48df772a34d9911e3ed7b87aca54ada3d8 Mon Sep 17 00:00:00 2001
From: Sam Skalicky <samskalicky@gmail.com>
Date: Mon, 17 Dec 2018 23:42:59 -0800
Subject: [PATCH 502/529] added error checking to loading symbol json (#2301)

---
 nnvm/src/pass/saveload_json.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc
index 485b1417a493..4a0706b6d501 100644
--- a/nnvm/src/pass/saveload_json.cc
+++ b/nnvm/src/pass/saveload_json.cc
@@ -209,10 +209,12 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
   for (const JSONNode &n : jgraph.nodes) {
     n.node->inputs.reserve(n.inputs.size());
     for (const JSONNode::Entry &e : n.inputs) {
+      CHECK(e.node_id < jgraph.nodes.size());
       n.node->inputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
     }
     n.node->control_deps.reserve(n.control_deps.size());
     for (uint32_t nid : n.control_deps) {
+      CHECK(nid < jgraph.nodes.size());
       n.node->control_deps.push_back(jgraph.nodes[nid].node);
     }
     for (const JSONGraph &subgraph : n.subgraphs) {
@@ -233,11 +235,13 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
   }
   // consistency check
   for (uint32_t nid : jgraph.arg_nodes) {
+    CHECK(nid < jgraph.nodes.size());
     CHECK(jgraph.nodes[nid].node->is_variable());
   }
   std::shared_ptr<Symbol> symbol = std::make_shared<Symbol>();
   symbol->outputs.reserve(jgraph.heads.size());
   for (const JSONNode::Entry &e : jgraph.heads) {
+    CHECK(e.node_id < jgraph.nodes.size());
     symbol->outputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
   }
   return symbol;

From 94ff83a6f21619e86f0620e751bfe9e24ca80b6e Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Tue, 18 Dec 2018 12:26:19 -0800
Subject: [PATCH 503/529] [Relay][doc] Update the description of returns in
 mxnet.py (#2309)

---
 python/tvm/relay/frontend/mxnet.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 77e97d26efe0..f61c65bbaf6a 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -343,7 +343,7 @@ def _mx_lrn(inputs, attrs):
 
 
 def _from_mxnet_impl(symbol, shape_dict, dtype_info):
-    """Convert mxnet symbol to nnvm implementation.
+    """Convert mxnet symbol to compatible relay Function.
 
     Reconstruct a relay Function by traversing the mxnet symbol.
 
@@ -361,15 +361,14 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info):
 
     Returns:
     -------
-    nnvm.sym.Symbol
-        Converted symbol
+    func : tvm.relay.Function
+        Converted relay Function
     """
     assert symbol is not None
     jgraph = json.loads(symbol.tojson())
     jnodes = jgraph["nodes"]
     node_map = {}
 
-
     for nid, node in enumerate(jnodes):
         children = [node_map[e[0]][e[1]] for e in node["inputs"]]
         attrs = StrAttrsDict(node.get("attrs", {}))
@@ -444,8 +443,8 @@ def from_mxnet(symbol,
 
     Returns
     -------
-    sym : nnvm.Symbol
-        Compatible nnvm symbol
+    sym : tvm.relay.Function
+        Compatible relay Function
 
     params : dict of str to tvm.NDArray
         The parameter dict to be used by nnvm

From 6fb304d85c52e667666d953d1ff53575b48287a1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 18 Dec 2018 14:01:10 -0800
Subject: [PATCH 504/529] [PASS] Avoid recursion in FoldScaleAxis (#2299)

* [PASS] Avoid recursion in FoldScaleAxis

* remove GetForwardScale
---
 src/relay/pass/fold_scale_axis.cc             | 72 ++++---------------
 src/relay/pass/pass_util.h                    |  9 +++
 src/relay/pass/pattern_util.h                 | 51 -------------
 src/relay/pass/util.cc                        | 62 ++++++++++++++++
 .../python/relay/test_pass_fold_scale_axis.py | 12 ++--
 5 files changed, 92 insertions(+), 114 deletions(-)

diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
index 9e9dd0604916..760a226a2fac 100644
--- a/src/relay/pass/fold_scale_axis.cc
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -246,44 +246,9 @@ class ForwardPrep : private ExprVisitor {
 // Per operator defs for FScaleAxisForward
 //----------------------------------------------
 
-// Helper functions
-Expr GetForwardScale(const Expr& expr, AxesSet out) {
-  static const Op& multiply = Op::Get("multiply");
-  static const auto& fprep = Op::GetAttr<FForwardPrep>("FScaleAxisForwardPrep");
-
-  const CallNode* call = expr.as<CallNode>();
-  if (!call) return NullValue<Expr>();
-  auto f = fprep.get(call->op, nullptr);
-
-  if (call->op.same_as(multiply)) {
-    const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
-    const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
-    if (MatchBroadcastToLeftAxes(tlhs, trhs, out)) {
-      return call->args[1];
-    } else if (MatchBroadcastToLeftAxes(trhs, tlhs, out)) {
-      return call->args[0];
-    } else {
-      return NullValue<Expr>();
-    }
-  } else if (f != nullptr) {
-    Array<AxesSet> in_axes = f(GetRef<Call>(call), out);
-    for (size_t i = 0; i < call->args.size(); i++) {
-      auto scale = GetForwardScale(call->args[i], in_axes[i]);
-      if (scale.defined()) {
-        return scale;
-      }
-    }
-  }
-  return NullValue<Expr>();
-}
-
 // Intermediate operators
 Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
-  Expr scale = GetForwardScale(call->args[0], out);
-  if (IsPositiveConstant(scale)) {
-    return {out};
-  }
-  return {NullValue<AxesSet>()};
+  return {out};
 }
 
 Expr ReluForwardRewrite(const Call& ref_call,
@@ -391,16 +356,21 @@ Expr MultiplyForwardRewrite(const Call& ref_call,
   Expr lhs = new_args[0];
   Expr rhs = new_args[1];
   auto rnode = make_node<ScaledExprNode>();
-  if (MatchBroadcastToLeftAxes(tlhs, trhs, expected_out_axes, &rhs)) {
+  if (MatchBroadcastToLeftAxes(tlhs, trhs, expected_out_axes, &rhs) &&
+      IsAllPositiveConstant(rhs)) {
     rnode->value = lhs;
     rnode->scale = rhs;
     rnode->axes = expected_out_axes;
-  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, expected_out_axes, &lhs)) {
+    return Expr(rnode);
+  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, expected_out_axes, &lhs) &&
+             IsAllPositiveConstant(lhs)) {
     rnode->value = rhs;
     rnode->scale = lhs;
     rnode->axes = expected_out_axes;
+    return Expr(rnode);
+  } else {
+    return Expr();
   }
-  return Expr(rnode);
 }
 
 RELAY_REGISTER_OP("multiply")
@@ -790,22 +760,6 @@ RELAY_REGISTER_OP("subtract")
 RELAY_REGISTER_OP("subtract")
 .set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
 
-// Find relu in the backward path between multiply and conv2d
-bool FindBackwardRelu(const Expr& expr) {
-  const CallNode* call = expr.as<CallNode>();
-  static const Op& conv2d = Op::Get("nn.conv2d");
-  static const Op& relu = Op::Get("nn.relu");
-
-  if (!call) return false;
-  if (call->op.same_as(relu)) return true;
-  if (call->op.same_as(conv2d)) return false;
-
-  for (size_t i = 0; i < call->args.size(); i++) {
-    if (FindBackwardRelu(call->args[i])) return true;
-  }
-  return false;
-}
-
 // Producer operators
 // Multiply produces the scale-axis pair.
 Expr MultiplyBackwardTransform(const Call& call,
@@ -821,16 +775,16 @@ Expr MultiplyBackwardTransform(const Call& call,
     // NOTE we won't recursively call mutating on scale part.
     // since there  won't be scale chance within scale part.
     Expr rhs = call->args[1];
+    // Only propagate positive scaling.
     if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs) &&
-        (!FindBackwardRelu(call->args[0]) ||
-         IsPositiveConstant(call->args[1]))) {
+        IsAllPositiveConstant(rhs)) {
       return transformer->Transform(call->args[0], lhs_axes, rhs);
     }
   } else if (rhs_axes.defined() && rhs_axes.size() != 0) {
+    // Only propagate positive scaling.
     Expr lhs = call->args[0];
     if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs) &&
-        (!FindBackwardRelu(call->args[1]) ||
-         IsPositiveConstant(call->args[0]))) {
+        IsAllPositiveConstant(lhs)) {
       return transformer->Transform(call->args[1], rhs_axes, lhs);
     }
   }
diff --git a/src/relay/pass/pass_util.h b/src/relay/pass/pass_util.h
index d42494409b53..ddd73901c452 100644
--- a/src/relay/pass/pass_util.h
+++ b/src/relay/pass/pass_util.h
@@ -22,6 +22,15 @@ namespace relay {
 std::unordered_map<const Node*, size_t>
 GetExprRefCount(const Expr& body);
 
+
+/*!
+ * \brief Check if expr is positive constant.
+ * \param expr The expression to be checked.
+ * \return Whether all elements of expr is positive constant.
+ */
+bool IsAllPositiveConstant(const Expr& expr);
+
+
 /*!
  * \brief Substitute var with subst.
  * \param type The type to be substituted.
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 5d76efd0124d..e6e8415bd620 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -190,57 +190,6 @@ Expr MakeConcatenate(Expr data, int axis);
 
 Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
 
-
-template <typename T>
-bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) {
-  CHECK_EQ(tensor->ctx.device_type, kDLCPU);
-  CHECK(tensor->strides == nullptr);
-  CHECK_EQ(tensor->byte_offset, 0);
-  const T* data = static_cast<const T*>(tensor->data);
-  int64_t num_elems = 1;
-  for (int i = 0; i < tensor->ndim; ++i) {
-    num_elems *= tensor->shape[i];
-  }
-
-  for (int64_t i = 0; i < num_elems; i++) {
-    if (*data < value) {
-      return false;
-    }
-    data++;
-  }
-  return true;
-}
-
-
-inline bool IsPositiveConstant(const Expr& expr) {
-  const auto* constant = expr.as<ConstantNode>();
-  if (!constant) return false;
-  const auto& tensor = constant->data;
-  const auto& dtype = tensor->dtype;
-
-  if (dtype.lanes != 1) {
-    // pass
-  } else if (dtype.code == kDLFloat && dtype.bits == 32) {
-    return IsNDArrayAllGreaterEqual<float>(tensor, 0);
-  } else if (dtype.code == kDLFloat && dtype.bits == 64) {
-    return IsNDArrayAllGreaterEqual<double>(tensor, 0);
-  } else if (dtype.code == kDLInt && dtype.bits == 8) {
-    return IsNDArrayAllGreaterEqual<int8_t>(tensor, 0);
-  } else if (dtype.code == kDLInt && dtype.bits == 32) {
-    return IsNDArrayAllGreaterEqual<int32_t>(tensor, 0);
-  } else if (dtype.code == kDLUInt && dtype.bits == 8) {
-    return IsNDArrayAllGreaterEqual<uint8_t>(tensor, 0);
-  } else if (dtype.code == kDLUInt && dtype.bits == 32) {
-    return IsNDArrayAllGreaterEqual<uint32_t>(tensor, 0);
-  }
-
-  LOG(WARNING) << "Unsupported data type (code = " << dtype.code
-               << ", bits = " << dtype.bits << ", lanes = " << dtype.lanes
-               << ")";
-  return false;
-}
-
-
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index 8f7179deea53..b99d975135be 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -146,5 +146,67 @@ GetExprRefCount(const Expr& body) {
   return ExprRefCounter().Get(body);
 }
 
+template <typename T>
+bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) {
+  CHECK_EQ(tensor->ctx.device_type, kDLCPU);
+  CHECK(tensor->strides == nullptr);
+  CHECK_EQ(tensor->byte_offset, 0);
+  const T* data = static_cast<const T*>(tensor->data);
+  int64_t num_elems = 1;
+  for (int i = 0; i < tensor->ndim; ++i) {
+    num_elems *= tensor->shape[i];
+  }
+
+  for (int64_t i = 0; i < num_elems; i++) {
+    if (*data < value) {
+      return false;
+    }
+    data++;
+  }
+  return true;
+}
+
+bool IsAllPositiveConstant(const Expr& expr) {
+  // peel through a few common transform ops.
+  static const auto& expand_dims = Op::Get("expand_dims");
+  static const auto& reshape = Op::Get("reshape");
+  static const auto& transpose = Op::Get("transpose");
+  static const auto& squeeze = Op::Get("squeeze");
+
+  if (const auto* constant = expr.as<ConstantNode>()) {
+    const auto& tensor = constant->data;
+    const auto& dtype = tensor->dtype;
+    if (dtype.lanes != 1) {
+      return false;
+    } else if (dtype.code == kDLFloat && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<float>(tensor, 0);
+    } else if (dtype.code == kDLFloat && dtype.bits == 64) {
+      return IsNDArrayAllGreaterEqual<double>(tensor, 0);
+    } else if (dtype.code == kDLInt && dtype.bits == 8) {
+      return IsNDArrayAllGreaterEqual<int8_t>(tensor, 0);
+    } else if (dtype.code == kDLInt && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<int32_t>(tensor, 0);
+    } else if (dtype.code == kDLUInt && dtype.bits == 8) {
+      return IsNDArrayAllGreaterEqual<uint8_t>(tensor, 0);
+    } else if (dtype.code == kDLUInt && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<uint32_t>(tensor, 0);
+    } else {
+      return false;
+    }
+  } else if (const auto* op = expr.as<CallNode>()) {
+    // tail recursion.
+    if (op->op.same_as(expand_dims) ||
+        op->op.same_as(reshape) ||
+        op->op.same_as(transpose) ||
+        op->op.same_as(squeeze)) {
+      return IsAllPositiveConstant(op->args[0]);
+    } else {
+      return false;
+    }
+  } else {
+    return false;
+  }
+}
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index f42aa7b7b8d0..57cb7c84b10d 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -1,6 +1,9 @@
 from tvm import relay
 import numpy as np
 
+def _get_positive_scale(size):
+    return np.random.uniform(0.5, 1, size=size).astype('float32')
+
 
 def test_fold_fwd_simple():
     """Simple testcase."""
@@ -14,6 +17,7 @@ def before(x, conv_weight, in_bias, in_scale, channels):
                             channels=channels,
                             kernel_size=(3, 3),
                             padding=(1, 1))
+
         return relay.Function(args, y)
 
     def expected(x, conv_weight, in_bias, in_scale, channels):
@@ -37,14 +41,14 @@ def check(shape, channels):
         in_channels = shape[1]
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.const(np.random.uniform(size=(in_channels, 1, 1)).astype('float32'))
-
+        in_scale = relay.const(_get_positive_scale((in_channels, 1, 1)))
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         type_dict = {x.name_hint:x.checked_type for x in y1.params}
         weight = relay.var("weight", type_dict["weight"])
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
         y1_expected = expected(x, weight, in_bias, in_scale, channels)
+
         y1_folded = relay.ir_pass.infer_type(y1_folded)
         y1_expected = relay.ir_pass.infer_type(y1_expected)
         assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
@@ -107,7 +111,7 @@ def check(shape, channels):
         assert in_channels == channels
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.const(np.random.uniform(size=(in_channels,)).astype("float32"))
+        in_scale = relay.const(_get_positive_scale(in_channels,))
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
@@ -141,7 +145,7 @@ def check(shape, channels):
         assert in_channels == channels
         weight = relay.var("weight")
         in_bias = relay.var("in_bias", shape=(in_channels,))
-        in_scale = relay.const(np.random.uniform(size=(in_channels,)).astype("float32"))
+        in_scale = relay.const(_get_positive_scale(size=(in_channels,)))
         y1 = before(x, weight, in_bias, in_scale, channels)
         y1 = relay.ir_pass.infer_type(y1)
         y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)

From 36d6318be1d2a9f00c3cb947bbd62fe7a7951dca Mon Sep 17 00:00:00 2001
From: Yong Wu <55wuyong@163.com>
Date: Tue, 18 Dec 2018 14:36:45 -0800
Subject: [PATCH 505/529] add relay and autotvm in readme (#2312)

---
 src/README.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/README.md b/src/README.md
index dfa7a1d33d22..b0363a04411a 100644
--- a/src/README.md
+++ b/src/README.md
@@ -5,12 +5,14 @@ There can be internal header files within each module that sit in src.
 
 ## Modules
 - common: Internal common utilities.
-- api: API function registration
-- lang: The definition of DSL related data structure
-- arithmetic: Arithmetic expression and set simplification
-- op: The detail implementations about each operation(compute, scan, placeholder)
+- api: API function registration.
+- lang: The definition of DSL related data structure.
+- arithmetic: Arithmetic expression and set simplification.
+- op: The detail implementations about each operation(compute, scan, placeholder).
 - schedule: The operations on the schedule graph before converting to IR.
-- pass: The optimization pass on the IR structure
+- pass: The optimization pass on the IR structure.
 - codegen: The code generator.
-- runtime: Minimum runtime related codes
-- contrib: Contrib extension libraries
+- runtime: Minimum runtime related codes.
+- autotvm: The auto-tuning module.
+- relay: Implementation of Relay. The second generation of NNVM, a new IR for deep learning frameworks.
+- contrib: Contrib extension libraries.

From 19c4ba1d472f84ecbd11563cd285a526168b9d52 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch <andrew@tullo.ch>
Date: Tue, 18 Dec 2018 18:00:42 -0800
Subject: [PATCH 506/529] Bundled interpreter demo (#2297)

---
 apps/bundle_deploy/Makefile       | 39 ++++++++++++++++++
 apps/bundle_deploy/README.md      | 35 ++++++++++++++++
 apps/bundle_deploy/build_model.py | 40 +++++++++++++++++++
 apps/bundle_deploy/bundle.cc      | 47 ++++++++++++++++++++++
 apps/bundle_deploy/demo.cc        | 66 +++++++++++++++++++++++++++++++
 apps/bundle_deploy/runtime.cc     | 17 ++++++++
 6 files changed, 244 insertions(+)
 create mode 100644 apps/bundle_deploy/Makefile
 create mode 100644 apps/bundle_deploy/README.md
 create mode 100644 apps/bundle_deploy/build_model.py
 create mode 100644 apps/bundle_deploy/bundle.cc
 create mode 100644 apps/bundle_deploy/demo.cc
 create mode 100644 apps/bundle_deploy/runtime.cc

diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
new file mode 100644
index 000000000000..0bf1613c8d66
--- /dev/null
+++ b/apps/bundle_deploy/Makefile
@@ -0,0 +1,39 @@
+# Makefile Example to bundle TVM modules.
+TVM_ROOT=$(shell cd ../..; pwd)
+NNVM_PATH=nnvm
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
+PKG_CFLAGS = -std=c++14 -Oz -fPIC\
+	-I${TVM_ROOT}/include\
+	-I${DMLC_CORE}/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+
+PKG_LDFLAGS = -L${TVM_ROOT}/build
+
+build_dir := build
+
+test: $(build_dir)/demo $(build_dir)/bundle.so
+	$(build_dir)/demo $(build_dir)/bundle.so
+
+$(build_dir)/demo: demo.cc
+	@mkdir -p $(@D)
+	$(CXX) $(PKG_CFLAGS) -o $@  $^
+
+# Serialize our graph.json file.
+$(build_dir)/graph.json.cc: $(build_dir)/graph.json
+	xxd -i $^  > $@
+
+# Serialize our params.bin file.
+$(build_dir)/params.bin.cc: $(build_dir)/params.bin
+	xxd -i $^  > $@
+
+$(build_dir)/model.o $(build_dir)/graph.json $(build_dir)/params.bin: build_model.py
+	python $< -o $(build_dir)
+
+# Build our bundle against the serialized bundle.cc API, the runtime.cc API, and
+# the serialized graph.json and params.bin
+$(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model.o $(build_dir)/graph.json.cc $(build_dir)/params.bin.cc
+	@mkdir -p $(@D)
+	$(CXX) $(PKG_CFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS) -shared
+
+clean:
+	rm -r $(build_dir)
diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md
new file mode 100644
index 000000000000..2db8150b2659
--- /dev/null
+++ b/apps/bundle_deploy/README.md
@@ -0,0 +1,35 @@
+How to Bundle TVM Modules
+=========================
+
+This folder contains an example on how to bundle a TVM module (with the required
+interpreter runtime modules such as `runtime::GraphRuntime`, the graph JSON, and
+the params) into a single, self-contained shared object (`bundle.so`) which
+exposes a C API wrapping the appropriate `runtime::GraphRuntime` instance.
+
+This is useful for cases where we'd like to avoid deploying the TVM runtime
+components to the target host in advance - instead, we simply deploy the bundled
+shared-object to the host, which embeds both the model and the runtime
+components. The bundle should only depend on libc/libc++.
+
+It also contains an example code (`demo.cc`) to load this shared object and
+invoke the packaged TVM model instance. This is a dependency-free binary that
+uses the functionality packaged in `bundle.so` (which means that `bundle.so` can
+be deployed lazily at runtime, instead of at compile time) to invoke TVM
+functionality.
+
+Type the following command to run the sample code under the current folder,
+after building TVM first.
+
+```bash
+make demo
+```
+
+This will:
+
+- Download the mobilenet0.25 model from the MXNet Gluon Model Zoo
+- Compile the model with NNVM
+- Build a `bundle.so` shared object containing the model specification and
+  parameters
+- Build a `demo` executable that `dlopen`'s `bundle.so`, instantiates the
+  contained graph runtime, and invokes the `GraphRuntime::Run` function on a
+  random input, then prints the output tensor to `stderr`.
diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py
new file mode 100644
index 000000000000..901996b8774e
--- /dev/null
+++ b/apps/bundle_deploy/build_model.py
@@ -0,0 +1,40 @@
+"""Creates a simple TVM modules."""
+
+import argparse
+import os
+import nnvm.compiler
+import nnvm.testing
+import tvm
+import logging
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--out-dir', default='.')
+    opts = parser.parse_args()
+
+    dshape = (1, 3, 224, 224)
+    from mxnet.gluon.model_zoo.vision import get_model
+    block = get_model('mobilenet0.25', pretrained=True)
+    net, params = nnvm.frontend.from_mxnet(block)
+    net = nnvm.sym.softmax(net)
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, 'llvm --system-lib', shape={'data': dshape}, params=params)
+    print(graph.symbol().debug_str())
+    build_dir = os.path.abspath(opts.out_dir)
+    if not os.path.isdir(build_dir):
+        os.makedirs(build_dir)
+
+    lib.save(os.path.join(build_dir, 'model.o'))
+    with open(os.path.join(build_dir, 'graph.json'), 'w') as f_graph_json:
+        f_graph_json.write(graph.json())
+    with open(os.path.join(build_dir, 'params.bin'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc
new file mode 100644
index 000000000000..af1ef7225bcb
--- /dev/null
+++ b/apps/bundle_deploy/bundle.cc
@@ -0,0 +1,47 @@
+#include <memory>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/registry.h>
+
+extern unsigned char build_graph_json[];
+extern unsigned int build_graph_json_len;
+extern unsigned char build_params_bin[];
+extern unsigned int build_params_bin_len;
+
+#define TVM_BUNDLE_FUNCTION __attribute__((visibility("default"))) extern "C"
+
+TVM_BUNDLE_FUNCTION void *tvm_runtime_create() {
+  const std::string json_data(&build_graph_json[0],
+                              &build_graph_json[0] + build_graph_json_len);
+  tvm::runtime::Module mod_syslib =
+      (*tvm::runtime::Registry::Get("module._GetSystemLib"))();
+  int device_type = kDLCPU;
+  int device_id = 0;
+  tvm::runtime::Module mod =
+      (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(
+          json_data, mod_syslib, device_type, device_id);
+  TVMByteArray params;
+  params.data = reinterpret_cast<const char *>(&build_params_bin[0]);
+  params.size = build_params_bin_len;
+  mod.GetFunction("load_params")(params);
+  return new tvm::runtime::Module(mod);
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_destroy(void *handle) {
+  delete reinterpret_cast<tvm::runtime::Module *>(handle);
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_set_input(void *handle, const char *name,
+                                               void *tensor) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("set_input")(
+      name, reinterpret_cast<DLTensor *>(tensor));
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_run(void *handle) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("run")();
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_get_output(void *handle, int index,
+                                                void *tensor) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("get_output")(
+      index, reinterpret_cast<DLTensor *>(tensor));
+}
diff --git a/apps/bundle_deploy/demo.cc b/apps/bundle_deploy/demo.cc
new file mode 100644
index 000000000000..c888edcee772
--- /dev/null
+++ b/apps/bundle_deploy/demo.cc
@@ -0,0 +1,66 @@
+#include "tvm/runtime/c_runtime_api.h"
+#include <assert.h>
+#include <dlfcn.h> //dlopen
+#include <dlpack/dlpack.h>
+#include <iostream>
+#include <random>
+#include <vector>
+
+template <typename F> auto getFunc(void *bundle, const char *name) {
+  dlerror();
+  auto *f =
+      reinterpret_cast<typename std::add_pointer<F>::type>(dlsym(bundle, name));
+  assert(!dlerror());
+  return f;
+}
+
+int main(int argc, char **argv) {
+  assert(argc == 2 && "Usage: demo <bundle.so>");
+  auto *bundle = dlopen(argv[1], RTLD_LAZY | RTLD_LOCAL);
+  assert(bundle);
+
+  auto *handle = getFunc<void *()>(bundle, "tvm_runtime_create")();
+
+  std::vector<float> input_storage(1 * 3 * 224 * 224);
+  std::mt19937 gen(0);
+  for (auto &e : input_storage) {
+    e = std::uniform_real_distribution<float>(0.0, 1.0)(gen);
+  }
+
+  std::vector<int64_t> input_shape = {1, 3, 224, 224};
+  DLTensor input;
+  input.data = input_storage.data();
+  input.ctx = DLContext{kDLCPU, 0};
+  input.ndim = 4;
+  input.dtype = DLDataType{kDLFloat, 32, 1};
+  input.shape = input_shape.data();
+  input.strides = nullptr;
+  input.byte_offset = 0;
+  getFunc<void(void *, const char *, void *)>(bundle, "tvm_runtime_set_input")(
+      handle, "data", &input);
+
+  auto *ftvm_runtime_run =
+      (auto (*)(void *)->void)dlsym(bundle, "tvm_runtime_run");
+  assert(!dlerror());
+  ftvm_runtime_run(handle);
+
+  std::vector<float> output_storage(1000);
+  std::vector<int64_t> output_shape = {1, 1000};
+  DLTensor output;
+  output.data = output_storage.data();
+  output.ctx = DLContext{kDLCPU, 0};
+  output.ndim = 2;
+  output.dtype = DLDataType{kDLFloat, 32, 1};
+  output.shape = output_shape.data();
+  output.strides = nullptr;
+  output.byte_offset = 0;
+
+  getFunc<void(void *, int, void *)>(bundle, "tvm_runtime_get_output")(
+      handle, 0, &output);
+  for (auto i = 0; i < output_storage.size(); ++i) {
+    std::cerr << "output[" << i << "]: " << output_storage[i] << std::endl;
+  }
+  getFunc<void(void *)>(bundle, "tvm_runtime_destroy")(handle);
+  dlclose(bundle);
+  return 0;
+}
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
new file mode 100644
index 000000000000..2284953b8c16
--- /dev/null
+++ b/apps/bundle_deploy/runtime.cc
@@ -0,0 +1,17 @@
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include "../../src/runtime/c_runtime_api.cc"
+#include "../../src/runtime/cpu_device_api.cc"
+#include "../../src/runtime/workspace_pool.cc"
+#include "../../src/runtime/module_util.cc"
+#include "../../src/runtime/module.cc"
+#include "../../src/runtime/registry.cc"
+#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/threading_backend.cc"
+#include "../../src/runtime/thread_pool.cc"
+#include "../../src/runtime/ndarray.cc"
+#include "../../src/runtime/system_lib_module.cc"
+#include "../../src/runtime/graph/graph_runtime.cc"

From 1d148c1661ac09b02f1a0bbc88d9086577386b62 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Wed, 19 Dec 2018 15:20:34 -0800
Subject: [PATCH 507/529] [Hybrid Script] Inter-function call supported!
 (#2287)

---
 python/tvm/hybrid/api.py                    |   4 +-
 python/tvm/hybrid/calls.py                  |  92 ++++++
 python/tvm/hybrid/intrin.py                 |  15 +-
 python/tvm/hybrid/parser.py                 | 307 ++++++++++----------
 python/tvm/hybrid/util.py                   |  18 ++
 python/tvm/hybrid/var_decl.py               |  15 +-
 tests/python/unittest/test_hybrid_script.py |  36 ++-
 7 files changed, 303 insertions(+), 184 deletions(-)
 create mode 100644 python/tvm/hybrid/calls.py

diff --git a/python/tvm/hybrid/api.py b/python/tvm/hybrid/api.py
index 5267731f4f52..d43217ca5dfc 100644
--- a/python/tvm/hybrid/api.py
+++ b/python/tvm/hybrid/api.py
@@ -24,17 +24,15 @@ def wrapped_func(func, *args, **kwargs): #pylint: disable=missing-docstring
         from .util import _enter_hybrid_runtime, _restore_runtime, _is_tvm_arg_types
         if _is_tvm_arg_types(args):
             src = _pruned_source(func)
-            parser = parse_python(src, args)
+            parser = parse_python(src, func.__globals__, args)
 
             input_tensors = []
             for i in args:
                 if isinstance(i, Tensor):
                     input_tensors.append(i)
-
             op = _tvm_internal._HybridOp(parser.func_name, "HybridOp", None, input_tensors,
                                          parser.outputs, parser.parsed_body)
             res = [op.output(i) for i in range(len(parser.outputs))]
-
             return res[0] if len(res) == 1 else res
 
         intersect = _enter_hybrid_runtime(func)
diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py
new file mode 100644
index 000000000000..730b56f58bd2
--- /dev/null
+++ b/python/tvm/hybrid/calls.py
@@ -0,0 +1,92 @@
+"""Intrinsics of TVM-Python Hybrid Script for Python compilation time
+semantic support."""
+
+from .. import api as _api
+from .. import expr as _expr
+from .. import make as _make
+from ..container import Array
+from .. import ir_pass
+from ..stmt import For
+from .util import _internal_assert
+
+#pylint: disable=redefined-builtin
+
+LOOP_INTRIN = {
+    'range'    : For.Serial,
+    'unroll'   : For.Unrolled,
+    'parallel' : For.Parallel,
+    'vectorize': For.Vectorized,
+}
+
+def _range(annotation, args):
+    """Handling TVM loop types"""
+    n = len(args)
+    if n == 1:
+        low, ext = _api.const(0, dtype='int32'), args[0]
+    else:
+        _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
+        low, ext = args[0], args[1]
+    if not ir_pass.Equal(low, _api.const(0, dtype='int32')):
+        ext = ext - low
+    for_type = LOOP_INTRIN[annotation]
+    iter_var = None
+    return iter_var, low, ext, for_type
+
+
+range = unroll = vectorize = parallel = _range #pylint: disable=invalid-name
+
+
+def bind(func_id, args):
+    """Handling TVM thread binding"""
+    _internal_assert(func_id == "bind", "This function cannot be directly invoked!")
+    _internal_assert(len(args) == 2, "A loop bind should only have 2 arguments!")
+    _internal_assert(isinstance(args[0], str), \
+                     "A loop bind's first argument should be a string!")
+    iter_var = _api.thread_axis(args[0])
+    low, ext = _api.const(0), args[1]
+    for_type = None
+    return iter_var, low, ext, for_type
+
+
+def _math_intrin(func_id, args):
+    from .. import intrin
+    return getattr(intrin, func_id)(*args)
+
+sqrt = log = exp = tanh = sigmoid = power = popcount = _math_intrin #pylint: disable=invalid-name
+
+
+def _min_max(func_id, args):
+    _internal_assert(len(args) == 2, "Max/Min function should have 2 elements")
+    return getattr(_make, func_id.title())(args[0], args[1])
+
+
+min = max = _min_max #pylint: disable=invalid-name
+
+
+def _allocate_tensor(func_id, args):
+    """Handling TVM tensor allocation.
+    You may refer hybrid.intrin.allocate for more details."""
+    n = len(args)
+    _internal_assert(isinstance(_api.convert(args[0]), Array), \
+                     "allocate's first argument should be a tuple of shape!")
+    shape = args[0]
+    for i in shape:
+        _internal_assert(isinstance(i, _expr.Expr), "The shape should be an expression")
+    if n > 1:
+        _internal_assert(isinstance(args[1], str),
+                         "The data type should be an str")
+        _internal_assert(args[1].startswith('int') or args[1].startswith('float'), \
+                         "The data type should be either int or float!")
+        dtype = args[1]
+    else:
+        dtype = 'float32'
+    if n > 2:
+        _internal_assert(isinstance(args[2], str), \
+                         "The data scope should be an string")
+        _internal_assert(func_id != 'output_tensor', "Output tensor cannot specify scope")
+        scope = args[2]
+    else:
+        scope = 'global' if func_id != 'output_tensor' else 'output'
+    return (shape, dtype, scope)
+
+output_tensor = allocate = _allocate_tensor #pylint: disable=invalid-name
diff --git a/python/tvm/hybrid/intrin.py b/python/tvm/hybrid/intrin.py
index 92e259585b7a..48e92a8bf5ac 100644
--- a/python/tvm/hybrid/intrin.py
+++ b/python/tvm/hybrid/intrin.py
@@ -1,7 +1,6 @@
-"""Intrinsics of TVM-Python Hybrid Script for Python runtime"""
+"""Intrinsics of TVM-Python Hybrid Script for Python emulation runtime"""
 
 import numpy
-from ..stmt import For
 
 class _range(object):
     """Base class of the loop ranges in hybrid script"""
@@ -102,15 +101,3 @@ def sigmoid(x):
     'sigmoid'      : sigmoid,
     'popcount'     : popcount
 }
-
-
-LOOP_INTRIN = {
-    'range'    : For.Serial,
-    'unroll'   : For.Unrolled,
-    'parallel' : For.Parallel,
-    'vectorize': For.Vectorized,
-    'bind'     : None
-}
-
-
-MATH_INTRIN = ['sqrt', 'log', 'exp', 'tanh', 'sigmoid', 'power', 'popcount']
diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index ba10dd8dde3c..26b0e141d0db 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -4,24 +4,24 @@
 import operator
 import logging
 import sys
-from .util import make_nop, halide_imm_types, is_docstring, _internal_assert
-from .intrin import LOOP_INTRIN, MATH_INTRIN
+from .util import _internal_assert
+from . import calls
+from . import util
 from .var_decl import determine_variable_usage
-from ..api import thread_axis
 from ..api import all as _all
 from ..api import any as _any
+from ..tensor import Tensor, Operation
 from .. import expr as _expr
 from .. import make as _make
-from .. import intrin
 from .. import api  as _api
 from .. import ir_pass as _ir_pass
 
 def list_to_block(visit, lst):
     """Convert a list of Python IR nodes to HalideIR Block"""
-    lst = [visit(stmt) for stmt in lst if not is_docstring(stmt)]
-    lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, make_nop())]
+    lst = [visit(stmt) for stmt in lst if not util.is_docstring(stmt)]
+    lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, util.make_nop())]
     if not lst:
-        return make_nop()
+        return util.make_nop()
     if len(lst) == 1:
         return lst[0]
     body = lst[0]
@@ -62,7 +62,7 @@ class HybridParser(ast.NodeVisitor):
     }
 
 
-    def __init__(self, args, usage, func_name=None):
+    def __init__(self, args, usage, symbols, func_name=None):
         """
         Parameters
         ----------
@@ -81,32 +81,49 @@ def __init__(self, args, usage, func_name=None):
         self.args = list(args)
         self.usage = usage.copy()
         self._args = {} # Dict maps arg name to actual arg instance (either a var or a buffer)
-        self.alloc_buffers = {} # Buffers formed by allocate instructions
+        self.alloc_buffers = {} # Buffers formed by explicit allocate instructions
         self.loops_above = {} # State variable that indicates loop levels above the current node
-        self.var_consts = {} # Variables that are determined as readonly in previous stage
+        self.variables = {} # The status of defined variables
         self.func_name = func_name # The name of the function to be lowered
         self.outputs = [] # Output tensors' name
         self.side_effect = set() # Tensors with side effects
         self.parsed_body = None # The parsed HalideIR body
-        self.returned = False
+        self.returned = False # If this function has a valid return
+        self.symbols = symbols # The global context
 
 
     def wrap_up_realize(self, node, body):
         """Wrap up all the variables which will no longer be used"""
+        pop_buf = []
+        pop_var = []
         for key, val in self.usage.items():
-            if key in self.var_consts.keys():
-                continue
             _, level, _ = val
-            if level == node:
-                if key in self._args.keys():
+            if level != node:
+                continue
+            if key in self._args.keys():
+                continue
+            if key in self.alloc_buffers.keys():
+                _buf, _scope = self.alloc_buffers[key]
+                if _scope == 'output':
                     continue
-                else:
-                    _buf, _scope = self.alloc_buffers[key]
-                _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape]
-                _dtype = _buf.dtype
-                _true = _api.convert(True)
-                body = _make.Realize(_buf.op, 0, _dtype, _domain, _true, body)
-                body = _make.AttrStmt(_buf.op, 'realize_scope', _api.convert(_scope), body)
+                pop_buf.append(key)
+            else:
+                _internal_assert(key in self.variables.keys(),
+                                 "Key should be either in one of args, buffers, and vars")
+                if not isinstance(self.variables[key], tuple):
+                    continue
+                _buf, _scope = self.variables[key]
+                pop_var.append(key)
+            _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape]
+            _dtype = _buf.dtype
+            _true = _api.convert(True)
+            body = _make.Realize(_buf.op, 0, _dtype, _domain, _true, body)
+            body = _make.AttrStmt(_buf.op, 'realize_scope', _api.convert(_scope), body)
+
+        for elem in pop_buf:
+            self.alloc_buffers.pop(elem)
+        for elem in pop_var:
+            self.variables.pop(elem)
         return body
 
 
@@ -121,7 +138,6 @@ def _get_buffer_from_id(self, s, for_provide=False):
         return self.alloc_buffers[s][0]
 
 
-
     #pylint: disable=invalid-name, missing-docstring
     def visit_Module(self, node):
         _internal_assert(len(node.body) == 1, \
@@ -133,13 +149,13 @@ def visit_FunctionDef(self, node):
         _internal_assert(len(node.args.args) == len(self.args), \
                          "The number of arguments passed to the \
                          function should be the same as it is defined!")
+        if self.func_name is None:
+            self.func_name = node.name
         for idx, arg in enumerate(node.args.args):
             _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible
             self._args[getattr(arg, _attr)] = self.args[idx]
         res = list_to_block(self.visit, node.body)
         res = self.wrap_up_realize(node, res)
-        if self.func_name is None:
-            self.func_name = node.name
         return res
 
 
@@ -148,23 +164,22 @@ def visit_Expr(self, node):
 
 
     def visit_Name(self, node):
-        _id = node.id
-        if _id in self._args.keys() and isinstance(self._args[_id], (_expr.Var, _expr.ConstExpr)):
-            return self._args[_id]
-        elif _id in self.loops_above.keys():
-            return self.loops_above[_id]
-        _internal_assert(_id not in self._args.keys(), \
-                         "This id %s should be handled in visit_Subscript!" % _id)
-        _internal_assert(_id in self.usage.keys(), \
-                         "This id %s is expected to be a defined variable!" % _id)
-        # Buffer
-        if _id in self.alloc_buffers.keys():
-            _buf, _ = self.alloc_buffers[_id]
-            return _make.Call(_buf.dtype, _id, [_api.const(0)], _expr.Call.Halide, _buf.op, 0)
-        # Compilation time constant
-        _internal_assert(_id in self.var_consts.keys(),
-                         "This id %s is expected to a compilation time constant!" % _id)
-        return self.var_consts[_id]
+        name = node.id
+        if name in self.loops_above.keys():
+            return self.loops_above[name]
+        elif name in self.variables.keys():
+            res = self.variables[name]
+            if isinstance(res, tuple):
+                buf = res[0]
+                if isinstance(node.ctx, ast.Load):
+                    return _make.Call(buf.dtype, buf.name, [_api.const(0)], \
+                                      _expr.Call.Halide, buf.op, buf.value_index)
+                return buf, [_api.const(0)]
+            if isinstance(node.ctx, ast.Load):
+                return res
+            return None
+        buf = self._get_buffer_from_id(name)
+        return buf
 
 
     def visit_Num(self, node):
@@ -172,18 +187,36 @@ def visit_Num(self, node):
 
 
     def visit_AugAssign(self, node):
-        lhs = self.visit(node.target)
+        buf = self.visit(node.target)
         rhs = self.visit(node.value)
-        rhs = HybridParser._binop_maker[type(node.op)](lhs, rhs)
-        _internal_assert(isinstance(lhs, _expr.Call), \
-                         "The LHS of an AugAssign is supposed to be a call!")
-        return _make.Provide(lhs.func, 0, rhs, lhs.args)
+        if isinstance(buf, tuple):
+            _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!")
+            buf, args = buf
+        else:
+            args = [_api.const(0)]
+        _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!")
+
+        read = _make.Call(buf.dtype, buf.name, args, _expr.Call.Halide, buf.op, buf.value_index)
+        value = HybridParser._binop_maker[type(node.op)](read, rhs)
+
+        return _make.Provide(buf.op, 0, value, args)
 
 
     def visit_Assign(self, node):
+        rhs = self.visit(node.value)
+        if isinstance(rhs, Operation):
+            rmap = {}
+            _internal_assert(len(node.targets) == rhs.num_outputs, \
+                             "Unable to detuple the outs to targets")
+            for i in range(rhs.num_outputs):
+                _internal_assert(isinstance(node.targets[i], ast.Name),
+                                 "You should bind a pure name to the tensors")
+                self.alloc_buffers[node.targets[i].id] = (rhs.output(i), 'global')
+                rmap[rhs.outputs[i].op] = rhs.output(i)
+            return util.replace_io(rhs.body, rmap)
+
         _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
         lhs = node.targets[0]
-        rhs = self.visit(node.value)
         if isinstance(rhs, _expr.Expr):
             rhs = _ir_pass.Simplify(rhs)
         if isinstance(lhs, ast.Name):
@@ -194,65 +227,63 @@ def visit_Assign(self, node):
                              "Loop variable cannot be overwritten!")
             decl, _, rw = self.usage[lhs]
             if decl == lhs_:
-                _internal_assert(lhs not in self.var_consts.keys(), \
-                                 "A constant cannot be overwritten!")
-                _internal_assert(lhs not in self.alloc_buffers.keys(), \
+                _internal_assert(lhs not in self.variables.keys() and
+                                 lhs not in self.alloc_buffers.keys(), \
                                  "This value should not be defined before this point!")
                 if isinstance(rhs, tuple):
                     shape, dtype, scope = rhs
                     ph = _api.placeholder(shape, dtype=dtype, name=lhs)
-                    if scope != 'output':
-                        self.alloc_buffers[lhs] = (ph, scope)
-                    else:
-                        self._args[lhs] = ph
+                    self.alloc_buffers[lhs] = (ph, scope)
+                    if scope == 'output':
                         self.outputs.append(lhs)
-                    return make_nop()
-                if isinstance(rhs, halide_imm_types) and ast.Store not in rw:
-                    self.var_consts[lhs] = rhs
+                    return util.make_nop()
+                if isinstance(rhs, util.halide_imm_types) and ast.Store not in rw:
+                    self.variables[lhs] = rhs
                 else:
                     ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs)
-                    self.alloc_buffers[lhs] = (ph, 'global')
-            if lhs in self.var_consts.keys():
-                return make_nop()
-            _internal_assert(lhs in self.alloc_buffers.keys(), \
-                             "This variable should be defined before!")
-            tgt, _ = self.alloc_buffers[lhs]
-            return _make.Provide(tgt.op, 0, rhs, [_api.const(0, dtype=rhs.dtype)])
+                    self.variables[lhs] = (ph, 'global')
+            lhs = self.visit(lhs_)
+            if lhs is not None:
+                buf, args = lhs
+                return _make.Provide(buf.op, 0, rhs, args)
+            return util.make_nop()
         else:
-            lhs = self.visit(lhs)
-            _internal_assert(isinstance(lhs, _expr.Call), \
+            lhs, args = self.visit(lhs)
+            _internal_assert(isinstance(lhs, Tensor), \
                              "An array access's LHS is expected to be a expr.Call!")
-            #TODO: support slice later
-            buf = self._get_buffer_from_id(lhs.name, for_provide=True)
-            return _make.Provide(buf.op, 0, rhs, lhs.args)
+            res = _make.Provide(lhs.op, lhs.value_index, rhs, args)
+            return res
 
 
     def visit_Index(self, node):
         if isinstance(node.value, ast.Tuple):
-            return [self.visit(i) for i in node.value.elts]
+            return self.visit(node.value)
         return [self.visit(node.value)]
 
 
+    def visit_Attribute(self, node):
+        _internal_assert(isinstance(node.value, ast.Name), \
+                         "For atrribute access, only both names are supported so far!")
+        buf = self._get_buffer_from_id(node.value.id)
+        return getattr(buf, node.attr)
+
+
     def visit_Subscript(self, node):
         args = self.visit(node.slice)
         if isinstance(node.value, ast.Name):
-            array = node.value.id
-            _buf = self._get_buffer_from_id(array)
-            return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, _buf.value_index)
-
-        _internal_assert(isinstance(node.value, ast.Attribute), \
-                         "Only variable and attribute's subscript supported so far")
-        _internal_assert(isinstance(node.value.value, ast.Name), \
-                         "The root of array access is expect to be a id!")
-        _internal_assert(node.value.attr == "shape", \
-                         "Attribute access so far only 'shape' is supported!")
+            buf = self.visit(node.value)
+            if isinstance(node.ctx, ast.Load):
+                return _make.Call(buf.dtype, buf.name, args, \
+                                  _expr.Call.Halide, buf.op, buf.value_index)
+            return buf, args
+
+        shape = self.visit(node.value)
         _internal_assert(len(args) == 1, "For 'shape' access the argument should be only one!")
         args = args[0]
         #TODO: maybe support non-constant value later?
         _internal_assert(isinstance(args, (_expr.IntImm, _expr.UIntImm)), \
                          "So far only constant shape access supported!")
-        buf = self._get_buffer_from_id(node.value.value.id)
-        return buf.shape[args.value]
+        return shape[args.value]
 
 
     def visit_With(self, node):
@@ -275,7 +306,7 @@ def visit_If(self, node):
         if node.orelse:
             else_body = list_to_block(self.visit, node.orelse)
         else:
-            else_body = make_nop()
+            else_body = util.make_nop()
         return _make.IfThenElse(cond, if_body, else_body)
 
 
@@ -305,13 +336,10 @@ def visit_BoolOp(self, node):
             _internal_assert(isinstance(node.op, ast.Not), \
                              "Unary is supposed to be not!")
             return operator.not_(self.visit(node.values[0]))
-        elif n == 2:
-            _internal_assert(isinstance(node.op, (ast.And, ast.Or)), \
-                             "Binary is supposed to be and/or!")
-            values = [self.visit(i) for i in node.values]
-            return HybridParser._binop_maker[type(node.op)](*values)
-        else:
-            raise ValueError("This Bool Op is not supported yet!")
+        _internal_assert(isinstance(node.op, (ast.And, ast.Or)), \
+                         "Binary is supposed to be and/or!")
+        values = [self.visit(i) for i in node.values]
+        return HybridParser._binop_maker[type(node.op)](*values)
 
 
     def visit_UnaryOp(self, node):
@@ -329,67 +357,17 @@ def visit_Call(self, node):
         # Yet, no function pointer supported
         _internal_assert(isinstance(node.func, ast.Name), \
                          "Only id-function function call is supported so far!")
+
         func_id = node.func.id
-        n = len(node.args)
-        if func_id in LOOP_INTRIN.keys() and func_id != 'bind':
-            if n == 1:
-                low, ext = _api.const(0, dtype='int32'), self.visit(node.args[0])
-            else:
-                _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
-                low, ext = self.visit(node.args[0]), self.visit(node.args[1])
-            if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
-                ext = ext - low
-            for_type = LOOP_INTRIN[func_id]
-            iter_var = None
-            return iter_var, low, ext, for_type
-        elif func_id == 'bind':
-            _internal_assert(n == 2, "A loop bind should only have 2 arguments!")
-            _internal_assert(isinstance(node.args[0], ast.Str), \
-                             "A loop bind's first argument should be a string!")
-            _vn = node.args[0].s
-            iter_var = thread_axis(node.args[0].s)
-            low, ext = _api.const(0, dtype='int32'), self.visit(node.args[1])
-            for_type = None
-            return iter_var, low, ext, for_type
-        elif func_id in MATH_INTRIN:
-            return getattr(intrin, func_id)(*[self.visit(arg) for arg in node.args])
-        elif func_id in ['allocate', 'output_tensor']:
-            _internal_assert(isinstance(node.args[0], ast.Tuple), \
-                             "allocate's first argument should be a tuple of shape!")
-            shape = tuple(self.visit(i) for i in node.args[0].elts)
-            if func_id == 'output_tensor':
-                _internal_assert(not self.loops_above, \
-                                 "Are you sure to allocate a output buffer multiple times?")
-            for i in shape:
-                _internal_assert(isinstance(i, _expr.Expr), "The shape should be an expression")
-            if n > 1:
-                if isinstance(node.args[1], ast.Str):
-                    dtype = node.args[1].s
-                else:
-                    _internal_assert(isinstance(node.args[1], ast.Attribute), \
-                                     "Unable to evaluate to get data type")
-                    to_eval = node.args[1]
-                    _internal_assert(isinstance(to_eval.value, ast.Name), \
-                                     "Unable to evaluate the attribute to get data type")
-                    _internal_assert(to_eval.attr == 'dtype', \
-                                     "Only dtype attribute is supported so far")
-                    dtype = self._get_buffer_from_id(to_eval.value.id).dtype
-            else:
-                dtype = 'float32'
-            if n > 2:
-                _internal_assert(isinstance(node.args[2], ast.Str), \
-                                 "The data scope should be an string")
-                _internal_assert(func_id != 'output_tensor', "Output tensor cannot specify scope")
-                scope = node.args[2].s
-            else:
-                scope = 'global' if func_id != 'output_tensor' else 'output'
-            return (shape, dtype, scope)
-        elif func_id == 'max' or func_id == 'min':
-            _internal_assert(n == 2, "Max/Min function should have 2 elements")
-            a, b = self.visit(node.args[0]), self.visit(node.args[1])
-            return getattr(_make, func_id.title())(a, b)
-        else:
-            raise ValueError("Function call not supported yet!")
+        args = [self.visit(i) for i in node.args]
+        try:
+            return getattr(calls, func_id)(func_id, args)
+        except AttributeError:
+            _internal_assert(func_id in self.symbols.keys(), \
+                             "The function called is not in the context either!")
+            outs = self.symbols[func_id](*args)
+            op = outs.op if isinstance(outs, Tensor) else outs[0].op
+            return op
 
 
     def visit_For(self, node):
@@ -400,7 +378,7 @@ def visit_For(self, node):
         if iter_var is None:
             _internal_assert(for_type is not None, "The loop bind function parse error!")
             offset = iter_var = _api.var(_name)
-            if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
+            if not _ir_pass.Equal(low, _api.const(0)):
                 offset = iter_var + low
             self.loops_above[_name] = offset
         else:
@@ -411,7 +389,7 @@ def visit_For(self, node):
         if for_type is None:
             res = _make.AttrStmt(iter_var, 'thread_extent', ext, _body)
         else:
-            res = _make.For(iter_var, _api.const(0, dtype='int32'), ext, for_type, 0, _body)
+            res = _make.For(iter_var, _api.const(0), ext, for_type, 0, _body)
         self.loops_above.pop(_name)
         return res
 
@@ -428,14 +406,22 @@ def visit_Return(self, node):
                 _internal_assert(isinstance(i, ast.Name), "What do you return?")
                 ids.append(i.id)
         _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples")
-        if len(ids) != len(self.outputs):
+        if len(ids) < len(self.outputs):
             logging.log(logging.CRITICAL, '[Warning] Not all the output buffers returned!')
-        self.outputs = [self._args[i] for i in ids]
+        self.outputs = [self.alloc_buffers[i][0] for i in ids]
         self.returned = True
-        return make_nop()
+        return util.make_nop()
+
+
+    def visit_Tuple(self, node):
+        return tuple(self.visit(i) for i in node.elts)
 
 
-def parse_python(src, args):
+    def visit_Str(self, node):
+        return node.s
+
+
+def parse_python(src, symbols, args):
     """The helper function of calling the AST visitor
 
     Parameters
@@ -443,6 +429,9 @@ def parse_python(src, args):
     src : str
         The source code of the function to be parsed.
 
+    src : str
+        The symbol list of the global context of the function.
+
     args : list of Tensors or Vars
         The argument lists to the function.
         It is NOT encouraged to write a function without arguments.
@@ -454,8 +443,8 @@ def parse_python(src, args):
         The result Halide IR and the parser class instance.
     """
     root = ast.parse(src)
-    var_usage = determine_variable_usage(root, args)
-    parser = HybridParser(args, var_usage)
+    var_usage = determine_variable_usage(root, args, symbols)
+    parser = HybridParser(args, var_usage, symbols)
     parser.parsed_body = parser.visit(root)
     _internal_assert(parser.returned, 'No valid return found in the function body!')
     return parser
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index 78106838f13e..aa86d55a6fcf 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -10,6 +10,7 @@
 from .. import api as _api
 from .. import make as _make
 from .. import expr as _expr
+from .. import stmt as _stmt
 from ..tensor import Tensor
 
 
@@ -86,3 +87,20 @@ def _restore_runtime(func, intersect):
         _globals.pop(elem)
     for k, v in intersect:
         _globals[k] = v
+
+
+def replace_io(body, rmap):
+    """Replacing tensors usage according to the dict given"""
+    from .. import ir_pass
+
+    def replace(op):
+        if isinstance(op, _stmt.Provide) and op.func in rmap.keys():
+            buf = rmap[op.func]
+            return _make.Provide(buf.op, op.value_index, op.value, op.args)
+        elif isinstance(op, _expr.Call) and  op.func in rmap.keys():
+            buf = rmap[op.func]
+            return _make.Call(buf.dtype, buf.name, op.args, \
+                              _expr.Call.Halide, buf.op, buf.value_index)
+        return None
+
+    return ir_pass.IRTransform(body, None, replace, ['Provide', 'Call'])
diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/var_decl.py
index 27df87874377..eb893a7f22a1 100644
--- a/python/tvm/hybrid/var_decl.py
+++ b/python/tvm/hybrid/var_decl.py
@@ -10,12 +10,13 @@ class PyVariableUsage(ast.NodeVisitor):
     """The vistor class to determine the declaration, r/w status, and last use of each variable"""
     #pylint: disable=invalid-name
     #pylint: disable=missing-docstring
-    def __init__(self, args):
+    def __init__(self, args, symbols):
         self.status = {}
         self.scope_level = []
         self._args = {}
         self.args = args
         self.aug_assign_ = False
+        self.symbols = symbols
 
 
     def visit_FunctionDef(self, node):
@@ -43,8 +44,10 @@ def visit_Call(self, node):
         #No function pointer supported so far
         _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id")
         func_id = node.func.id
-        _internal_assert(func_id in list(HYBRID_GLOBALS.keys()) + ['range', 'max', 'min'], \
-                "Function call id not in intrinsics' list")
+        _internal_assert(func_id in list(HYBRID_GLOBALS.keys()) + \
+                         ['range', 'max', 'min'] + \
+                         list(self.symbols.keys()), \
+                         "Function call id not in intrinsics' list")
         for elem in node.args:
             self.visit(elem)
 
@@ -75,11 +78,13 @@ def visit_Name(self, node):
         else:
             decl, loop, usage = self.status[node.id]
             usage.add(type(node.ctx))
+            _internal_assert(loop in self.scope_level,
+                             "%s is used out of the scope it is defined!" % node.id)
             self.status[node.id] = (decl, loop, usage)
 
 
-def determine_variable_usage(root, args):
+def determine_variable_usage(root, args, symbols):
     """The helper function for calling the dedicated visitor."""
-    visitor = PyVariableUsage(args)
+    visitor = PyVariableUsage(args, symbols)
     visitor.visit(root)
     return visitor.status
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 7efbbe43ee21..f87c75f7929d 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -270,7 +270,7 @@ def test_bind():
         return
     @script
     def vec_add(a, b):
-        c = output_tensor((1000, ), dtype='float32')
+        c = output_tensor((1000, ), 'float32')
         for tx in bind('threadIdx.x', 1000):
             c[tx] = a[tx] + b[tx]
         return c
@@ -506,7 +506,37 @@ def kernel_b(b, a):
     module(tvm.ndarray.array(np_a), res)
     tvm.testing.assert_allclose(res.asnumpy(), ref)
 
+def test_func_call():
+    @tvm.hybrid.script
+    def foo(a, b):
+        for i in range(10):
+            a[i] = i + 1.0
+        for i in range(10):
+            b[i] = i + 1.0
+        c = outer_product(10, 10, a, b)
+        d = output_tensor(c.shape, c.dtype)
+        for i in range(10):
+            for j in range(10):
+                d[i, j] = c[i, j] + i * j
+        return d
 
+    a = tvm.placeholder((10, ), name='a')
+    b = tvm.placeholder((10, ), name='b')
+    run_and_check(foo, [a, b])
+
+def test_bool():
+    @tvm.hybrid.script
+    def foo(a):
+        b = output_tensor(a.shape, a.dtype)
+        b[0] = 1.2
+        for i in range(1, a.shape[0] - 1):
+            if a[i] * a[i - 1] < a[i] or a[i] * a[i - 1] < a[i - 1] or i * a[i] == a[i]:
+                b[i] = a[i]
+            else:
+                b[i] = 0.0
+        return b
+    a = tvm.placeholder((10, ), name='a')
+    run_and_check(foo, [a])
 
 if __name__ == "__main__":
     test_outer_product()
@@ -521,7 +551,7 @@ def kernel_b(b, a):
     test_downstream()
     test_const_param()
     test_value_index()
+    test_func_call()
+    test_bool()
     # TODO:
     # test_inplace()
-
-

From f00791712e9afe8547c1ac972478c1a11a784494 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 21 Dec 2018 03:07:44 +0900
Subject: [PATCH 508/529] [DOC] Codebase walkthrough with vector add example
 (#2273)

---
 docs/dev/codebase_walkthrough.rst | 226 ++++++++++++++++++++++++++++++
 docs/dev/index.rst                |   3 +-
 2 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 docs/dev/codebase_walkthrough.rst

diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst
new file mode 100644
index 000000000000..6f5cff8a06d6
--- /dev/null
+++ b/docs/dev/codebase_walkthrough.rst
@@ -0,0 +1,226 @@
+=======================================
+**TVM Codebase Walkthrough by Example**
+=======================================
+
+Getting to know a new codebase can be a challenge. This is especially true for a codebase like that of TVM, where different components interact in non-obvious ways. In this guide, we try to illustrate the key elements that comprise a compilation pipeline with a simple example. For each important step, we show where in the codebase it is implemented. The purpose is to let new developers and interested users dive into the codebase more quickly.
+
+*******************************************
+Codebase Structure Overview
+*******************************************
+
+At the root of the TVM repository, we have following subdirectories that together comprise a bulk of the codebase.
+
+- ``src`` - C++ code for operator compilation and deployment runtimes.
+- ``src/relay`` - Implementation of Relay, a new IR for deep learning framework superseding ``nnvm`` below.
+- ``python`` - Python frontend that wraps C++ functions and objects implemented in ``src``.
+- ``topi`` - Compute definitions and backend schedules for standard neural network operators.
+- ``nnvm`` - C++ code and Python frontend for graph optimization and compilation. After the introduction of Relay, it remains in the codebase for backward compatibility.
+
+Using standard Deep Learning terminologies, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructures implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations for operators are in ``topi``, and they are coded in either C++ or Python.
+
+Relay is the new IR for deep networks that is intended to replace NNVM. If you have used NNVM, Relay provides equivalent or better functionalities. In fact, Relay goes beyond a traditional way of thinking deep networks in terms of computational graphs. But for the purpose of this document, we can think of Relay as a traditional computational graph framework. You can read more about Relay `here <https://docs.tvm.ai/dev/relay_intro.html>`_.
+
+When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compiler.build(...)`` for the older API), the following sequence of actions happens for each node in the graph:
+
+- Look up an operator implementation by querying the operator registry
+- Generate a compute expression and a schdule for the operator
+- Compile the operator into object code
+
+One of the interesting aspects of TVM codebase is that interop between C++ and Python is not unidirectional. Typically, all code that do heavy liftings are implemented in C++, and Python bindings are provided for user interface. This is also true in TVM, but in TVM codebase, C++ code also call into functions defined in a Python module. For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay.
+
+*******************************************
+Vector Add Example
+*******************************************
+
+We use a simple example that uses the low level TVM API directly. The example is vector addition, which is covered in detail in `this tutorial <https://docs.tvm.ai/tutorials/get_started.html#sphx-glr-tutorials-get-started-py>`_.
+
+::
+
+   n = 1024
+   A = tvm.placeholder((n,), name='A')
+   B = tvm.placeholder((n,), name='B')
+   C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``python/tvm/tensor.py``. The Python ``Tensor`` is backed by C++ ``Tensor``, implemented in ``include/tvm/tensor.h`` and ``src/lang/tensor.cc``. All Python types in TVM can be thought of as a handle to the underlying C++ type with the same name. If you look at the definition of Python ``Tensor`` type below, you can see it is a subclass of ``NodeBase``.
+
+::
+
+   @register_node
+   class Tensor(NodeBase, _expr.ExprOp):
+       """Tensor object, to construct, see function.Tensor"""
+
+       def __call__(self, *indices):
+          ...
+
+The Node system is the basis of exposing C++ types to frontend languages, including Python. The way TVM implements Python wrapping is not straightforward. It is briefly covered in `this document <https://docs.tvm.ai/dev/runtime.html#tvm-node-and-compiler-stack>`_, and details are in ``python/tvm/_ffi/`` if you are interested.
+
+``Tensor`` is created by functions in ``python/tvm/api.py``, which in turn calls into C++ functions exposed in ``src/api/api_lang.cc``. All C++ functions that are callable from Python are exposed in the ``src/api`` subdirectory. For example, the ``tvm.compute()`` function above calls into ``_ComputeOp`` api exposed in ``src/api/api_lang.cc``:
+
+::
+
+   TVM_REGISTER_API("_ComputeOp")
+   .set_body([](TVMArgs args,  TVMRetValue* ret) {
+       *ret = ComputeOpNode::make(args[0],
+                                  args[1],
+                                  args[2],
+                                  args[3],
+                                  args[4]);
+     });
+
+We use ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of `PackedFunc <https://docs.tvm.ai/dev/runtime.html#packedfunc>`_. ``PackedFunc`` is another mechanism by which TVM implements C++ and Python interop. In particular, this is what makes calling Python functions from the C++ codebase very easy.
+
+A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/tensor.py``, ``include/tvm/operation.h``, and ``src/tvm/op`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``.
+
+We pass the operation corresponding to the output tensor ``C`` to ``tvm.create_schedule()`` function in ``python/tvm/schedule.py``.
+
+::
+
+   s = tvm.create_schedule(C.op)
+
+This function is mapped to the C++ function in ``include/tvm/schedule.h``.
+
+::
+
+   inline Schedule create_schedule(Array<Operation> ops) {
+     return ScheduleNode::make(ops);
+   }
+
+``Schedule`` consists of collections of ``Stage`` and output ``Operation``.
+
+``Stage`` corresponds to one ``Operation``. In the vector add example above, there are two placeholder ops and one compute op, so the schedule ``s`` contains three stages. Each ``Stage`` holds information about a loop nest structure, types of each loop (``Parallel``, ``Vectorized``, ``Unrolled``), and where to execute its computation in the loop nest of the next ``Stage``, if any.
+
+``Schedule`` and ``Stage`` are defined in ``tvm/python/schedule.py``, ``include/tvm/schedule.h``, and ``src/schedule/schedule_ops.cc``.
+
+To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above.
+
+::
+
+   target = "cuda"
+   fadd = tvm.build(s, [A, B, C], target)
+
+``tvm.build()``, defined in ``python/tvm/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a ``tvm.Module`` object, defined in ``python/tvm/module.py``. A ``Module`` object contains a compiled function which can be invoked with function call syntax.
+
+The process of ``tvm.build()`` can be divided into two steps:
+
+- Lowering, where a high level, initial loop nest structures are transformed into a final, low level IR
+- Code generation, where target machine code is generated from the low level IR
+
+Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_module.py``. First, bound inference is peformed, and an initial loop nest structure is created.
+
+::
+
+   def lower(sch,
+             args,
+             name="default_function",
+             binds=None,
+             simple_mode=False):
+      ...
+      bounds = schedule.InferBound(sch)
+      stmt = schedule.ScheduleOps(sch, bounds)
+      ...
+
+Bound inference is the process where all loop bounds and sizes of intermidiate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``.
+
+``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects that changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``.
+
+Next, we apply a number of lowering passes to ``stmt``. These passes are implemented in ``src/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below.
+
+::
+
+     ...
+     stmt = ir_pass.VectorizeLoop(stmt)
+     ...
+     stmt = ir_pass.UnrollLoop(
+         stmt,
+         cfg.auto_unroll_max_step,
+         cfg.auto_unroll_max_depth,
+         cfg.auto_unroll_max_extent,
+         cfg.unroll_explicit)
+     ...
+
+After lowering is done, ``build()`` function generates target machine code from the lowered function. This code can contain SSE or AVX instructions if you target x86, or PTX instructions for CUDA target. In addition to target specific machine code, TVM also generates host side code that is responsible for memory management, kernel launch etc.
+
+Code generation is done by ``build_module()`` function, defined in ``python/tvm/codege.py``. On the C++ side, code generation is implemented in ``src/codegen`` subdirectory. ``build_module()`` Python function will reach ``Build()`` function below in ``src/codegen/codegen.cc``:
+
+::
+
+   runtime::Module Build(const Array<LoweredFunc>& funcs,
+                         const std::string& target) {
+     std::string build_f_name = "codegen.build_" + target;
+     const PackedFunc* bf = runtime::Registry::Get(build_f_name);
+     runtime::Module m = (*bf)(funcs, target);
+     return m;
+   }
+
+
+``Build()`` function looks up the code generator for the given target in the ``PackedFunc`` registry, and invokes the function found. For example, ``codegen.build_cuda`` function is registered in ``src/codegen/build_cuda_on.cc``, like this:
+
+::
+
+   TVM_REGISTER_API("codegen.build_cuda")
+   .set_body([](TVMArgs args, TVMRetValue* rv) {
+       *rv = BuildCUDA(args[0]);
+     });
+
+``BuildCUDA()`` above generates CUDA kernel source from the lowered IR using ``CodeGenCUDA`` class defined in ``src/codegen/codegen_cuda.cc``, and compile the kernel using NVRTC. If you target a backend that uses LLVM, which includes x86, ARM, NVPTX and AMDGPU, code generation is done primarily by ``CodeGenLLVM`` class defined in ``src/codegen/llvm/codegen_llvm.cc``. ``CodeGenLLVM`` translates TVM IR into LLVM IR, runs a number of LLVM optimization passes, and generates target machine code.
+
+``Build()`` function in ``src/codegen/codegen.cc`` returns a ``runtime::Module`` object, defined in ``include/tvm/runtime/module.h`` and ``src/runtime/module.cc``. A ``Module`` object is a container for the underlying target specific ``ModuleNode`` object. Each backend implements a subclass of ``ModuleNode`` to add target specific runtime API calls. For example, the CUDA backend implements ``CUDAModuleNode`` class in ``src/runtime/cuda/cuda_module.cc``, which manages CUDA driver API. ``BuildCUDA()`` function above wraps ``CUDAModuleNode`` with ``runtime::Module`` and return it to the Python side. The LLVM backend implements ``LLVMModuleNode`` in ``src/codegen/llvm/llvm_module.cc``, which handles JIT execution of compiled code. Other subclasses of ``ModuleNode`` can be found under subdirectories of ``src/runtime`` corresponding to each backend.
+
+The returned module, which can be thought of as a combination of a compiled function and a device API, can be invoked on TVM's NDArray objects.
+
+::
+
+   ctx = tvm.context(target, 0)
+   a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+   b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+   c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+   fadd(a, b, c)
+   output = c.asnumpy()
+
+Under the hood, TVM allocates device memory and manages memory transfers automatically. To do that, each backend needs to subclass ``DeviceAPI`` class, defined in ``include/tvm/runtime/device_api.h``, and override memory management methods to use device specific API. For example, the CUDA backend implements ``CUDADeviceAPI`` in ``src/runtime/cuda/cuda_device_api.cc`` to use ``cudaMalloc``, ``cudaMemcpy`` etc.
+
+The first time you invoke the compiled module with ``fadd(a, b, c)``, ``GetFunction()`` method of ``ModuleNode`` is called to get a ``PackedFunc`` that can be used for a kernel call. For example, in ``src/runtime/cuda/cuda_module.cc`` the CUDA backend implements ``CUDAModuleNode::GetFunction()`` like this:
+
+::
+
+   PackedFunc CUDAModuleNode::GetFunction(
+         const std::string& name,
+         const std::shared_ptr<ModuleNode>& sptr_to_self) {
+     auto it = fmap_.find(name);
+     const FunctionInfo& info = it->second;
+     CUDAWrappedFunc f;
+     f.Init(this, sptr_to_self, name, info.arg_types.size(), info.thread_axis_tags);
+     return PackFuncVoidAddr(f, info.arg_types);
+   }
+
+The ``PackedFunc``'s overloaded ``operator()`` will be called, which in turn calls ``operator()`` of ``CUDAWrappedFunc`` in ``src/runtime/cuda/cuda_module.cc``, where finally we see the ``cuLaunchKernel`` driver call:
+
+::
+
+   class CUDAWrappedFunc {
+    public:
+     void Init(...)
+     ...
+     void operator()(TVMArgs args,
+                     TVMRetValue* rv,
+                     void** void_args) const {
+       int device_id;
+       CUDA_CALL(cudaGetDevice(&device_id));
+       if (fcache_[device_id] == nullptr) {
+         fcache_[device_id] = m_->GetFunc(device_id, func_name_);
+       }
+       CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);
+       ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
+       CUresult result = cuLaunchKernel(
+           fcache_[device_id],
+           wl.grid_dim(0),
+           wl.grid_dim(1),
+           wl.grid_dim(2),
+           wl.block_dim(0),
+           wl.block_dim(1),
+           wl.block_dim(2),
+           0, strm, void_args, 0);
+     }
+   };
+
+This concludes an overview of how TVM compiles and executes a function. Although we did not detail TOPI or Relay, at the end all neural network operators go through the same compilation process as above. You are encouraged to dive into the details of the rest of the codebase.
diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index 2734a816dc68..3f4944fe1d52 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -13,4 +13,5 @@ In this part of documentation, we share the rationale for the specific choices m
    nnvm_overview
    hybrid_script
    relay_intro
-   relay_add_op
\ No newline at end of file
+   relay_add_op
+   codebase_walkthrough

From f6f21c2ed8c7c3016af6bdf33b049b2416bb44df Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Thu, 20 Dec 2018 21:41:30 +0300
Subject: [PATCH 509/529] [TVM] Move check_numerical_grads to tvm.testing_
 (#2314)

---
 nnvm/python/nnvm/testing/check_computation.py | 111 +-------------
 python/tvm/testing.py                         | 135 ++++++++++++++++++
 tests/python/unittest/test_testing.py         |  95 ++++++++++++
 3 files changed, 231 insertions(+), 110 deletions(-)
 create mode 100644 tests/python/unittest/test_testing.py

diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
index aab3f916e19f..7ab4dc0d4c6c 100644
--- a/nnvm/python/nnvm/testing/check_computation.py
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -7,6 +7,7 @@
 
 import tvm
 from tvm.contrib import graph_runtime
+from tvm.testing import check_numerical_grads
 
 import nnvm
 from nnvm.compiler import graph_util
@@ -535,113 +536,3 @@ def scalar_function(**kwargs):
 
     if nothing_was_done:
         logging.warning("Nothing was done in check_function. Check ctx_list().")
-
-
-def check_numerical_grads(function, input_values, grad_values, function_value=None,
-                          delta=1e-3, atol=1e-2, rtol=0.1):
-    """A helper function that checks that numerical gradients of a function are equal to
-    gradients computed in some different way (analytical gradients).
-
-    Numerical gradients are computed using finite difference approximation. To reduce the number of
-    function evaluations, the number of points used is gradually increased if the error value is
-    too high (up to 5 points).
-
-    Parameters
-    ----------
-    function
-        A function that takes inputs as keyword arguments (like `function(**input_values)`) and
-        returns a scalar result. Should accept numpy ndarrays.
-
-    input_values : Dict[str, numpy.ndarray]
-        A dict assigning values to variables. Represents the point at which gradients should be
-        computed.
-
-    grad_values : Dict[str, numpy.ndarray]
-        Gradients computed using a different method.
-
-    function_value : float, optional
-        Should be equal to `function(**input_values)`.
-
-    delta : float, optional
-        A small number used for numerical computation of partial derivatives. The default 1e-3 is a
-        good choice for float32.
-
-    atol : float, optional
-        Absolute tolerance.
-
-    rtol : float, optional
-        Relative tolerance.
-    """
-
-    if function_value is None:
-        function_value = function(**input_values)
-
-    # a helper to modify j-th element of val by a_delta
-    def modify(val, j, a_delta):
-        val = val.copy()
-        val.reshape(-1)[j] = val.reshape(-1)[j] + a_delta
-        return val
-
-    # numerically compute a partial derivative with respect to j-th element of the var `name`
-    def derivative(x_name, j, a_delta):
-        modified_values = {n: modify(val, j, a_delta) if n == x_name else val
-                           for n, val in input_values.items()}
-        return (function(**modified_values) - function_value)/a_delta
-
-    def compare_derivative(j, n_der, grad):
-        der = grad.reshape(-1)[j]
-        return np.abs(n_der - der) < atol + rtol*np.abs(n_der)
-
-    for x_name, grad in grad_values.items():
-        if grad.shape != input_values[x_name].shape:
-            raise AssertionError(
-                "Gradient wrt '{}' has unexpected shape {}, expected {} "
-                .format(x_name, grad.shape, input_values[x_name].shape))
-
-        ngrad = np.zeros_like(grad)
-
-        # compute partial derivatives for each position in this variable
-        for j in range(np.prod(grad.shape)):
-            # forward difference approximation
-            nder = derivative(x_name, j, delta)
-
-            # if the derivative is not equal to the analytical one, try to use more
-            # precise and expensive methods
-            if not compare_derivative(j, nder, grad):
-                # central difference approximation
-                nder = (derivative(x_name, j, -delta) + nder)/2
-
-                if not compare_derivative(j, nder, grad):
-                    # central difference approximation using h = delta/2
-                    cnder2 = (derivative(x_name, j, delta/2) + derivative(x_name, j, -delta/2))/2
-                    # five-point derivative
-                    nder = (4*cnder2 - nder)/3
-
-            ngrad.reshape(-1)[j] = nder
-
-        dist = np.sqrt(np.sum((ngrad - grad)**2))
-        grad_norm = np.sqrt(np.sum(ngrad**2))
-
-        if not (np.isfinite(dist) and np.isfinite(grad_norm)):
-            raise ValueError(
-                "NaN or infinity detected during numerical gradient checking wrt {}\n"
-                "analytical grad = {}\n numerical grad = {}\n"
-                .format(x_name, grad, ngrad))
-
-        # we multiple atol by this number to make it more universal for different sizes
-        sqrt_n = np.sqrt(float(np.prod(grad.shape)))
-
-        if dist > atol*sqrt_n + rtol*grad_norm:
-            raise AssertionError(
-                "Analytical and numerical grads wrt {} differ too much\n"
-                "analytical grad = {}\n numerical grad = {}\n"
-                "distance > atol*sqrt(n) + rtol*grad_norm\n"
-                "distance {} > {}*{} + {}*{}"
-                .format(x_name, grad, ngrad,
-                        dist, atol, sqrt_n, rtol, grad_norm))
-
-        max_diff = np.max(np.abs(ngrad - grad))
-        avg_diff = np.mean(np.abs(ngrad - grad))
-        logging.info("Numerical grad test wrt %s of shape %s passes, "
-                     "dist = %f, max_diff = %f, avg_diff = %f",
-                     x_name, grad.shape, dist, max_diff, avg_diff)
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index 5c0b9b9da4ae..1a6666bdee2a 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -1,4 +1,5 @@
 """ TVM testing utilities """
+import logging
 import numpy as np
 
 def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
@@ -10,3 +11,137 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
     often allow `desired` to be close to zero, we generally want non-zero `atol`.
     """
     np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True)
+
+
+def check_numerical_grads(function, input_values, grad_values, function_value=None,
+                          delta=1e-3, atol=1e-2, rtol=0.1):
+    """A helper function that checks that numerical gradients of a function are
+    equal to gradients computed in some different way (analytical gradients).
+
+    Numerical gradients are computed using finite difference approximation. To
+    reduce the number of function evaluations, the number of points used is
+    gradually increased if the error value is too high (up to 5 points).
+
+    Parameters
+    ----------
+    function
+        A function that takes inputs either as positional or as keyword
+        arguments (either `function(*input_values)` or `function(**input_values)`
+        should be correct) and returns a scalar result. Should accept numpy
+        ndarrays.
+
+    input_values : Dict[str, numpy.ndarray] or List[numpy.ndarray]
+        A list of values or a dict assigning values to variables. Represents the
+        point at which gradients should be computed.
+
+    grad_values : Dict[str, numpy.ndarray] or List[numpy.ndarray]
+        Gradients computed using a different method.
+
+    function_value : float, optional
+        Should be equal to `function(**input_values)`.
+
+    delta : float, optional
+        A small number used for numerical computation of partial derivatives.
+        The default 1e-3 is a good choice for float32.
+
+    atol : float, optional
+        Absolute tolerance. Gets multiplied by `sqrt(n)` where n is the size of a
+        gradient.
+
+    rtol : float, optional
+        Relative tolerance.
+    """
+    # If input_values is a list then function accepts positional arguments
+    # In this case transform it to a function taking kwargs of the form {"0": ..., "1": ...}
+    if not isinstance(input_values, dict):
+        input_len = len(input_values)
+        input_values = {str(idx): val for idx, val in enumerate(input_values)}
+
+        def _function(_input_len=input_len, _orig_function=function, **kwargs):
+            return _orig_function(*(kwargs[str(i)] for i in range(input_len)))
+        function = _function
+
+        grad_values = {str(idx): val for idx, val in enumerate(grad_values)}
+
+    if function_value is None:
+        function_value = function(**input_values)
+
+    # a helper to modify j-th element of val by a_delta
+    def modify(val, j, a_delta):
+        val = val.copy()
+        val.reshape(-1)[j] = val.reshape(-1)[j] + a_delta
+        return val
+
+    # numerically compute a partial derivative with respect to j-th element of the var `name`
+    def derivative(x_name, j, a_delta):
+        modified_values = {n: modify(val, j, a_delta) if n == x_name else val
+                           for n, val in input_values.items()}
+        return (function(**modified_values) - function_value)/a_delta
+
+    def compare_derivative(j, n_der, grad):
+        der = grad.reshape(-1)[j]
+        return np.abs(n_der - der) < atol + rtol*np.abs(n_der)
+
+    for x_name, grad in grad_values.items():
+        if grad.shape != input_values[x_name].shape:
+            raise AssertionError(
+                "Gradient wrt '{}' has unexpected shape {}, expected {} "
+                .format(x_name, grad.shape, input_values[x_name].shape))
+
+        ngrad = np.zeros_like(grad)
+
+        wrong_positions = []
+
+        # compute partial derivatives for each position in this variable
+        for j in range(np.prod(grad.shape)):
+            # forward difference approximation
+            nder = derivative(x_name, j, delta)
+
+            # if the derivative is not equal to the analytical one, try to use more
+            # precise and expensive methods
+            if not compare_derivative(j, nder, grad):
+                # central difference approximation
+                nder = (derivative(x_name, j, -delta) + nder)/2
+
+                if not compare_derivative(j, nder, grad):
+                    # central difference approximation using h = delta/2
+                    cnder2 = (derivative(x_name, j, delta/2) + derivative(x_name, j, -delta/2))/2
+                    # five-point derivative
+                    nder = (4*cnder2 - nder)/3
+
+            # if the derivatives still don't match, add this position to the
+            # list of wrong positions
+            if not compare_derivative(j, nder, grad):
+                wrong_positions.append(np.unravel_index(j, grad.shape))
+
+            ngrad.reshape(-1)[j] = nder
+
+        wrong_percentage = int(100*len(wrong_positions)/np.prod(grad.shape))
+
+        dist = np.sqrt(np.sum((ngrad - grad)**2))
+        grad_norm = np.sqrt(np.sum(ngrad**2))
+
+        if not (np.isfinite(dist) and np.isfinite(grad_norm)):
+            raise ValueError(
+                "NaN or infinity detected during numerical gradient checking wrt '{}'\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                .format(x_name, grad, ngrad))
+
+        # we multiply atol by this number to make it more universal for different sizes
+        sqrt_n = np.sqrt(float(np.prod(grad.shape)))
+
+        if dist > atol*sqrt_n + rtol*grad_norm:
+            raise AssertionError(
+                "Analytical and numerical grads wrt '{}' differ too much\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                "{}% of elements differ, first 10 of wrong positions: {}\n"
+                "distance > atol*sqrt(n) + rtol*grad_norm\n"
+                "distance {} > {}*{} + {}*{}"
+                .format(x_name, grad, ngrad, wrong_percentage, wrong_positions[:10],
+                        dist, atol, sqrt_n, rtol, grad_norm))
+
+        max_diff = np.max(np.abs(ngrad - grad))
+        avg_diff = np.mean(np.abs(ngrad - grad))
+        logging.info("Numerical grad test wrt '%s' of shape %s passes, "
+                     "dist = %f, max_diff = %f, avg_diff = %f",
+                     x_name, grad.shape, dist, max_diff, avg_diff)
diff --git a/tests/python/unittest/test_testing.py b/tests/python/unittest/test_testing.py
new file mode 100644
index 000000000000..852bf2ce7e11
--- /dev/null
+++ b/tests/python/unittest/test_testing.py
@@ -0,0 +1,95 @@
+import numpy as np
+import tvm
+from tvm.testing import check_numerical_grads
+
+def test_check_numerical_grads():
+    # Functions and their derivatives
+    functions = [
+        lambda x: (x*x*x, 3*x*x),
+        lambda x: (x*x, 2*x),
+        lambda x: (np.abs(x), np.sign(x)),
+        lambda x: (np.log(np.abs(x)), 1/x),
+        lambda x: (np.sqrt(np.abs(x)), np.sign(x)/(2*np.sqrt(np.abs(x)))),
+        lambda x: (1/x, -1/(x*x)),
+        lambda x: (np.sign(np.sin(1/x)), np.zeros_like(x)),
+        lambda x: (x*np.sin(1/x), np.sin(1/x) - np.cos(1/x)/x),
+        lambda x: (np.sin(1/x), - np.cos(1/x)/(x*x)),
+    ]
+
+    # Avoid values too close to 0 since singularities of our functions are there
+    min_x = 0.5
+
+    for func in functions:
+        x_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+        # We need a function returning a scalar, so sum the results
+        func_forw = lambda x: np.sum(func(x)[0])
+        grads = [func(x_input)[1]]
+
+        check_numerical_grads(func_forw, [x_input], grads)
+
+    # Check functions with multiple arguments
+    for f1 in functions:
+        for f2 in functions:
+            x_input = np.random.uniform(min_x, 10, size=(3, 4))
+            y_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = [f1(x_input)[1], f2(y_input)[1]]
+
+            check_numerical_grads(func_forw, [x_input, y_input], grads)
+
+            # Same thing but with keyword arguments
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = {'x': f1(x_input)[1], 'y': f2(y_input)[1]}
+
+            check_numerical_grads(func_forw, {'x': x_input, 'y': y_input}, grads)
+
+    def _noise1(x, atol=1e-2, rtol=0.1):
+        # We go in random direction using twice the original tolerance to be sure this
+        # results in an error
+        sqrt_n = np.sqrt(float(np.prod(x.shape)))
+        tol = 2*(np.linalg.norm(x)*rtol + atol*sqrt_n)
+        noise = np.random.normal(size=x.shape)
+        noise = tol * noise / np.linalg.norm(noise)
+        return x + noise
+
+    def _noise2(x, atol=1e-2, rtol=0.1):
+        # This noise affects just a single component
+        sqrt_n = np.sqrt(float(np.prod(x.shape)))
+        tol = 2*(np.linalg.norm(x)*rtol + atol*sqrt_n)
+        n = np.random.randint(np.prod(x.shape))
+        noise = np.zeros_like(x)
+        noise.reshape(-1)[n] = tol
+        return x + noise
+
+    # Add noise to gradients and check that the function throws
+    for f1 in functions:
+        for f2 in functions:
+            x_input = np.random.uniform(min_x, 10, size=(3, 4))
+            y_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = [_noise1(f1(x_input)[1]), _noise1(f2(y_input)[1])]
+
+            try:
+                check_numerical_grads(func_forw, [x_input, y_input], grads)
+            except AssertionError as e:
+                pass
+            else:
+                raise AssertionError("check_numerical_grads didn't raise an exception")
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = {'x': _noise2(f1(x_input)[1]), 'y': _noise2(f2(y_input)[1])}
+
+            try:
+                check_numerical_grads(func_forw, {'x': x_input, 'y': y_input}, grads)
+            except AssertionError as e:
+                pass
+            else:
+                raise AssertionError("check_numerical_grads didn't raise an exception")
+
+
+if __name__ == "__main__":
+    test_check_numerical_grads()
+

From 6feb5b84f5edcee7faf18129278897c48da0101f Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 20 Dec 2018 13:58:15 -0800
Subject: [PATCH 510/529] [relay][op] multibox_transform_loc (#2315)

---
 include/tvm/relay/attrs/vision.h       | 18 +++++++
 python/tvm/relay/op/vision/multibox.py | 36 +++++++++++++
 src/relay/op/vision/multibox_op.cc     | 75 +++++++++++++++++++++++++-
 tests/python/relay/test_op_level5.py   | 56 +++++++++++++++++++
 4 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
index 5408582c8356..b736bd9c06a0 100644
--- a/include/tvm/relay/attrs/vision.h
+++ b/include/tvm/relay/attrs/vision.h
@@ -40,6 +40,24 @@ struct MultiBoxPriorAttrs : public tvm::AttrsNode<MultiBoxPriorAttrs> {
   }
 };
 
+struct MultiBoxTransformLocAttrs
+    : public tvm::AttrsNode<MultiBoxTransformLocAttrs> {
+  bool clip;
+  double threshold;
+  Array<IndexExpr> variances;
+
+  TVM_DECLARE_ATTRS(MultiBoxTransformLocAttrs,
+                    "relay.attrs.MultiBoxTransformLocAttrs") {
+    TVM_ATTR_FIELD(clip).set_default(true)
+      .describe("Clip out-of-boundary boxes.");
+    TVM_ATTR_FIELD(threshold).set_default(0.01)
+      .describe("Threshold to be a positive prediction.");
+    TVM_ATTR_FIELD(variances)
+      .set_default(Array<IndexExpr>({0.1f, 0.1f , 0.2f, 0.2f}))
+      .describe("Variances to be decoded from box regression output.");
+  }
+};
+
 /*! \brief Attributes used in non_maximum_suppression operators */
 struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{
   double overlap_threshold;
diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py
index 9b7483eec5ab..b04610aaa080 100644
--- a/python/tvm/relay/op/vision/multibox.py
+++ b/python/tvm/relay/op/vision/multibox.py
@@ -36,3 +36,39 @@ def multibox_prior(data,
         3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
     """
     return _make.multibox_prior(data, sizes, ratios, steps, offsets, clip)
+
+
+def multibox_transform_loc(cls_prob,
+                           loc_pred,
+                           anchor,
+                           clip=True,
+                           threshold=0.01,
+                           variance=(0.1, 0.1, 0.2, 0.2)):
+    """Location transformation for multibox detection
+
+    Parameters
+    ----------
+    cls_prob : tvm.relay.Expr
+        Class probabilities.
+
+    loc_pred : tvm.relay.Expr
+        Location regression predictions.
+
+    anchor : tvm.relay.Expr
+        Prior anchor boxes.
+
+    clip : boolean, optional
+        Whether to clip out-of-boundary boxes.
+
+    threshold : double, optional
+        Threshold to be a positive prediction.
+
+    variance : Tuple of float, optional
+        Variances to be decoded from box regression output.
+
+    Returns
+    -------
+    ret : tuple of tvm.relay.Expr
+    """
+    return _make.multibox_transform_loc(cls_prob, loc_pred, anchor, clip,
+                                        threshold, variance)
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
index e347e544e4f9..55db8862e849 100644
--- a/src/relay/op/vision/multibox_op.cc
+++ b/src/relay/op/vision/multibox_op.cc
@@ -1,5 +1,5 @@
 /*!
- *  Copyright (c) 2018 by Contributors
+ * Copyright (c) 2018 by Contributors
  * \file multibox_op.cc
  * \brief Multibox related operators
  */
@@ -68,5 +68,78 @@ RELAY_REGISTER_OP("vision.multibox_prior")
 .set_support_level(5)
 .add_type_rel("MultiBoxPrior", MultiboxPriorRel);
 
+TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs);
+
+bool MultiBoxTransformLocRel(const Array<Type>& types, int num_inputs,
+                             const Attrs& attrs, const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 4);
+
+  const auto* cls_prob = types[0].as<TensorTypeNode>();
+  const auto* loc_pred = types[1].as<TensorTypeNode>();
+  const auto* anchor = types[2].as<TensorTypeNode>();
+  CHECK(cls_prob != nullptr && loc_pred != nullptr && anchor != nullptr);
+
+  const auto& cls_shape = cls_prob->shape;
+  const auto& loc_shape = loc_pred->shape;
+  const auto& anchor_shape = anchor->shape;
+
+  CHECK_EQ(cls_shape.size(), 3U)
+      << "The dimension of class probability should be 3, but received "
+      << cls_shape.size();
+  CHECK_EQ(loc_shape.size(), 2U)
+      << "The dimension of location prediction should be 2, but received "
+      << loc_shape.size();
+  CHECK_EQ(anchor_shape.size(), 3U)
+      << "The dimension of anchor should be 3, but received "
+      << anchor_shape.size();
+
+  CHECK(reporter->AssertEQ(cls_shape[2], anchor_shape[1]))
+      << "Number of anchors mismatch found";
+  CHECK(reporter->AssertEQ(cls_shape[2] * 4, loc_shape[1]))
+      << "# anchors mismatch with # loc.";
+  CHECK(reporter->Assert(anchor_shape[1] > 0)) << "Number of anchors must > 0.";
+  CHECK(reporter->AssertEQ(anchor_shape[2], 4));
+
+  std::vector<IndexExpr> oshape0({cls_shape[0], anchor_shape[1], 6});
+  std::vector<IndexExpr> oshape1({cls_shape[0]});
+  std::vector<Type> fields;
+  fields.push_back(TensorTypeNode::make(oshape0, cls_prob->dtype));
+  fields.push_back(TensorTypeNode::make(oshape1, Int(32)));
+
+  // assign output type
+  reporter->Assign(types[3], TupleTypeNode::make(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeMultiBoxTransformLoc(Expr cls_prob,
+                              Expr loc_pred,
+                              Expr anchor,
+                              bool clip,
+                              double threshold,
+                              Array<IndexExpr> variances) {
+  auto attrs = make_node<MultiBoxTransformLocAttrs>();
+  attrs->clip = std::move(clip);
+  attrs->threshold = std::move(threshold);
+  attrs->variances = std::move(variances);
+  static const Op& op = Op::Get("vision.multibox_transform_loc");
+  return CallNode::make(op, {cls_prob, loc_pred, anchor}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.vision._make.multibox_transform_loc")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 6>(MakeMultiBoxTransformLoc, args, rv);
+});
+
+RELAY_REGISTER_OP("vision.multibox_transform_loc")
+.describe(R"doc("Location transformation for multibox detection."
+)doc" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MultiBoxTransformLocAttrs")
+.set_num_inputs(3)
+.add_argument("cls_prob", "Tensor", "Class probabilities.")
+.add_argument("loc_pred", "Tensor", "Location regression predictions.")
+.add_argument("anchor", "Tensor", "Multibox prior anchor boxes")
+.add_type_rel("MultiBoxTransformLoc", MultiBoxTransformLocRel)
+.set_support_level(5);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 77e3f005dade..6bd331b98120 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -102,8 +102,64 @@ def test_nms():
         (n, num_anchors, 6), "float32")
 
 
+def test_multibox_transform_loc():
+    def test_default_value():
+        num_anchors = 5
+        num_classes = 5
+
+        cls_prob = relay.var(
+            "cls_prob",
+            relay.ty.TensorType((1, num_anchors, num_classes), "float32"))
+        loc_pred = relay.var(
+            "loc_pred", relay.ty.TensorType((1, num_anchors * 4), "float32"))
+        anchors = relay.var(
+            "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
+
+        ret = relay.vision.multibox_transform_loc(
+            cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors)
+        ret = relay.ir_pass.infer_type(ret)
+        ref_type = relay.ty.TupleType(
+            tvm.convert([
+                relay.ty.TensorType((1, num_anchors, 6), "float32"),
+                relay.ty.TensorType((1, ), "int")
+            ]))
+        assert ret.checked_type == ref_type
+
+    def test_threshold():
+        num_anchors = 5
+        num_classes = 5
+        n = tvm.var("n")
+        cls_prob = relay.var(
+            "cls_prob",
+            relay.ty.TensorType((n, num_anchors, num_classes), "float32"))
+        loc_pred = relay.var(
+            "loc_pred", relay.ty.TensorType((n, num_anchors * 4), "float32"))
+        anchors = relay.var(
+            "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
+        threshold = 0.02
+        variance = (0.2, 0.2, 0.3, 0.3)
+
+        ret = relay.vision.multibox_transform_loc(
+            cls_prob=cls_prob,
+            loc_pred=loc_pred,
+            anchor=anchors,
+            threshold=threshold,
+            variance=variance)
+        ret = relay.ir_pass.infer_type(ret)
+        ref_type = relay.ty.TupleType(
+            tvm.convert([
+                relay.ty.TensorType((n, num_anchors, 6), "float32"),
+                relay.ty.TensorType((n, ), "int")
+            ]))
+        assert ret.checked_type == ref_type
+
+    test_default_value()
+    test_threshold()
+
+
 if __name__ == "__main__":
     test_resize_infer_type()
     test_resize()
     test_multibox_prior()
+    test_multibox_transform_loc()
     test_nms()

From e54408d4ea01cc8fda17fca961589dd0ddce1d6d Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 20 Dec 2018 14:40:55 -0800
Subject: [PATCH 511/529] [COMMUNITY] @eqy -> Committer (#2311)

* Add Eddie to committer

* Fix order
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 788121ba82e1..964b700392b0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -22,6 +22,7 @@ We do encourage everyone to work anything they are interested in.
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
 - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
 - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
+- [Eddie Yan](https://github.com/eqy): @eqy - runtime, autotvm, rpc, topi
 - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers

From e78e4326207f99e6270d58cb1f81af488269d256 Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 20 Dec 2018 14:43:33 -0800
Subject: [PATCH 512/529] [Relay][Frontend] Add MXNet test example for relay
 (#2316)

* Add MXNet test example for relay
* Fix a bug in BiasAddSimplifier
---
 src/relay/pass/canonicalize_ops.cc            |   2 +-
 .../frontend/mxnet/model_zoo/__init__.py      |  46 ++++
 .../relay/frontend/mxnet/model_zoo/dcgan.py   |  66 ++++++
 .../relay/frontend/mxnet/model_zoo/dqn.py     |  27 +++
 .../frontend/mxnet/model_zoo/inception_v3.py  | 170 +++++++++++++++
 .../relay/frontend/mxnet/model_zoo/mlp.py     |  40 ++++
 .../relay/frontend/mxnet/model_zoo/resnet.py  | 199 +++++++++++++++++
 .../frontend/mxnet/model_zoo/squeezenet.py    |  76 +++++++
 .../relay/frontend/mxnet/model_zoo/vgg.py     |  85 ++++++++
 .../relay/frontend/mxnet/test_forward.py      | 206 ++++++++++++++++++
 .../python/relay/frontend/mxnet/test_graph.py |  87 ++++++++
 11 files changed, 1003 insertions(+), 1 deletion(-)
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/__init__.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/dcgan.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/dqn.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/mlp.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/resnet.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py
 create mode 100644 tests/python/relay/frontend/mxnet/model_zoo/vgg.py
 create mode 100644 tests/python/relay/frontend/mxnet/test_forward.py
 create mode 100644 tests/python/relay/frontend/mxnet/test_graph.py

diff --git a/src/relay/pass/canonicalize_ops.cc b/src/relay/pass/canonicalize_ops.cc
index 77cd59e2afd8..4482dc3954ab 100644
--- a/src/relay/pass/canonicalize_ops.cc
+++ b/src/relay/pass/canonicalize_ops.cc
@@ -22,7 +22,7 @@ class BiasAddSimplifier : public ExprMutator {
       CHECK_EQ(call->args.size(), 2);
       const BiasAddAttrs* param = call->attrs.as<BiasAddAttrs>();
 
-      auto ttype = call->args[0]->type_as<TensorTypeNode>();
+      auto ttype = n->args[0]->type_as<TensorTypeNode>();
       size_t n_dim = ttype->shape.size();
       Expr expanded_bias = ExpandBiasToMatchAxis(call->args[1], n_dim, {param->axis});
       Expr ret = Add(call->args[0], expanded_bias);
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/__init__.py b/tests/python/relay/frontend/mxnet/model_zoo/__init__.py
new file mode 100644
index 000000000000..1c796f7810b7
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/__init__.py
@@ -0,0 +1,46 @@
+"""MXNet and Relay model zoo."""
+from __future__ import absolute_import
+from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3
+import tvm.relay.testing
+
+_num_class = 1000
+_batch = 2
+
+# mlp fc
+mx_mlp = mlp.get_symbol(_num_class)
+relay_mlp = tvm.relay.testing.mlp.get_workload(_batch, _num_class)[0]
+
+# vgg fc
+mx_vgg = {}
+relay_vgg = {}
+for num_layers in [11, 13, 16, 19]:
+    mx_vgg[num_layers] = vgg.get_symbol(_num_class, num_layers)
+    relay_vgg[num_layers] = tvm.relay.testing.vgg.get_workload(
+        _batch, _num_class, num_layers=num_layers)[0]
+
+# resnet fc
+mx_resnet = {}
+relay_resnet = {}
+for num_layers in [18, 34, 50, 101, 152, 200, 269]:
+    mx_resnet[num_layers] = resnet.get_symbol(_num_class, num_layers, '3,224,224')
+    relay_resnet[num_layers] = tvm.relay.testing.resnet.get_workload(
+        _batch, _num_class, num_layers=num_layers)[0]
+
+# squeezenet
+mx_squeezenet = {}
+relay_squeezenet = {}
+for version in ['1.0', '1.1']:
+    mx_squeezenet[version] = squeezenet.get_symbol(version=version)
+    relay_squeezenet[version] = tvm.relay.testing.squeezenet.get_workload(_batch, version=version)[0]
+
+# inception
+mx_inception_v3 = inception_v3.get_symbol()
+relay_inception_v3 = tvm.relay.testing.inception_v3.get_workload(_batch)[0]
+
+# dqn
+mx_dqn = dqn.get_symbol()
+relay_dqn = tvm.relay.testing.dqn.get_workload(_batch)[0]
+
+# dcgan generator
+mx_dcgan = dcgan.get_symbol()
+relay_dcgan = tvm.relay.testing.dcgan.get_workload(_batch)[0]
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py b/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py
new file mode 100644
index 000000000000..8af030b6b184
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py
@@ -0,0 +1,66 @@
+# pylint: disable=unused-argument
+"""
+The MXNet symbol of DCGAN generator
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+
+import mxnet as mx
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = mx.sym.Deconvolution(data,
+                               kernel=kshape,
+                               stride=stride,
+                               pad=(pad_y, pad_x),
+                               adj=(adj_y, adj_x),
+                               num_filter=oshape[0],
+                               no_bias=True,
+                               name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = mx.sym.BatchNorm(net, eps=eps, name="%s_bn" % prefix)
+    net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
+    return net
+
+def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
+    """get symbol of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = mx.sym.Variable("data") if code is None else code
+    net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
+    net = mx.sym.Activation(net, act_type='relu')
+    # 4 x 4
+    net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    net = deconv2d(
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    net = mx.sym.Activation(net, act_type='tanh')
+    return net
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/dqn.py b/tests/python/relay/frontend/mxnet/model_zoo/dqn.py
new file mode 100644
index 000000000000..e037511efdf2
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/dqn.py
@@ -0,0 +1,27 @@
+"""
+The mxnet symbol of Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al.
+"Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+import mxnet as mx
+
+def get_symbol(num_action=18):
+    data = mx.sym.Variable(name='data')
+    net = mx.sym.Convolution(data, kernel=(8, 8), stride=(4, 4),
+                             num_filter=32, name='conv1')
+    net = mx.sym.Activation(net, act_type='relu', name='relu1')
+    net = mx.sym.Convolution(net, kernel=(4, 4), stride=(2, 2),
+                             num_filter=64, name='conv2')
+    net = mx.sym.Activation(net, act_type='relu', name='relu2')
+    net = mx.sym.Convolution(net, kernel=(3, 3), stride=(1, 1),
+                             num_filter=64, name='conv3')
+    net = mx.sym.Activation(net, act_type='relu', name='relu3')
+    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc4')
+    net = mx.sym.Activation(net, act_type='relu', name='relu4')
+    net = mx.sym.FullyConnected(net, num_hidden=num_action, name='fc5', flatten=False)
+
+    return net
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py b/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/mlp.py b/tests/python/relay/frontend/mxnet/model_zoo/mlp.py
new file mode 100644
index 000000000000..922b208749bf
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/mlp.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+a simple multilayer perceptron
+"""
+import mxnet as mx
+
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.sym.Flatten(data=data)
+    try:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128, flatten=False)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, flatten=False)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes, flatten=False)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    except:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    return mlp
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/resnet.py b/tests/python/relay/frontend/mxnet/model_zoo/resnet.py
new file mode 100644
index 000000000000..3f9a870d31c0
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/resnet.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+'''
+import mxnet as mx
+import numpy as np
+
+def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False):
+    """Return ResNet Unit symbol for building ResNet
+    Parameters
+    ----------
+    data : str
+        Input data
+    num_filter : int
+        Number of output channels
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+    stride : tuple
+        Stride used in convolution
+    dim_match : Boolean
+        True means channel number between input and output is the same, otherwise means differ
+    name : str
+        Base name of the operators
+    workspace : int
+        Workspace used in convolution operator
+    """
+    if bottle_neck:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
+        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
+        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
+                                   workspace=workspace, name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv3 + shortcut
+    else:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv2 + shortcut
+
+def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False):
+    """Return ResNet symbol of
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+    num_stages : int
+        Number of stage
+    filter_list : list
+        Channel size of each stage
+    num_classes : int
+        Ouput size of symbol
+    dataset : str
+        Dataset type, only cifar10 and imagenet supports
+    workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision (float32 or float16)
+    """
+    num_unit = len(units)
+    assert(num_unit == num_stages)
+    data = mx.sym.Variable(name='data')
+    if dtype == 'float32':
+        # data = mx.sym.identity(data=data, name='id')
+        data = data
+    else:
+        if dtype == 'float16':
+            data = mx.sym.Cast(data=data, dtype=np.float16)
+    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
+    (nchannel, height, width) = image_shape
+    if height <= 32:            # such as cifar10
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
+                                  no_bias=True, name="conv0", workspace=workspace)
+    else:                       # often expected to be 224 such as imagenet
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
+                                  no_bias=True, name="conv0", workspace=workspace)
+        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
+        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
+        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max')
+
+    for i in range(num_stages):
+        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
+                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace,
+                             memonger=memonger)
+        for j in range(units[i]-1):
+            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
+                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
+    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
+    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
+    flat = mx.sym.Flatten(data=pool1)
+    try:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', flatten=False)
+    except:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
+    if dtype == 'float16':
+        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
+    return mx.sym.softmax(data=fc1, name='softmax')
+
+def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    image_shape = [int(l) for l in image_shape.split(',')]
+    (nchannel, height, width) = image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+
+    return resnet(units       = units,
+                  num_stages  = num_stages,
+                  filter_list = filter_list,
+                  num_classes = num_classes,
+                  image_shape = image_shape,
+                  bottle_neck = bottle_neck,
+                  workspace   = conv_workspace,
+                  dtype       = dtype)
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py b/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..deb896a21385
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py
@@ -0,0 +1,76 @@
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+import mxnet as mx
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = mx.sym.concat(left, right, dim=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = mx.sym.Convolution(net, num_filter=channels, kernel=(kernel_size, kernel_size),
+                             pad=(padding, padding))
+    net = mx.sym.Activation(net, act_type='relu')
+    return net
+
+# Net
+def get_symbol(num_classes=1000, version='1.0', **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    net = mx.sym.Variable("data")
+    if version == '1.0':
+        net = mx.sym.Convolution(net, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 64, 256, 256)
+    else:
+        net = mx.sym.Convolution(net, num_filter=64, kernel=(3, 3), stride=(2, 2), pad=(1, 1))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256)
+    net = mx.sym.Dropout(net, p=0.5)
+    net = mx.sym.Convolution(net, num_filter=num_classes, kernel=(1, 1))
+    net = mx.sym.Activation(net, act_type='relu')
+    net = mx.sym.Pooling(data=net, global_pool=True, kernel=(13, 13), pool_type='avg')
+    net = mx.sym.flatten(net)
+    return mx.sym.softmax(net)
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/vgg.py b/tests/python/relay/frontend/mxnet/model_zoo/vgg.py
new file mode 100644
index 000000000000..68215bb80aaa
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/vgg.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+
+import mxnet as mx
+import numpy as np
+
+def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs):
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1))
+            if batch_norm:
+                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1))
+        internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1))
+    return internel_layer
+
+def get_classifier(input_data, num_classes, **kwargs):
+    flatten = mx.sym.Flatten(data=input_data, name="flatten")
+    try:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6", flatten=False)
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7", flatten=False)
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8", flatten=False)
+    except:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6")
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7")
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8")
+    return fc8
+
+def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs):
+    """
+    Parameters
+    ----------
+    num_classes : int, default 1000
+        Number of classification classes.
+    num_layers : int
+        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+    batch_norm : bool, default False
+        Use batch normalization.
+    dtype: str, float32 or float16
+        Data precision.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data = mx.sym.Variable(name="data")
+    if dtype == 'float16':
+        data = mx.sym.Cast(data=data, dtype=np.float16)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    if dtype == 'float16':
+        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)
+    symbol = mx.sym.softmax(data=classifier, name='softmax')
+    return symbol
diff --git a/tests/python/relay/frontend/mxnet/test_forward.py b/tests/python/relay/frontend/mxnet/test_forward.py
new file mode 100644
index 000000000000..fcc760981ef5
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/test_forward.py
@@ -0,0 +1,206 @@
+import numpy as np
+
+import topi
+import tvm
+from tvm.contrib import graph_runtime
+from tvm import relay
+from tvm.relay.testing.config import ctx_list
+import mxnet as mx
+from mxnet import gluon
+from mxnet.gluon.model_zoo import vision
+import model_zoo
+
+
+def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), out_shape=(1, 1000),
+                               gluon_impl=False, name=None, dtype='float32'):
+    """Use name different from test to avoid let nose pick it up"""
+    if gluon_impl:
+        def get_gluon_output(name, x):
+            net = vision.get_model(name)
+            net.collect_params().initialize(mx.init.Xavier())
+            net_sym = gluon.nn.SymbolBlock(outputs=net(mx.sym.var('data')),
+                                           inputs=mx.sym.var('data'),
+                                           params=net.collect_params())
+            out = net_sym(mx.nd.array(x.astype(dtype))).asnumpy()
+            return out, net_sym
+    else:
+        def get_mxnet_output(symbol, x, dtype='float32'):
+            from collections import namedtuple
+            Batch = namedtuple('Batch', ['data'])
+            mod = mx.mod.Module(symbol, label_names=None)
+            mod.bind(data_shapes=[('data', x.shape)], for_training=False)
+            mod.init_params()
+            mod.forward(Batch([mx.nd.array(x.astype(dtype))]))
+            out = mod.get_outputs()[0].asnumpy()
+            args, auxs = mod.get_params()
+            return out, args, auxs
+
+    def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
+        dshape = x.shape
+        shape_dict = {'data': dshape}
+        if gluon_impl:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict)
+        else:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict, arg_params=args, aux_params=auxs)
+
+        with relay.build_config(opt_level=3):
+            graph, lib, params = relay.build(new_sym, target, params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        # set inputs
+        m.set_input("data", tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+        # get outputs
+        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
+        return out.asnumpy()
+
+    # random input
+    x = np.random.uniform(size=data_shape)
+    if gluon_impl:
+        gluon_out, gluon_sym = get_gluon_output(name, x)
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+    else:
+        mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
+        assert "data" not in args
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mlp():
+    mlp = model_zoo.mx_mlp
+    verify_mxnet_frontend_impl(mlp)
+
+def test_forward_vgg():
+    for n in [11]:
+        mx_sym = model_zoo.mx_vgg[n]
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_resnet():
+    for n in [18]:
+        mx_sym = model_zoo.mx_resnet[n]
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_elu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='elu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_rrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_prelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='prelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_softrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.Activation(data, act_type='softrelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_fc_flatten():
+    # test flatten=True option in mxnet 0.11.1
+    data = mx.sym.var('data')
+    try:
+        mx_sym = mx.sym.FullyConnected(data, num_hidden=100, flatten=True)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+        mx_sym = mx.sym.FullyConnected(mx.sym.Flatten(data), num_hidden=100, flatten=False)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+    except:
+        pass
+
+def test_forward_clip():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicity
+    mx_sym = mx.sym.clip(data, a_min=0, a_max=1)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_split():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1))
+
+def test_forward_split_squeeze():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1))
+
+def test_forward_expand_dims():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.expand_dims(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4))
+
+def test_forward_pooling():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+def test_forward_lrn():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
+    verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
+
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+    
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
+    
+if __name__ == '__main__':
+    test_forward_mlp()
+    test_forward_vgg()
+    test_forward_resnet()
+    test_forward_elu()
+    test_forward_rrelu()
+    test_forward_prelu()
+    test_forward_softrelu()
+    test_forward_fc_flatten()
+    test_forward_clip()
+    test_forward_split()
+    test_forward_split_squeeze()
+    test_forward_expand_dims()
+    test_forward_pooling()
+    test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
diff --git a/tests/python/relay/frontend/mxnet/test_graph.py b/tests/python/relay/frontend/mxnet/test_graph.py
new file mode 100644
index 000000000000..820e78242808
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/test_graph.py
@@ -0,0 +1,87 @@
+import mxnet as mx
+import tvm
+from tvm import relay
+import model_zoo
+from model_zoo import _batch
+
+def test_mlp():
+    mx_sym = model_zoo.mx_mlp
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 1, 28, 28)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_mlp
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_vgg():
+    for n in [11, 13, 16, 19]:
+        mx_sym = model_zoo.mx_vgg[n]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_vgg[n]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_resnet():
+    for n in [18, 34, 50, 101, 152, 200, 269]:
+        mx_sym = model_zoo.mx_resnet[n]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_resnet[n]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_squeezenet():
+    for version in ['1.0', '1.1']:
+        mx_sym = model_zoo.mx_squeezenet[version]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_squeezenet[version]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_inception_v3():
+    mx_sym = model_zoo.mx_inception_v3
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 299, 299)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_inception_v3
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_dqn():
+    mx_sym = model_zoo.mx_dqn
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 4, 84, 84)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_dqn
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_dcgan():
+    mx_sym = model_zoo.mx_dcgan
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 100)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_dcgan
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_multi_outputs():
+    def compose_mxnet(**kwargs):
+        x = mx.sym.Variable('x')
+        y = mx.sym.Variable('y')
+        z = mx.sym.split(x, **kwargs)
+        return mx.sym.broadcast_sub(mx.sym.broadcast_add(z[0], z[2]), y)
+    def compose_relay(**kwargs):
+        x = relay.var("x", shape=(_batch, 3, 224, 224))
+        y = relay.var("y", shape=(1,))
+        z = relay.split(x, **kwargs)
+        ret = z[0] + z[2] - y
+        args = relay.ir_pass.free_vars(ret)
+        return relay.Function(args, ret)
+    mx_sym = compose_mxnet(num_outputs=3, axis=1)
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'x': (_batch, 3, 224, 224), 'y': (1,)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = compose_relay(indices_or_sections=3, axis=1)
+    relay_sym = relay.ir_pass.infer_type(relay_sym)
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+if __name__ == '__main__':
+    test_mlp()
+    test_vgg()
+    test_resnet()
+    test_squeezenet()
+    test_inception_v3()
+    test_dqn()
+    test_dcgan()
+    test_multi_outputs()

From 21e3a5daa0826a2ba588be9d13ad2828a4988704 Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Fri, 21 Dec 2018 09:44:26 -0800
Subject: [PATCH 513/529] [BUGFIX] Seg fault in memory planing for symbolic
 shape (#2317)

---
 src/relay/backend/graph_plan_memory.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 4a5aa4ea0a33..e17c7a6839ea 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -196,7 +196,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
  protected:
   using StorageAllocaBaseVisitor::VisitExpr_;
   // override create token by getting token as prototype requirements.
-  void CreateToken(const ExprNode* op, bool can_realloc)  final {
+  void CreateToken(const ExprNode* op, bool can_realloc) final {
     CHECK(!token_map_.count(op));
     auto it = prototype_.find(op);
     CHECK(it != prototype_.end());
@@ -253,12 +253,12 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     size_t size = 1;
     for (IndexExpr dim : ttype->shape) {
       const int64_t* pval = as_const_int(dim);
-      CHECK_GE(*pval, 0) <<
-        "can not allocate memory for tensor with negative shape" <<
-        *pval;
       CHECK(pval != nullptr)
           << "Cannot allocate memory symbolic tensor shape "
           << ttype->shape;
+      CHECK_GE(*pval, 0)
+          << "Cannot allocate memory for tensor with negative shape"
+          << *pval;
       size *= static_cast<size_t>(pval[0]);
     }
     size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);

From 8aee1721cab57f75015f8d20bb1a42bd538aede9 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Sat, 22 Dec 2018 09:42:48 -0800
Subject: [PATCH 514/529] Small refactors and bug fixes. (#2281)

---
 include/tvm/relay/expr.h                      |   7 +
 python/tvm/relay/__init__.py                  |   5 +
 .../relay/backend/graph_runtime_codegen.py    |   6 +-
 python/tvm/relay/expr.py                      | 130 +--------------
 python/tvm/relay/expr_functor.py              | 155 ++++++++++++++++++
 src/relay/backend/compile_engine.cc           |   8 +-
 src/relay/backend/interpreter.cc              |   9 +-
 src/relay/ir/expr.cc                          |   8 +-
 src/relay/pass/fuse_ops.cc                    |   4 +-
 9 files changed, 188 insertions(+), 144 deletions(-)
 create mode 100644 python/tvm/relay/expr_functor.py

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 37c91ffe4ed2..14b3cd91701c 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -248,6 +248,13 @@ class FunctionNode : public ExprNode {
    */
   TVM_DLL FuncType func_type_annotation() const;
 
+  /*!
+   * \brief Check whether the function is a primitive function.
+   *
+   * \return Whether the function is primitive or not.
+   */
+  bool IsPrimitive() const;
+
   TVM_DLL static Function make(tvm::Array<Var> params,
                                Expr body,
                                Type ret_type,
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index b66132f27775..69180837b724 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -5,6 +5,7 @@
 from . import base
 from . import ty
 from . import expr
+from . import expr_functor
 from . import module
 from . import ir_pass
 from .build_module import build, build_config, create_executor
@@ -53,6 +54,10 @@
 If = expr.If
 TupleGetItem = expr.TupleGetItem
 
+# ExprFunctor
+ExprFunctor = expr_functor.ExprFunctor
+ExprMutator = expr_functor.ExprMutator
+
 # helper functions
 var = expr.var
 const = expr.const
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
index 0da9b81269aa..91d09973ea8f 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -24,7 +24,8 @@
 from . import _backend
 from . import compile_engine
 from ..op import Op
-from ..expr import Function, GlobalVar, ExprFunctor
+from ..expr import Function, GlobalVar
+from ..expr_functor import ExprFunctor
 from ..ty import TupleType, TensorType
 
 
@@ -251,6 +252,9 @@ def visit_call(self, call):
                          op_name, inputs, {})
         return self.add_node(op_node, call)
 
+    def visit_op(self, _):
+        raise Exception("can not compile op in non-eta expanded form")
+
     def _get_json(self):
         """
         Convert the sequence of nodes stored by the compiler into the
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 4725c0a7a07d..e0c1f68ad431 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -222,12 +222,13 @@ def __init__(self,
                  params,
                  body,
                  ret_type=None,
-                 type_params=None):
+                 type_params=None,
+                 attrs=None):
         if type_params is None:
             type_params = convert([])
 
         self.__init_handle_by_constructor__(
-            _make.Function, params, body, ret_type, type_params)
+            _make.Function, params, body, ret_type, type_params, attrs)
 
     def __call__(self, *args):
         """Invoke the gobal function.
@@ -343,131 +344,6 @@ def realize(self):
         return _expr.TempExprRealize(self)
 
 
-class ExprFunctor(object):
-    """
-    An abstract visitor defined over Expr.
-
-    Defines the default dispatch over expressions, and
-    implements memoization.
-    """
-    def __init__(self):
-        self.memo_map = {}
-
-    # pylint: disable=no-else-return
-    def visit(self, expr):
-        """Apply the visitor to an expression."""
-        found = self.memo_map.get(expr)
-        if found:
-            return found
-
-        if isinstance(expr, Function):
-            res = self.visit_function(expr)
-        elif isinstance(expr, Call):
-            res = self.visit_call(expr)
-        elif isinstance(expr, Let):
-            res = self.visit_let(expr)
-        elif isinstance(expr, Var):
-            res = self.visit_var(expr)
-        elif isinstance(expr, GlobalVar):
-            res = self.visit_global_var(expr)
-        elif isinstance(expr, If):
-            res = self.visit_if(expr)
-        elif isinstance(expr, Tuple):
-            res = self.visit_tuple(expr)
-        elif isinstance(expr, TupleGetItem):
-            res = self.visit_tuple_getitem(expr)
-        elif isinstance(expr, Constant):
-            res = self.visit_constant(expr)
-        else:
-            raise Exception("warning unhandled case: {0}".format(type(expr)))
-
-        self.memo_map[expr] = res
-        return res
-
-    def visit_function(self, _):
-        raise NotImplementedError()
-
-    def visit_let(self, _):
-        raise NotImplementedError()
-
-    def visit_call(self, _):
-        raise NotImplementedError()
-
-    def visit_var(self, _):
-        raise NotImplementedError()
-
-    def visit_type(self, typ):
-        return typ
-
-    def visit_if(self, _):
-        raise NotImplementedError()
-
-    def visit_tuple(self, _):
-        raise NotImplementedError()
-
-    def visit_tuple_getitem(self, _):
-        raise NotImplementedError()
-
-    def visit_constant(self, _):
-        raise NotImplementedError()
-
-    def visit_global_var(self, _):
-        raise NotImplementedError()
-
-
-class ExprMutator(ExprFunctor):
-    """
-    A functional visitor over Expr.
-
-    The default behavior recursively traverses the AST
-    and reconstructs the AST.
-    """
-    def visit_function(self, fn):
-        new_body = self.visit(fn.body)
-        return Function(
-            list(fn.params),
-            fn.ret_type, new_body,
-            fn.type_params)
-
-    def visit_let(self, let):
-        new_var = self.visit(let.var)
-        new_val = self.visit(let.value)
-        new_body = self.visit(let.body)
-        return Let(new_var, new_val, new_body)
-
-    def visit_call(self, call):
-        new_fn = self.visit(call.op)
-        new_args = [self.visit(arg) for arg in call.args]
-        return Call(new_fn, new_args, call.attrs)
-
-    def visit_var(self, rvar):
-        return rvar
-
-    def visit_global_id(self, global_var):
-        return global_var
-
-    def visit_if(self, ite):
-        return If(
-            self.visit(ite.guard),
-            self.visit(ite.true_b),
-            self.visit(ite.false_b))
-
-    def visit_tuple(self, tup):
-        return Tuple([self.visit(field) for field in tup.fields])
-
-    def visit_tuple_getitem(self, op):
-        tuple_value = self.visit(op.tuple_value)
-        if not tuple_value.same_as(op.tuple_value):
-            return TupleGetItem(tuple_value, op.index)
-        return op
-
-    def visit_global_var(self, gvar):
-        return gvar
-
-    def visit_constant(self, rconst):
-        return rconst
-
-
 class TupleWrapper(object):
     """TupleWrapper.
 
diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py
new file mode 100644
index 000000000000..eafe5f09309f
--- /dev/null
+++ b/python/tvm/relay/expr_functor.py
@@ -0,0 +1,155 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The expression functor of Relay."""
+
+from .expr import Function, Call, Let, Var, GlobalVar, If, Tuple, TupleGetItem, Constant
+from .op import Op
+
+class ExprFunctor:
+    """
+    An abstract visitor defined over Expr.
+
+    Defines the default dispatch over expressions, and
+    implements memoization.
+    """
+    def __init__(self):
+        self.memo_map = {}
+
+    # pylint: disable=no-else-return
+    def visit(self, expr):
+        """Apply the visitor to an expression."""
+        found = self.memo_map.get(expr)
+        if found:
+            return found
+
+        if isinstance(expr, Function):
+            res = self.visit_function(expr)
+        elif isinstance(expr, Call):
+            res = self.visit_call(expr)
+        elif isinstance(expr, Let):
+            res = self.visit_let(expr)
+        elif isinstance(expr, Var):
+            res = self.visit_var(expr)
+        elif isinstance(expr, GlobalVar):
+            res = self.visit_global_var(expr)
+        elif isinstance(expr, If):
+            res = self.visit_if(expr)
+        elif isinstance(expr, Tuple):
+            res = self.visit_tuple(expr)
+        elif isinstance(expr, TupleGetItem):
+            res = self.visit_tuple_getitem(expr)
+        elif isinstance(expr, Constant):
+            res = self.visit_constant(expr)
+        elif isinstance(expr, Op):
+            res = self.visit_op(expr)
+        else:
+            raise Exception("warning unhandled case: {0}".format(type(expr)))
+
+        self.memo_map[expr] = res
+
+        return res
+
+    def visit_function(self, _):
+        raise NotImplementedError()
+
+    def visit_let(self, _):
+        raise NotImplementedError()
+
+    def visit_call(self, _):
+        raise NotImplementedError()
+
+    def visit_var(self, _):
+        raise NotImplementedError()
+
+    def visit_type(self, typ):
+        return typ
+
+    def visit_if(self, _):
+        raise NotImplementedError()
+
+    def visit_tuple(self, _):
+        raise NotImplementedError()
+
+    def visit_tuple_getitem(self, _):
+        raise NotImplementedError()
+
+    def visit_global_var(self, _):
+        raise NotImplementedError()
+
+    def visit_op(self, _):
+        raise NotImplementedError()
+
+    def visit_constant(self, _):
+        raise NotImplementedError()
+
+
+class ExprMutator(ExprFunctor):
+    """
+    A functional visitor over Expr.
+
+    The default behavior recursively traverses the AST
+    and reconstructs the AST.
+    """
+    def visit_function(self, fn):
+        new_body = self.visit(fn.body)
+        return Function(
+            list(fn.params),
+            new_body,
+            fn.ret_type,
+            fn.type_params,
+            fn.attrs)
+
+    def visit_let(self, let):
+        new_var = self.visit(let.var)
+        new_val = self.visit(let.value)
+        new_body = self.visit(let.body)
+        return Let(new_var, new_val, new_body)
+
+    def visit_call(self, call):
+        new_fn = self.visit(call.op)
+        new_args = [self.visit(arg) for arg in call.args]
+        return Call(new_fn, new_args, call.attrs)
+
+    def visit_var(self, rvar):
+        return rvar
+
+    def visit_global_id(self, global_var):
+        return global_var
+
+    def visit_if(self, ite):
+        return If(
+            self.visit(ite.guard),
+            self.visit(ite.true_b),
+            self.visit(ite.false_b))
+
+    def visit_tuple(self, tup):
+        return Tuple([self.visit(field) for field in tup.fields])
+
+    def visit_tuple_getitem(self, op):
+        tuple_value = self.visit(op.tuple_value)
+        if not tuple_value.same_as(op.tuple_value):
+            return TupleGetItem(tuple_value, op.index)
+        return op
+
+    def visit_global_var(self, gvar):
+        return gvar
+
+    def visit_op(self, op):
+        return op
+
+    def visit_constant(self, const):
+        return const
+
+    def visit_constructor(self, con):
+        return con
+
+    def visit_match(self, m):
+        return Match(self.visit(m.data), [Clause(c.lhs, self.visit(c.rhs)) for c in m.pattern])
+
+    def visit_ref_new(self, r):
+        return RefNew(self.visit(r.value))
+
+    def visit_ref_write(self, r):
+        return RefWrite(self.visit(r.ref), self.visit(r.value))
+
+    def visit_ref_read(self, r):
+        return RefRead(self.visit(r.ref))
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index b8938bd34804..42394955cc64 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -157,14 +157,14 @@ class ScheduleGetter :
 
     int op_pattern = fpattern[op];
     if (op_pattern >= kCommReduce) {
-      CHECK(!master_op_.defined() || master_op_patetrn_ < kCommReduce)
+      CHECK(!master_op_.defined() || master_op_pattern_ < kCommReduce)
           << "Two complicated op in a primitive function "
           << " master=" << master_op_ << " current=" << op;
     }
-    if (op_pattern >= master_op_patetrn_) {
+    if (op_pattern >= master_op_pattern_) {
       master_op_ = op;
       master_attrs_ = call_node->attrs;
-      master_op_patetrn_ = op_pattern;
+      master_op_pattern_ = op_pattern;
     }
     if (outputs.size() != 1) {
       const auto* tuple_type =
@@ -213,7 +213,7 @@ class ScheduleGetter :
   tvm::Target target_;
   Op master_op_;
   Attrs master_attrs_;
-  int master_op_patetrn_{0};
+  int master_op_pattern_{0};
   std::ostringstream readable_name_stream_;
   std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
 };
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 5bef4a22f371..33d06e9c6c28 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -292,17 +292,10 @@ class Interpreter :
     }
   }
 
-  // Check if function is a primitive function.
-  bool IsPrimitive(const Function& func) const {
-    NodeRef res = FunctionGetAttr(func, "Primitive");
-    const ir::IntImm* pval = res.as<ir::IntImm>();
-    return pval && pval->value != 0;
-  }
-
   // Invoke the closure
   Value Invoke(const Closure& closure, const tvm::Array<Value>& args) {
     // Get a reference to the function inside the closure.
-    if (IsPrimitive(closure->func)) {
+    if (closure->func->IsPrimitive()) {
       return InvokePrimitiveOp(closure->func, args);
     }
     auto func = closure->func;
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
index 6f1260b05b99..cdb2a32a0009 100644
--- a/src/relay/ir/expr.cc
+++ b/src/relay/ir/expr.cc
@@ -135,6 +135,12 @@ FuncType FunctionNode::func_type_annotation() const {
   return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
 }
 
+bool FunctionNode::IsPrimitive() const {
+  NodeRef res = FunctionGetAttr(GetRef<Function>(this), "Primitive");
+  const ir::IntImm* pval = res.as<ir::IntImm>();
+  return pval && pval->value != 0;
+}
+
 NodeRef FunctionGetAttr(const Function& func, const std::string& key) {
   if (!func->attrs.defined()) { return NodeRef(); }
 
@@ -172,7 +178,7 @@ TVM_REGISTER_NODE_TYPE(FunctionNode);
 
 TVM_REGISTER_API("relay._make.Function")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = FunctionNode::make(args[0], args[1], args[2], args[3]);
+  *ret = FunctionNode::make(args[0], args[1], args[2], args[3], args[4]);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
index 79ea3e22b139..b2b35c51a1ca 100644
--- a/src/relay/pass/fuse_ops.cc
+++ b/src/relay/pass/fuse_ops.cc
@@ -699,9 +699,7 @@ class FuseMutator : private ExprMutator {
   std::unordered_map<GraphPartitioner::Group*, GroupInfo> ginfo_;
   // Skip primitive function.
   Expr VisitExpr_(const FunctionNode* fn_node) {
-    NodeRef res = FunctionGetAttr(GetRef<Function>(fn_node), "Primitive");
-    const ir::IntImm* pval = res.as<ir::IntImm>();
-    if (pval && pval->value != 0) {
+    if (fn_node->IsPrimitive()) {
       return GetRef<Expr>(fn_node);
     } else {
       return ExprMutator::VisitExpr_(fn_node);

From bb9e18480e0188f2173706e2337f5e9c4628f12c Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Tue, 25 Dec 2018 01:37:55 +0800
Subject: [PATCH 515/529] [NNVM] Fix dtype of output of pad. (#2331)

Dtype of output of pad should follows input, but if dtype of input is not float,
  output will still be float becase pad_value is float.
---
 nnvm/src/top/nn/nn.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index 09dfbb211f00..e301f167ff1d 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -620,7 +620,8 @@ NNVM_REGISTER_OP(pad)
     for (size_t i = 0; i < pad_width.ndim(); ++i) {
       pad_after.push_back(tvm::make_const(tvm::Int(32), pad_width[i][1]));
     }
-    return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after, param.pad_value) };
+    return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after,
+                          tvm::make_const(inputs[0]->dtype, param.pad_value)) };
 })
 .set_support_level(1);
 

From e12f310e0172c0251d37495f648f703c91b3e149 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 25 Dec 2018 04:05:29 +0900
Subject: [PATCH 516/529] [ROCM] Make sure all bit code files exist (#2323)

---
 python/tvm/contrib/rocm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index 10cfaed83e68..172d081ff96a 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -1,6 +1,6 @@
 """Utility for ROCm backend"""
 import subprocess
-from os.path import join
+from os.path import join, exists
 from . import util
 from .._ffi.base import py_str
 from ..api import register_func, convert
@@ -79,4 +79,5 @@ def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"):
         "oclc_unsafe_math_off.amdgcn.bc",
         "oclc_unsafe_math_on.amdgcn.bc"
     ]
-    return convert([join(rocdl_dir, bitcode) for bitcode in bitcode_files])
+    paths = [join(rocdl_dir, bitcode) for bitcode in bitcode_files]
+    return convert([path for path in paths if exists(path)])

From a2e77a838630736177966443c2f86d6d32f3ee8e Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 24 Dec 2018 14:15:32 -0500
Subject: [PATCH 517/529] [Relay][docs] Details on comp. graphs in Relay dev
 intro (#2324)

---
 docs/dev/relay_intro.rst | 378 ++++++++++++++++++++-------------------
 1 file changed, 190 insertions(+), 188 deletions(-)

diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst
index dde900a502c1..2462d0d3ecc2 100644
--- a/docs/dev/relay_intro.rst
+++ b/docs/dev/relay_intro.rst
@@ -1,188 +1,190 @@
-Introduction to Relay IR
-========================
-This article introduces Relay IR -- the second generation of NNVM.
-We expect readers from two kinds of background -- those who have a programming language background and deep learning
-framework developers who are familiar with the computational graph representation.
-
-We briefly summarize the design goal here, and will touch upon these points in the later part of the article.
-
-- Support traditional data flow style programming and transformations.
-- Support functional style scoping, let-binding and making it fully featured differentiable language.
-- Being able to allow the user to mix the two programming styles.
-
-Build Computational Graph with Relay
-------------------------------------
-Traditional deep learning frameworks use computational graphs as their intermediate representation.
-A computational graph (or data-flow graph), is a directed acyclic graph (DAG) that represents the computation.
-
-.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow.png
-    :align: center
-    :scale: 70%
-
-
-You can use Relay to build a computational(dataflow) graph. Specifically, the above code shows how to
-construct a simple two-node graph. You can find that the syntax of the example is not that different from existing
-computational graph IR like NNVMv1, with the only difference in terms of terminology:
-
-- Existing frameworks usually use graph and subgraph
-- Relay uses function e.g. --  ``fn (%x)``, to indicate the graph
-
-Each data-flow node is a CallNode in Relay. The relay python DSL allows you to construct a data-flow quickly.
-One thing we want to highlight in the above code -- is that we explicitly constructed an Add node with
-both input point to ``%1``.  When a deep learning framework evaluates the above program, it will compute
-the nodes in topological order, and ``%1`` will only be computed once.
-While this fact is very natural to deep learning framework builders, it is something that might
-surprise a PL folk in the first place.  If we implement a simple visitor to print out the result and
-treat the result as nested Call expression, it becomes ``log(%x) + log(%x)``.
-
-Such ambiguity is caused by different interpretation of program semantics when there is a shared node in the DAG.
-In a normal functional programming IR, nested expressions are treated as expression trees, without considering the
-fact that the ``%1`` is actually reused twice in ``%2``.
-
-Relay IR choose to be mindful of this difference. Usually, deep learning framework users build the computational
-graph in this fashion, where a DAG node reuse often occur. As a result, when we print out the Relay program in
-the text format, we print one CallNode per line and assign a temporary id ``(%1, %2)`` to each CallNode so each common
-node can be referenced in later parts of the program.
-
-Module: Support Multiple Functions(Graphs)
-------------------------------------------
-So far we have introduced how can we build a data flow graph as a function. One might naturally ask -- can we support multiple
-functions and enable them to call each other. Relay allows grouping multiple functions together in a module, the code below
-shows an example of a function calling another function.
-
-.. code::
-
-   def @muladd(%x, %y, %z) {
-     %1 = mul(%x, %y)
-     %2 = add(%1, %z)
-     %2
-   }
-   def @myfunc(%x) {
-     %1 = @muladd(%x, 1, 2)
-     %2 = @muladd(%1, 2, 3)
-     %2
-   }
-
-The Module can be viewed as a ``Map<GlobalVar, Function>``. Here GlobalVar is just an id that is used to represent the functions
-in the module. ``@muladd`` and ``@myfunc`` are GlobalVars in the above example. When a CallNode is used to call another function,
-the corresponding GlobalVar is stored in the op field of the CallNode. It contains a level of indirection -- we need to look up
-body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly
-store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that
-GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function.
-
-.. code ::
-
-  @def myfunc(%x) {
-    %1 = equal(%x, 1)
-     if (%1) {
-        %x
-     } else {
-       %2 = sub(%x, 1)
-       %3 = @myfunc(%2)
-        %4 = add(%3, %3)
-        %4
-    }
-  }
-
-In the above example, ``@myfunc`` recursively calls itself. Using GlobalVar ``@myfunc`` to represent the function avoids
-the cyclic dependency in the data structure.
-At this point, we have introduced the basic concepts in Relay. Notably, Relay has the following improvements over NNVMv1:
-
-- Succinct text format that eases debugging of writing passes.
-- First-class support for subgraphs-functions, in a joint module, this enables further chance of joint optimizations such as inlining and calling convention specification.
-- Naive front-end language interop, for example, all the data structure can be visited in python, which allows quick prototyping of optimizations in python and mixing them with c++ code.
-
-
-Let Binding and Scopes
-----------------------
-
-So far, we have introduced how to build a computational graph in the good old way used in deep learning frameworks.
-This section will talk about a new important construct introduced by Relay -- let bindings.
-
-Let binding is used in every high-level programming languages. In Relay, it is a data structure with three
-fields ``Let(var, value, body)``. When we evaluate a let expression, we first evaluate the value part, assign
-it to the var, then return the evaluated result in the body expression.
-
-You can use a sequence of let bindings to construct a logically equivalent program to a data-flow program.
-The code example below shows one program with two forms side by side.
-
-.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow_vs_func.png
-    :align: center
-    :scale: 70%
-
-
-The nested let-binding is called A-normal form, and it is commonly used as IRs in functional programming languages.
-Now, please take a close look at the AST structure. While the two programs are semantically identical
-(so are their textual representations, except that A-normal form has let prefix), their AST structures are different from each other.
-
-Since program optimizations take these AST data structures and transform them, the two different structure will
-affect the compiler code we are going to write. For example, if we want to detect a pattern ``add(log(x), y)``:
-
-- In the data-flow form, we can first access the add node, then directly look at its first argument to see if it is a log
-- In the A-normal form, we cannot directly do the check anymore, because the first input to add is ``%v1`` -- we will need to keep a map from variable to its bound values and lookup that map, in order to know that ``%v1`` is a log.
-
-Different data structures will impact how you might write transformations, and we need to keep that in mind.
-So now, as a deep learning framework developer, you might ask, why do we need let-binding.
-Your PL friends will always tell you that let is important -- as PL is a quite established field,
-there must be some wisdom behind that.
-
-
-Why We Might Need Let Binding
------------------------------
-One key usage of let binding is that it specifies the scope of computation. Let us take look at the following example,
-which does not use let binding.
-
-.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/let_scope.png
-    :align: center
-    :scale: 70%
-
-The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems
-to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so.
-Actually, a dataflow graph never defines its scope of the evaluation. This introduces some ambiguity in the semantics.
-
-This ambiguity becomes more interesting when we have closures. Consider the following program, which returns a closure.
-We don’t know where should we compute ``%1``. It can either be outside the closure, or inside the closure.
-
-.. code::
-
-  fn (%x) {
-    %1 = log(%x)
-    %2 = fn(%y) {
-      add(%y, %1)
-    }
-    %2
-  }
-
-Let binding solves this problem, as the computation of the value happens at the let node. In both programs,
-if we change ``%1 = log(%x)`` to ``let %v1 = log(%x)``, we clearly specify the computation location to
-be outside of the if scope and closure. As you can see let-binding gives a more precise specification of the computation site
-and could be useful when we generate backend code(as such specification is in the IR).
-
-On the other hand, the data-flow form, which does not specify the scope of computation, does have its own advantages
--- we don’t need to worry about where to put the let when we generate the code. The dataflow form also gives more freedom
-to the later passes to decide where to put the evaluation point. As a result, it might not be a bad idea to use data flow
-form of the program in the initial phases of optimizations when you find it is convenient.
-Many optimizations in Relay today are written to optimize dataflow programs.
-
-However, when we lower the IR to actual runtime program, we need to be precise about the scope of computation.
-In particular, we want to explicitly specify where the scope of computation should happen when we are using
-sub-functions and closures. Let-binding can be used to solve this problem in later stage execution specific optimizations.
-
-
-Implication on IR Transformations
----------------------------------
-
-Hopefully, by now you are familiar with the two kinds of representations.
-Most functional programming languages do their analysis in A-normal form,
-where the analyzer does not need to be mindful that the expressions are DAGs.
-
-Relay choose to support both the data-flow form and let binding. We believe that it is important to let the
-framework developer choose the representation they are familiar with.
-This does, however, have some implications on how we write passes:
-
-- If you come from a data-flow background and want to handle let, keep a map of var to the expressions so you can perform lookup when encountering a var. This likely means a minimum change as we already need a map from expr -> transformed expression anyway. Note that this will effectively remove all the let in the program.
-- If you come from a PL background and like A-normal form, we will provide a dataflow -> A-normal form pass.
-- For PL folks, when you are implementing something (like dataflow->ANF transformation), be mindful that the expression can be DAG, and this usually means that we should visit expressions with a ``Map<Expr, Result>`` and only compute the transformed result once, so the result expression keeps the common structure.
-
-There are additional advanced concepts such as symbolic shape inference, polymorphic functions
-that are not covered by this material, you are more than welcomed to look at other materials.
+Introduction to Relay IR
+========================
+This article introduces Relay IR -- the second generation of NNVM.
+We expect readers from two kinds of background -- those who have a programming language background and deep learning
+framework developers who are familiar with the computational graph representation.
+
+We briefly summarize the design goal here, and will touch upon these points in the later part of the article.
+
+- Support traditional data flow-style programming and transformations.
+- Support functional-style scoping, let-binding and making it a fully featured differentiable language.
+- Being able to allow the user to mix the two programming styles.
+
+Build a Computational Graph with Relay
+--------------------------------------
+Traditional deep learning frameworks use computational graphs as their intermediate representation.
+A computational graph (or dataflow graph), is a directed acyclic graph (DAG) that represents the computation.
+Though dataflow graphs are limited in terms of the computations they are capable of expressing due to
+lacking control flow, their simplicity makes it easier to implement automatic differentiation and
+compile for heterogeneous execution environments (e.g., executing parts of the graph on specialized hardware).
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow.png
+    :align: center
+    :scale: 70%
+
+
+You can use Relay to build a computational (dataflow) graph. Specifically, the above code shows how to
+construct a simple two-node graph. You can find that the syntax of the example is not that different from existing
+computational graph IR like NNVMv1, with the only difference in terms of terminology:
+
+- Existing frameworks usually use graph and subgraph
+- Relay uses function e.g. --  ``fn (%x)``, to indicate the graph
+
+Each dataflow node is a CallNode in Relay. The Relay Python DSL allows you to construct a dataflow graph quickly.
+One thing we want to highlight in the above code -- is that we explicitly constructed an Add node with
+both input point to ``%1``.  When a deep learning framework evaluates the above program, it will compute
+the nodes in topological order, and ``%1`` will only be computed once.
+While this fact is very natural to deep learning framework builders, it is something that might
+surprise a PL researcher in the first place.  If we implement a simple visitor to print out the result and
+treat the result as nested Call expression, it becomes ``log(%x) + log(%x)``.
+
+Such ambiguity is caused by different interpretations of program semantics when there is a shared node in the DAG.
+In a normal functional programming IR, nested expressions are treated as expression trees, without considering the
+fact that the ``%1`` is actually reused twice in ``%2``.
+
+The Relay IR is mindful of this difference. Usually, deep learning framework users build the computational
+graph in this fashion, where a DAG node reuse often occurs. As a result, when we print out the Relay program in
+the text format, we print one CallNode per line and assign a temporary id ``(%1, %2)`` to each CallNode so each common
+node can be referenced in later parts of the program.
+
+Module: Support Multiple Functions (Graphs)
+-------------------------------------------
+So far we have introduced how can we build a dataflow graph as a function. One might naturally ask: Can we support multiple
+functions and enable them to call each other? Relay allows grouping multiple functions together in a module; the code below
+shows an example of a function calling another function.
+
+.. code::
+
+   def @muladd(%x, %y, %z) {
+     %1 = mul(%x, %y)
+     %2 = add(%1, %z)
+     %2
+   }
+   def @myfunc(%x) {
+     %1 = @muladd(%x, 1, 2)
+     %2 = @muladd(%1, 2, 3)
+     %2
+   }
+
+The Module can be viewed as a ``Map<GlobalVar, Function>``. Here GlobalVar is just an id that is used to represent the functions
+in the module. ``@muladd`` and ``@myfunc`` are GlobalVars in the above example. When a CallNode is used to call another function,
+the corresponding GlobalVar is stored in the op field of the CallNode. It contains a level of indirection -- we need to look up
+body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly
+store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that
+GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function.
+
+.. code ::
+
+  @def myfunc(%x) {
+    %1 = equal(%x, 1)
+     if (%1) {
+        %x
+     } else {
+       %2 = sub(%x, 1)
+       %3 = @myfunc(%2)
+        %4 = add(%3, %3)
+        %4
+    }
+  }
+
+In the above example, ``@myfunc`` recursively calls itself. Using GlobalVar ``@myfunc`` to represent the function avoids
+the cyclic dependency in the data structure.
+At this point, we have introduced the basic concepts in Relay. Notably, Relay has the following improvements over NNVMv1:
+
+- Succinct text format that eases debugging of writing passes.
+- First-class support for subgraphs-functions, in a joint module, this enables further chance of joint optimizations such as inlining and calling convention specification.
+- Naive front-end language interop, for example, all the data structure can be visited in Python, which allows quick prototyping of optimizations in Python and mixing them with C++ code.
+
+
+Let Binding and Scopes
+----------------------
+
+So far, we have introduced how to build a computational graph in the good old way used in deep learning frameworks.
+This section will talk about a new important construct introduced by Relay -- let bindings.
+
+Let binding is used in every high-level programming language. In Relay, it is a data structure with three
+fields ``Let(var, value, body)``. When we evaluate a let expression, we first evaluate the value part, assign
+it to the var, then return the evaluated result in the body expression.
+
+You can use a sequence of let bindings to construct a logically equivalent program to a dataflow program.
+The code example below shows one program with two forms side by side.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow_vs_func.png
+    :align: center
+    :scale: 70%
+
+
+The nested let binding is called A-normal form, and it is commonly used as IRs in functional programming languages.
+Now, please take a close look at the AST structure. While the two programs are semantically identical
+(so are their textual representations, except that A-normal form has let prefix), their AST structures are different.
+
+Since program optimizations take these AST data structures and transform them, the two different structures will
+affect the compiler code we are going to write. For example, if we want to detect a pattern ``add(log(x), y)``:
+
+- In the data-flow form, we can first access the add node, then directly look at its first argument to see if it is a log
+- In the A-normal form, we cannot directly do the check anymore, because the first input to add is ``%v1`` -- we will need to keep a map from variable to its bound values and look up that map, in order to know that ``%v1`` is a log.
+
+Different data structures will impact how you might write transformations, and we need to keep that in mind.
+So now, as a deep learning framework developer, you might ask, Why do we need let bindings?
+Your PL friends will always tell you that let is important -- as PL is a quite established field,
+there must be some wisdom behind that.
+
+Why We Might Need Let Binding
+-----------------------------
+One key usage of let binding is that it specifies the scope of computation. Let us take a look at the following example,
+which does not use let bindings.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/let_scope.png
+    :align: center
+    :scale: 70%
+
+The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems
+to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so.
+Actually, a dataflow graph never defines its scope of the evaluation. This introduces some ambiguity in the semantics.
+
+This ambiguity becomes more interesting when we have closures. Consider the following program, which returns a closure.
+We don’t know where should we compute ``%1``; it can be either inside or outside the closure.
+
+.. code::
+
+  fn (%x) {
+    %1 = log(%x)
+    %2 = fn(%y) {
+      add(%y, %1)
+    }
+    %2
+  }
+
+A let binding solves this problem, as the computation of the value happens at the let node. In both programs,
+if we change ``%1 = log(%x)`` to ``let %v1 = log(%x)``, we clearly specify the computation location to
+be outside of the if scope and closure. As you can see let-binding gives a more precise specification of the computation site
+and could be useful when we generate backend code (as such specification is in the IR).
+
+On the other hand, the dataflow form, which does not specify the scope of computation, does have its own advantages
+-- namely, we don’t need to worry about where to put the let when we generate the code. The dataflow form also gives more freedom
+to the later passes to decide where to put the evaluation point. As a result, it might not be a bad idea to use data flow
+form of the program in the initial phases of optimizations when you find it is convenient.
+Many optimizations in Relay today are written to optimize dataflow programs.
+
+However, when we lower the IR to an actual runtime program, we need to be precise about the scope of computation.
+In particular, we want to explicitly specify where the scope of computation should happen when we are using
+sub-functions and closures. Let-binding can be used to solve this problem in later stage execution specific optimizations.
+
+
+Implication on IR Transformations
+---------------------------------
+
+Hopefully, by now you are familiar with the two kinds of representations.
+Most functional programming languages do their analysis in A-normal form,
+where the analyzer does not need to be mindful that the expressions are DAGs.
+
+Relay choose to support both the dataflow form and let bindings. We believe that it is important to let the
+framework developer choose the representation they are familiar with.
+This does, however, have some implications on how we write passes:
+
+- If you come from a dataflow background and want to handle lets, keep a map of var to the expressions so you can perform lookup when encountering a var. This likely means a minimum change as we already need a map from expressions to transformed expressions anyway. Note that this will effectively remove all the lets in the program.
+- If you come from a PL background and like A-normal form, we will provide a dataflow to A-normal form pass.
+- For PL folks, when you are implementing something (like a dataflow-to-ANF transformation), be mindful that expressions can be DAGs, and this usually means that we should visit expressions with a ``Map<Expr, Result>`` and only compute the transformed result once, so the resulting expression keeps the common structure.
+
+There are additional advanced concepts such as symbolic shape inference, polymorphic functions
+that are not covered by this material; you are more than welcome to look at other materials.

From 2be66736946b77a84f160fe15b73cd8cb8f9b128 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 25 Dec 2018 03:16:32 +0800
Subject: [PATCH 518/529] [RELAY] Add missing arg in vgg (#2329)

---
 python/tvm/relay/testing/vgg.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/testing/vgg.py b/python/tvm/relay/testing/vgg.py
index 811de33c579a..bec141f70ffd 100644
--- a/python/tvm/relay/testing/vgg.py
+++ b/python/tvm/relay/testing/vgg.py
@@ -98,7 +98,8 @@ def get_workload(batch_size,
                  num_classes=1000,
                  image_shape=(3, 224, 224),
                  dtype="float32",
-                 num_layers=11):
+                 num_layers=11,
+                 batch_norm=False):
     """Get benchmark workload for VGG nets.
 
     Parameters
@@ -118,6 +119,9 @@ def get_workload(batch_size,
     num_layers : int
         Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
 
+    batch_norm : bool
+        Use batch normalization.
+
     Returns
     -------
     net : nnvm.Symbol
@@ -126,5 +130,5 @@ def get_workload(batch_size,
     params : dict of str to NDArray
         The parameters.
     """
-    net = get_net(batch_size, image_shape, num_classes, dtype, num_layers)
+    net = get_net(batch_size, image_shape, num_classes, dtype, num_layers, batch_norm)
     return create_workload(net)

From d37c088ade3bb205dcc8ef7241c36fc679447230 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Mon, 24 Dec 2018 14:28:02 -0500
Subject: [PATCH 519/529] [Relay][Docs] Fix broken bullet points in Relay
 operator addition tutorial (#2325)

---
 docs/dev/relay_add_op.rst | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst
index 751e938fe012..4a9625ce1198 100644
--- a/docs/dev/relay_add_op.rst
+++ b/docs/dev/relay_add_op.rst
@@ -7,10 +7,8 @@ that they will be integrated into Relay's type system.
 
 Registering an operator requires three steps:
 
-- Using the ``RELAY_REGISTER_OP`` macro in C++ to
-register the operator's arity and type information
-- Defining a C++ function to produce a call node for the
-operator and registering a Python API hook for the function
+- Using the ``RELAY_REGISTER_OP`` macro in C++ to register the operator's arity and type information
+- Defining a C++ function to produce a call node for the operator and registering a Python API hook for the function
 - Wrapping the above Python API hook in a neater interface
 
 The file ``src/relay/op/tensor/binary.cc`` provides
@@ -48,9 +46,7 @@ to specify the following information about an operator in Relay:
 
 - Arity (number of arguments)
 - Names and descriptions for positional arguments
-- Support level (1 indicating an internal intrinsic, higher numbers
-indicating operators that are not as integral to the framework or are
-supported externally)
+- Support level (1 indicates an internal intrinsic; higher numbers indicate less integral or externally supported operators)
 - A type relation for the operator
 
 The below example is from ``binary.cc`` and uses a broadcasting
@@ -144,8 +140,6 @@ before producing the call node:
 Summary
 -------
 
-- A TVM operator can be registered in Relay using a relation to express
-the appropriate type information.
-- Using an operator in Relay requires a function to produce a
-call node for the operator.
+- A TVM operator can be registered in Relay using a relation to express the appropriate type information.
+- Using an operator in Relay requires a function to produce a call node for the operator.
 - It is best to have a simple Python wrapper for producing the call node.

From 7d4ea4d07a17eb2a837156562861b75c964e3568 Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Mon, 24 Dec 2018 11:28:36 -0800
Subject: [PATCH 520/529] [RELAY][AUTOTVM] Extract tuning tasks from Relay
 programs (#2181)

---
 python/tvm/autotvm/task/__init__.py           |   1 +
 python/tvm/autotvm/task/nnvm_integration.py   | 231 +++---------------
 python/tvm/autotvm/task/relay_integration.py  | 200 +++++++++++++++
 python/tvm/autotvm/task/topi_integration.py   | 192 ++++++++++++++-
 .../relay/test_autotvm_task_extraction.py     |  56 +++++
 topi/python/topi/x86/conv2d.py                |   2 +-
 topi/python/topi/x86/depthwise_conv2d.py      |   2 +-
 7 files changed, 477 insertions(+), 207 deletions(-)
 create mode 100644 python/tvm/autotvm/task/relay_integration.py
 create mode 100644 tests/python/relay/test_autotvm_task_extraction.py

diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 04bcec92fd57..f6ea07c272d0 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -14,3 +14,4 @@
 
 from .topi_integration import register_topi_compute, register_topi_schedule
 from .nnvm_integration import extract_from_graph, extract_from_multiple_graph
+from .relay_integration import extract_from_program, extract_from_multiple_program
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 6a07194a594d..cd7337586519 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -7,208 +7,13 @@
 import logging
 
 
-from ... import tensor, placeholder, create_schedule, target as _target
+from ... import target as _target
 
-from ..util import get_const_tuple
-from .task import create, register
+from .task import create
+from .topi_integration import TaskExtractEnv
 
 logger = logging.getLogger('autotvm')
 
-def serialize_args(args):
-    """serialize arguments of a topi function to a hashable tuple.
-
-    Parameters
-    ----------
-    args: list of hashable or Tensor
-    """
-    ret = []
-    for t in args:
-        if isinstance(t, tensor.Tensor):
-            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
-        else:
-            ret.append(t)
-    return tuple(ret)
-
-
-def deserialize_args(args):
-    """The inverse function of :code:`serialize_args`.
-
-    Parameters
-    ----------
-    args: list of hashable or Tensor
-    """
-    ret = []
-    for t in args:
-        if isinstance(t, tuple) and t[0] == 'TENSOR':
-            ret.append(placeholder(shape=t[1], dtype=t[2]))
-        else:
-            ret.append(t)
-    return ret
-
-
-# Task extractor for nnvm graph
-class TaskExtractEnv:
-    """Global environment for extracting tuning tasks from nnvm graph"""
-    current = None
-
-    def __init__(self):
-        import topi
-        import nnvm
-
-        # NOTE: To add more symbols, you only need to change the following lists
-        # nnvm symbol -> topi compute
-        self.symbol2topi = {
-            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
-                              topi.nn.group_conv2d_nchw],
-            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
-            nnvm.sym.dense: [topi.nn.dense],
-        }
-
-        # topi compute -> autotvm task name
-        self.topi_to_task = {
-            topi.nn.conv2d: "topi_nn_conv2d",
-            topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
-            topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw",
-            topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
-            topi.nn.dense: "topi_nn_dense",
-        }
-
-        self.topi_to_schedule = {
-            topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw,
-                             topi.generic.schedule_conv2d_nhwc],
-            topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
-                                            topi.generic.schedule_depthwise_conv2d_nhwc],
-            topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw],
-            topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
-            topi.nn.dense: [topi.generic.schedule_dense],
-        }
-
-        self._register_tracing()
-        self._register_topi_task()
-        self.task_collection = []
-        self.wanted_topi_funcs = list(self.topi_to_task.keys())
-
-    def _register_tracing(self):
-        """Register tracing function to track the topi function call"""
-        # register topi compute for "tracing" target
-        for topi_compute in self.topi_to_task:
-            def _local_scope(compute_func):
-                """start a scope to hold the local function in for loop"""
-
-                @compute_func.register("tracing", )
-                def _tracing_topi_compute(*args, **kwargs):
-                    assert not kwargs, "Do not support extracting tuning tasks when" \
-                                       "kwargs is used in TOPI function call." \
-                                       "Please modify it to use only positional args."
-
-                    if compute_func in self.wanted_topi_funcs:  # record this call
-                        key = (self.topi_to_task[compute_func], serialize_args(args))
-                        if key not in self.task_collection:
-                            self.task_collection.append(key)
-
-                    return compute_func.fdefault(*args)
-            _local_scope(topi_compute)
-
-        # register topi schedule for "tracing" target
-        for topi_compute in self.topi_to_task:
-            for topi_schedule in self.topi_to_schedule[topi_compute]:
-                def _local_scope_(schedule_func):
-                    """start a scope to hold the local function in for loop"""
-
-                    @schedule_func.register("tracing", )
-                    def _tracing_topi_compute(outs):
-                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
-                        return create_schedule([x.op for x in outs])
-                _local_scope_(topi_schedule)
-
-    def _register_topi_task(self):
-        """register tuning wrapper for topi function"""
-        import topi
-
-        # Tuning wrapper for topi functions
-        @register("topi_nn_conv2d")
-        def _topi_nn_conv2d(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            layout = args[-2]
-            assert layout == 'NCHW', "only support NCHW currently"
-            C = topi.nn.conv2d(*args, **kwargs)
-            s = topi.generic.schedule_conv2d_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_depthwise_conv2d_nchw")
-        def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
-            s = topi.generic.schedule_depthwise_conv2d_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_group_conv2d_nchw")
-        def _topi_nn_group_conv2d_nchw(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            C = topi.nn.group_conv2d_nchw(*args, **kwargs)
-            s = topi.generic.schedule_group_conv2d_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_conv2d_transpose_nchw")
-        def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
-            s = topi.generic.schedule_conv2d_transpose_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_dense")
-        def _topi_nn_dense(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            data, weight, bias = args
-            C = topi.nn.dense(*args, **kwargs)
-            s = topi.generic.schedule_dense([C])
-            if bias is not None:
-                return s, [data, weight, bias, C]
-            return s, [data, weight, C]
-
-    def reset(self, wanted_topi_funcs):
-        """Reset task collections
-
-        Parameters
-        ----------
-        wanted_topi_funcs: List of function
-            The topi function to be extracted
-        """
-        self.task_collection = []
-        self.wanted_topi_funcs = wanted_topi_funcs
-
-    def get_tasks(self):
-        """Get collected tasks
-
-        Returns
-        -------
-        tasks: List of tuple(name, args)
-            A list of tasks extracted from the nnvm graph
-        """
-        return self.task_collection
-
-    @staticmethod
-    def get():
-        """Get the single instance of TaskExtractEnv
-
-        Returns
-        -------
-        env: TaskExtractEnv
-            The single instance of TaskExtractEnv
-        """
-        if not TaskExtractEnv.current:
-            TaskExtractEnv.current = TaskExtractEnv()
-        return TaskExtractEnv.current
-
 
 def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     """ Extract tuning tasks from a nnvm graph.
@@ -237,13 +42,24 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
         collected tasks
     """
     import nnvm.compiler
+    import nnvm
+    import topi
 
     env = TaskExtractEnv.get()
 
+    #NOTE: To add more symbols, you only need to change the following lists
+    #nnvm symbol -> topi compute
+    SYMBOL2TOPI = {
+        nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                          topi.nn.group_conv2d_nchw],
+        nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        nnvm.sym.dense: [topi.nn.dense],
+    }
+
     topi_funcs = []
     for sym_name in symbols:
-        if sym_name in env.symbol2topi:
-            topi_funcs.extend(env.symbol2topi[sym_name])
+        if sym_name in SYMBOL2TOPI:
+            topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
@@ -297,13 +113,24 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_
         collected tasks
     """
     import nnvm.compiler
+    import nnvm
+    import topi
 
     env = TaskExtractEnv.get()
 
+    #NOTE: To add more symbols, you only need to change the following lists
+    #nnvm symbol -> topi compute
+    SYMBOL2TOPI = {
+        nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                          topi.nn.group_conv2d_nchw],
+        nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        nnvm.sym.dense: [topi.nn.dense],
+    }
+
     topi_funcs = []
     for sym_name in symbols:
-        if sym_name in env.symbol2topi:
-            topi_funcs.extend(env.symbol2topi[sym_name])
+        if sym_name in SYMBOL2TOPI:
+            topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py
new file mode 100644
index 000000000000..21acf257f9ac
--- /dev/null
+++ b/python/tvm/autotvm/task/relay_integration.py
@@ -0,0 +1,200 @@
+# pylint: disable=unused-variable,invalid-name
+"""
+Decorator and utilities for the integration with TOPI and Relay
+99.9% copy-paste of implementation by @MerryMercy
+
+"""
+import threading
+import warnings
+import logging
+
+
+from ... import tensor, placeholder, target as _target
+
+from .task import create
+from .topi_integration import TaskExtractEnv
+
+logger = logging.getLogger('autotvm')
+
+
+def serialize_args(args):
+    """serialize arguments of a topi function to a hashable tuple.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tensor.Tensor):
+            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
+        else:
+            ret.append(t)
+    return tuple(ret)
+
+
+def deserialize_args(args):
+    """The inverse function of :code:`serialize_args`.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tuple) and t[0] == 'TENSOR':
+            ret.append(placeholder(shape=t[1], dtype=t[2]))
+        else:
+            ret.append(t)
+    return ret
+
+
+def extract_from_program(func, params, ops, target, target_host=None):
+    """ Extract tuning tasks from a relay program.
+
+    This function collects tuning tasks by building the program
+    with a "tracing" target and tracing all the calls to topi.
+
+    Parameters
+    ----------
+    func: relay.expr.Function
+        The func to tune
+    params: dict of str to numpy array
+        The associated parameters of the program
+    ops: List of relay op
+        List of relay ops to be tuned
+    dtype: str or dict of str to str
+        The input types to the program
+    target: tvm.target.Target
+        The compilation target
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    env = TaskExtractEnv.get()
+    import tvm.relay.op
+    from tvm import relay
+    import topi
+
+    # NOTE: To add more ops, you only need to change the following lists
+    # relay op -> topi compute
+    OP2TOPI = {
+        tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                                 topi.nn.group_conv2d_nchw],
+        tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        tvm.relay.op.nn.dense: [topi.nn.dense],
+    }
+
+    topi_funcs = []
+    for op_name in ops:
+        if op_name in OP2TOPI:
+            topi_funcs.extend(OP2TOPI[op_name])
+        else:
+            warnings.warn("Op %s is not tunable, ignored" % op_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+    relay.backend.compile_engine.get().clear()
+    # wrap build call in thread to avoid multiprocessing problems
+    build_thread = threading.Thread(target=relay.build, args=(func,
+                                                              tracing_target,
+                                                              target_host,
+                                                              params))
+    build_thread.start()
+    build_thread.join()
+    logger.disabled = old_state
+
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
+
+
+def extract_from_multiple_program(funcs, params, ops, target, target_host=None):
+    """ Extract tuning tasks from multiple relay programs.
+
+    This function is the multiple program version of extract_from_program
+
+    Parameters
+    ----------
+    funcs: List of relay.expr.Function
+        The list of functions to tune
+    params: List of dict of str to numpy array
+        The associated parameters of the programs
+    ops: List of relay op
+        List of relay ops to be tuned
+    target: tvm.target.Target
+        The compilation target
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    env = TaskExtractEnv.get()
+    import tvm.relay.op
+    from tvm import relay
+    import topi
+
+    # NOTE: To add more ops, you only need to change the following lists
+    # relay op -> topi compute
+    OP2TOPI = {
+        tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                                 topi.nn.group_conv2d_nchw],
+        tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        tvm.relay.op.nn.dense: [topi.nn.dense],
+    }
+
+    topi_funcs = []
+    for op_name in ops:
+        if op_name in OP2TOPI:
+            topi_funcs.extend(OP2TOPI[op_name])
+        else:
+            warnings.warn("Op %s is not tunable, ignored" % op_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+
+    for func, param in zip(funcs, params):
+        # wrap build call in thread to avoid multiprocessing problems
+        build_thread = threading.Thread(target=relay.build, args=(func,
+                                                                  tracing_target,
+                                                                  target_host,
+                                                                  params))
+        build_thread.start()
+        build_thread.join()
+
+    logger.disabled = old_state
+
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index f005ee0c9a54..412d7ae0e40b 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -11,16 +11,202 @@
 See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
 """
 
-from ... import _api_internal, tensor
-
-from .task import args_to_workload, dispatcher
+from ... import _api_internal, tensor, placeholder, create_schedule
 
+from .task import args_to_workload, dispatcher, register
+from ..util import get_const_tuple
 
 # A table that records all registered dispatcher for all targets
 _REGISTED_DISPATHCER = {
 }
 
 
+def serialize_args(args):
+    """serialize arguments of a topi function to a hashable tuple.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tensor.Tensor):
+            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
+        else:
+            ret.append(t)
+    return tuple(ret)
+
+
+def deserialize_args(args):
+    """The inverse function of :code:`serialize_args`.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tuple) and t[0] == 'TENSOR':
+            ret.append(placeholder(shape=t[1], dtype=t[2]))
+        else:
+            ret.append(t)
+    return ret
+
+
+# Task extractor for nnvm graph, relay program
+class TaskExtractEnv:
+    """Global environment for extracting tuning tasks from nnvm graph"""
+    current = None
+
+    def __init__(self):
+        import topi
+
+        # topi compute -> autotvm task name
+        self.topi_to_task = {
+            topi.nn.conv2d: "topi_nn_conv2d",
+            topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
+            topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw",
+            topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
+            topi.nn.dense: "topi_nn_dense",
+        }
+
+        self.topi_to_schedule = {
+            topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw,
+                             topi.generic.schedule_conv2d_nhwc],
+            topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
+                                            topi.generic.schedule_depthwise_conv2d_nhwc],
+            topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw],
+            topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
+            topi.nn.dense: [topi.generic.schedule_dense],
+        }
+
+        self._register_tracing()
+        self._register_topi_task()
+        self.task_collection = []
+        self.wanted_topi_funcs = list(self.topi_to_task.keys())
+
+    def _register_tracing(self):
+        """Register tracing function to track the topi function call"""
+        # register topi compute for "tracing" target
+        for topi_compute in self.topi_to_task:
+            def _local_scope(compute_func):
+                """start a scope to hold the local function in for loop"""
+
+                @compute_func.register("tracing", )
+                def _tracing_topi_compute(*args, **kwargs):
+                    assert not kwargs, "Do not support extracting tuning tasks when" \
+                                       "kwargs is used in TOPI function call." \
+                                       "Please modify it to use only positional args."
+
+                    if compute_func in self.wanted_topi_funcs:  # record this call
+                        key = (self.topi_to_task[compute_func], serialize_args(args))
+                        if key not in self.task_collection:
+                            self.task_collection.append(key)
+
+                    return compute_func.fdefault(*args)
+            _local_scope(topi_compute)
+
+        # register topi schedule for "tracing" target
+        for topi_compute in self.topi_to_task:
+            for topi_schedule in self.topi_to_schedule[topi_compute]:
+                def _local_scope_(schedule_func):
+                    """start a scope to hold the local function in for loop"""
+
+                    @schedule_func.register("tracing", )
+                    def _tracing_topi_compute(outs):
+                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
+                        return create_schedule([x.op for x in outs])
+                _local_scope_(topi_schedule)
+
+    def _register_topi_task(self):
+        """register tuning wrapper for topi function"""
+        import topi
+
+        # Tuning wrapper for topi functions
+        @register("topi_nn_conv2d")
+        def _topi_nn_conv2d(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            layout = args[-2]
+            assert layout == 'NCHW', "only support NCHW currently"
+            C = topi.nn.conv2d(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_depthwise_conv2d_nchw")
+        def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_depthwise_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_group_conv2d_nchw")
+        def _topi_nn_group_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.group_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_conv2d_transpose_nchw")
+        def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_transpose_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_dense")
+        def _topi_nn_dense(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            data, weight, bias = args
+            C = topi.nn.dense(*args, **kwargs)
+            s = topi.generic.schedule_dense([C])
+            if bias is not None:
+                return s, [data, weight, bias, C]
+            return s, [data, weight, C]
+
+    def reset(self, wanted_topi_funcs):
+        """Reset task collections
+
+        Parameters
+        ----------
+        wanted_topi_funcs: List of function
+            The topi function to be extracted
+        """
+        self.task_collection = []
+        self.wanted_topi_funcs = wanted_topi_funcs
+
+    def get_tasks(self):
+        """Get collected tasks
+
+        Returns
+        -------
+        tasks: List of tuple(name, args)
+            A list of tasks extracted from the nnvm graph
+        """
+        return self.task_collection
+
+    @staticmethod
+    def get():
+        """Get the single instance of TaskExtractEnv
+
+        Returns
+        -------
+        env: TaskExtractEnv
+            The single instance of TaskExtractEnv
+        """
+        if not TaskExtractEnv.current:
+            TaskExtractEnv.current = TaskExtractEnv()
+        return TaskExtractEnv.current
+
+
 def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
     """Register a tunable template for a topi compute function.
 
diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py
new file mode 100644
index 000000000000..8c93e4a56642
--- /dev/null
+++ b/tests/python/relay/test_autotvm_task_extraction.py
@@ -0,0 +1,56 @@
+"""Test task extraction for autotvm"""
+import tvm.relay.testing
+from tvm import relay
+from tvm import autotvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+
+    if name == 'resnet-18':
+        net, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'dcgan':
+        net, params = relay.testing.dcgan.get_workload(batch_size=batch_size)
+        input_shape = (batch_size, 100)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape
+
+def test_task_extraction():
+    target = 'llvm'
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d,))
+    assert len(tasks) == 12
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.dense,))
+    assert len(tasks) == 1
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d, relay.op.nn.dense))
+    assert len(tasks) == 13
+
+    net, params, input_shape = get_network('mobilenet', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d, relay.op.nn.dense))
+    assert len(tasks) == 20
+
+    net, params, input_shape = get_network('dcgan', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d_transpose,))
+    assert len(tasks) == 4
+
+if __name__ == '__main__':
+    test_task_extraction()
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 1a73736264bd..fe38b38d38e0 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -2,7 +2,7 @@
 """Conv2D schedule on x86"""
 import tvm
 from tvm import autotvm
-from tvm.autotvm.task.nnvm_integration import deserialize_args
+from tvm.autotvm.task.topi_integration import deserialize_args
 from tvm.autotvm.task import get_config
 from .. import generic, tag
 from .. import nn
diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py
index 8f37a0316229..64858df91cdc 100644
--- a/topi/python/topi/x86/depthwise_conv2d.py
+++ b/topi/python/topi/x86/depthwise_conv2d.py
@@ -4,7 +4,7 @@
 from tvm import autotvm
 from tvm.autotvm.task import get_config
 from tvm.autotvm.task.space import SplitEntity
-from tvm.autotvm.task.nnvm_integration import deserialize_args
+from tvm.autotvm.task.topi_integration import deserialize_args
 from .. import generic, tag
 from ..nn.pad import pad
 from ..util import get_const_tuple

From d7ff19ae6705a7fac72e7c78637919f4788f6c32 Mon Sep 17 00:00:00 2001
From: Dominic Symes <36929632+dominicsymes@users.noreply.github.com>
Date: Mon, 24 Dec 2018 21:08:45 +0000
Subject: [PATCH 521/529] [FRONTEND][TENSORFLOW] Bugfix (#2326)

---
 nnvm/python/nnvm/frontend/tensorflow.py               | 7 +++++--
 nnvm/tests/python/frontend/tensorflow/test_forward.py | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 10f23a49b5de..47aca3816e6f 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -646,6 +646,9 @@ def _transform_mask(stride_dim, ellipsis_mask):
                 pass
             else:
                 final_output.append(out_shape[gather_index])
+        # Prevent 0-dim tensors which are not accepted by nnvm
+        if not final_output:
+            final_output.append(1)
         return _sym.reshape(out, shape=tuple(final_output))
     return _impl
 
@@ -1187,8 +1190,8 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
                 raise NotImplementedError( \
                     "Please freeze the graph with add_shapes=True")
             self._outputs_are_0d[node.name] = [ \
-                not shape if isinstance(shape, list) else False \
-                for shape in self._output_shapes[node.name]]
+                not tshape if isinstance(tshape, list) else False \
+                for tshape in self._output_shapes[node.name]]
 
             if node.op == "Placeholder":
                 self._nodes[node.name] = _sym.Variable(name=node.name,
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index ed3d0272b4fc..5b8f11695790 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -463,6 +463,7 @@ def test_forward_stridedslice():
     _test_stridedslice((3, 4, 5, 4, 5, 6), [1, 2, 0, -3], [4, 5, 3, 3], [2, 2, 1, 1],
                        'float32', shrink_axis_mask=8, new_axis_mask=1, ellipsis_mask=2, begin_mask=5,
                        end_mask=8)
+    _test_stridedslice((1), [0], [1], [1], 'float32', shrink_axis_mask=1)
 
 
 #######################################################################

From 3a187c9a3326d07345c6442daaf02a10c22d4dfb Mon Sep 17 00:00:00 2001
From: Liang ZOU <liang.d.zou@gmail.com>
Date: Tue, 25 Dec 2018 14:23:18 +0800
Subject: [PATCH 522/529] [DOCS] typo "@func myfunc" => "func @myfunc" (#2333)

typo "@func myfunc" => "func @myfunc"
---
 docs/dev/relay_intro.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst
index 2462d0d3ecc2..66b643421a5b 100644
--- a/docs/dev/relay_intro.rst
+++ b/docs/dev/relay_intro.rst
@@ -75,7 +75,7 @@ GlobalVar decouples the definition/declaration and enables recursion and delayed
 
 .. code ::
 
-  @def myfunc(%x) {
+  def @myfunc(%x) {
     %1 = equal(%x, 1)
      if (%1) {
         %x

From fa1315ae143a2a118ceb234c404a494832b62239 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Mon, 24 Dec 2018 22:46:22 -0800
Subject: [PATCH 523/529] [relay][frontend] Enable ssd test by attaching
 schedules to multibox and ssd ops (#2322)

* add ssd ops to mxnet.py

* add ssd ops to mxnet.py

* add result check for multibox and nms unit tests

* add result check for multibox and nms unit tests

* address @kevinthesun's comments

* Disable cuda test for nms for now.
---
 python/tvm/relay/frontend/common.py     |  24 +++
 python/tvm/relay/frontend/mxnet.py      |  32 +++-
 python/tvm/relay/op/vision/__init__.py  |   1 +
 python/tvm/relay/op/vision/_multibox.py |  77 +++++++++
 python/tvm/relay/op/vision/multibox.py  |  12 +-
 tests/python/relay/test_op_level5.py    | 206 ++++++++++++++++++------
 topi/python/topi/util.py                |  42 +++++
 tutorials/nnvm/deploy_ssd.py            |  32 +++-
 8 files changed, 367 insertions(+), 59 deletions(-)
 create mode 100644 python/tvm/relay/op/vision/_multibox.py

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 95633a4d4586..20598400ce21 100644
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -106,6 +106,30 @@ def get_int_tuple(self, key, default=RequiredAttr()):
             raise AttributeError("Required attribute {} not found.".format(key))
         return default
 
+    def get_float_tuple(self, key, default=RequiredAttr()):
+        """Get float tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(float(x.strip()) for x in
+                         tshape.strip('()[]').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
     def get_tuple_tuple_int(self, key, default=RequiredAttr()):
         """Get int list attribute
 
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index f61c65bbaf6a..7bffbd4f499e 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -241,6 +241,33 @@ def _mx_lrn(inputs, attrs):
     return _op.nn.lrn(inputs[0], **new_attrs)
 
 
+def _mx_multibox_prior(inputs, attrs):
+    new_attrs = {}
+    new_attrs["sizes"] = attrs.get_float_tuple("sizes", (1.0, ))
+    new_attrs["steps"] = attrs.get_float_tuple("steps", (-1.0, -1.0))
+    new_attrs["offsets"] = attrs.get_float_tuple("offsets", (0.5, 0.5))
+    new_attrs["ratios"] = attrs.get_float_tuple("ratios", (1.0, ))
+    new_attrs["clip"] = attrs.get_bool("clip", False)
+    return _op.vision.multibox_prior(inputs[0], **new_attrs)
+
+
+def _mx_multibox_detection(inputs, attrs):
+    new_attrs0 = {}
+    new_attrs0["clip"] = attrs.get_bool("clip", True)
+    new_attrs0["threshold"] = attrs.get_float("threshold", 0.01)
+    new_attrs0["variances"] = attrs.get_float_tuple("variances", (0.1, 0.1,
+                                                                  0.2, 0.2))
+
+    new_attrs1 = {}
+    new_attrs1["overlap_threshold"] = attrs.get_float("nms_threshold", 0.5)
+    new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False)
+    new_attrs1["topk"] = attrs.get_int("nms_topk", -1)
+
+    ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1],
+                                            inputs[2], **new_attrs0)
+    return _op.vision.nms(ret[0], ret[1], **new_attrs1)
+
+
 # Note: due to attribute conversion constraint
 # ops in the identity set must be attribute free
 _identity_list = [
@@ -327,13 +354,14 @@ def _mx_lrn(inputs, attrs):
     "LeakyReLU"     : _mx_leaky_relu,
     "SoftmaxOutput" : _mx_softmax_output,
     "SoftmaxActivation" : _mx_softmax_activation,
+    # vision
+    "_contrib_MultiBoxPrior" : _mx_multibox_prior,
+    "_contrib_MultiBoxDetection" : _mx_multibox_detection,
     # List of missing operators that are present in NNVMv1
     # TODO(tvm-tvm): support all operators.
     #
     # "broadcast_to",
     # "gather_nd",
-    # "_contrib_MultiBoxPrior" : _rename("multibox_prior"),
-    # "_contrib_MultiBoxDetection" : _contrib_multibox_detection,
     # "Crop"          : _crop_like,
 
 }
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
index 9ecd8a84770a..ea3ed69e8f38 100644
--- a/python/tvm/relay/op/vision/__init__.py
+++ b/python/tvm/relay/op/vision/__init__.py
@@ -4,3 +4,4 @@
 
 from .multibox import *
 from .nms import *
+from . import _multibox
diff --git a/python/tvm/relay/op/vision/_multibox.py b/python/tvm/relay/op/vision/_multibox.py
new file mode 100644
index 000000000000..e9ef43f7e06f
--- /dev/null
+++ b/python/tvm/relay/op/vision/_multibox.py
@@ -0,0 +1,77 @@
+# pylint: disable=invalid-name, unused-argument
+"""Definition of vision ops"""
+from __future__ import absolute_import
+
+import topi
+from topi.util import get_const_int, get_const_float, get_float_tuple
+from .. import op as reg
+from ..op import OpPattern
+
+
+@reg.register_schedule("vision.multibox_prior")
+def schedule_multibox_prior(_, outs, target):
+    """Schedule definition of multibox_prior"""
+    with target:
+        return topi.generic.schedule_multibox_prior(outs)
+
+
+@reg.register_compute("vision.multibox_prior")
+def compute_multibox_prior(attrs, inputs, _, target):
+    """Compute definition of multibox_prior"""
+    sizes = get_float_tuple(attrs.sizes)
+    ratios = get_float_tuple(attrs.ratios)
+    steps = get_float_tuple(attrs.steps)
+    offsets = get_float_tuple(attrs.offsets)
+    clip = bool(get_const_int(attrs.clip))
+    return [
+        topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps,
+                                       offsets, clip)
+    ]
+
+
+reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE)
+
+
+# multibox_transform_loc
+@reg.register_schedule("vision.multibox_transform_loc")
+def schedule_multibox_transform_loc(_, outs, target):
+    """Schedule definition of multibox_detection"""
+    with target:
+        return topi.generic.schedule_multibox_transform_loc(outs)
+
+
+@reg.register_compute("vision.multibox_transform_loc")
+def compute_multibox_transform_loc(attrs, inputs, _, target):
+    """Compute definition of multibox_detection"""
+    clip = bool(get_const_int(attrs.clip))
+    threshold = get_const_float(attrs.threshold)
+    variances = get_float_tuple(attrs.variances)
+    return topi.vision.ssd.multibox_transform_loc(
+        inputs[0], inputs[1], inputs[2], clip, threshold, variances)
+
+
+reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE)
+reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE)
+
+
+# non-maximum suppression
+@reg.register_schedule("vision.nms")
+def schedule_nms(_, outs, target):
+    """Schedule definition of nms"""
+    with target:
+        return topi.generic.schedule_nms(outs)
+
+
+@reg.register_compute("vision.nms")
+def compute_nms(attrs, inputs, _, target):
+    """Compute definition of nms"""
+    overlap_threshold = get_const_float(attrs.overlap_threshold)
+    force_suppress = bool(get_const_int(attrs.force_suppress))
+    topk = get_const_int(attrs.topk)
+    return [
+        topi.vision.nms(inputs[0], inputs[1], overlap_threshold,
+                        force_suppress, topk)
+    ]
+
+
+reg.register_pattern("vision.nms", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py
index b04610aaa080..90591da925f5 100644
--- a/python/tvm/relay/op/vision/multibox.py
+++ b/python/tvm/relay/op/vision/multibox.py
@@ -1,6 +1,7 @@
 """Multibox operations."""
 from __future__ import absolute_import as _abs
 from . import _make
+from ...expr import TupleWrapper
 
 def multibox_prior(data,
                    sizes=(1.0,),
@@ -43,7 +44,7 @@ def multibox_transform_loc(cls_prob,
                            anchor,
                            clip=True,
                            threshold=0.01,
-                           variance=(0.1, 0.1, 0.2, 0.2)):
+                           variances=(0.1, 0.1, 0.2, 0.2)):
     """Location transformation for multibox detection
 
     Parameters
@@ -63,12 +64,13 @@ def multibox_transform_loc(cls_prob,
     threshold : double, optional
         Threshold to be a positive prediction.
 
-    variance : Tuple of float, optional
-        Variances to be decoded from box regression output.
+    variances : Tuple of float, optional
+        variances to be decoded from box regression output.
 
     Returns
     -------
     ret : tuple of tvm.relay.Expr
     """
-    return _make.multibox_transform_loc(cls_prob, loc_pred, anchor, clip,
-                                        threshold, variance)
+    return TupleWrapper(_make.multibox_transform_loc(cls_prob, loc_pred,
+                                                     anchor, clip, threshold,
+                                                     variances), 2)
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 6bd331b98120..aa31aa96ef45 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -1,11 +1,13 @@
 """ Support level5 operator test cases.
 """
+import math
 import numpy as np
 import tvm
 from tvm import relay
 from tvm.relay.testing import ctx_list
 import topi.testing
 
+
 def test_resize_infer_type():
     n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
     x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
@@ -48,64 +50,163 @@ def verify_resize(dshape, scale, method, layout):
         for layout in ["NHWC", "NCHW"]:
             verify_resize((1, 4, 4, 4), 2, method, layout)
 
+
 def test_multibox_prior():
+    def get_ref_result(dshape, sizes=(1.0,),
+                       ratios=(1.0,), steps=(-1.0, -1.0),
+                       offsets=(0.5, 0.5), clip=True):
+        in_height = dshape[2]
+        in_width = dshape[3]
+        num_sizes = len(sizes)
+        num_ratios = len(ratios)
+        size_ratio_concat = sizes + ratios
+        steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
+        steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
+        offset_h = offsets[0]
+        offset_w = offsets[1]
+
+        oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
+        dtype = "float32"
+        np_out = np.zeros(oshape).astype(dtype)
+
+        for i in range(in_height):
+            center_h = (i + offset_h) * steps_h
+            for j in range(in_width):
+                center_w = (j + offset_w) * steps_w
+                for k in range(num_sizes + num_ratios - 1):
+                    w = size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else \
+                        size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0
+                    h = size_ratio_concat[k] / 2.0 if k < num_sizes else \
+                        size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0
+                    count = i * in_width * (num_sizes + num_ratios - 1) + j * (num_sizes + num_ratios - 1) + k
+                    np_out[0][count][0] = center_w - w
+                    np_out[0][count][1] = center_h - h
+                    np_out[0][count][2] = center_w + w
+                    np_out[0][count][3] = center_h + h
+        if clip:
+            np_out = np.clip(np_out, 0, 1)
+
+        return np_out
+
+    def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,),
+                              ratios=(1.0,), steps=(-1.0, -1.0),
+                              offsets=(0.5, 0.5), clip=True, check_size=False,
+                              check_type_only=False):
+
+        z = relay.vision.multibox_prior(x, sizes, ratios, steps, offsets, clip)
+        zz = relay.ir_pass.infer_type(z)
+        if check_size:
+            assert "sizes=" in z.astext()
+        assert zz.checked_type == relay.TensorType(
+            (1, dshape[2] * dshape[3] * (len(sizes) + len(ratios) - 1), 4),
+            "float32")
+
+        if check_type_only:
+            return
+
+        data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
+        func = relay.Function([x], z)
+        func = relay.ir_pass.infer_type(func)
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
     sizes = (0.3, 1.5, 0.7)
     ratios = (1.3, 2.4)
     steps = (2.0, 1.5)
     offsets = (0.2, 0.3)
-    clip = True
-
-    n, c, h, w = tvm.var("n"), 3, 56, 56
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-
-    z = relay.vision.multibox_prior(x, sizes, ratios,
-                                    steps, offsets, clip)
-    assert "sizes=" in z.astext()
-    zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.TensorType(
-        (1, h * w * (len(sizes) + len(ratios) - 1), 4), "float32")
+    dshape = (1, 3, 56, 56)
+    ref_res = get_ref_result(dshape, sizes, ratios, steps, offsets)
+    x = relay.var("x", relay.TensorType(dshape, "float32"))
+    verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets,
+                          check_size=True)
+    y = relay.var("y", relay.TensorType((tvm.var("n"), 3, 56, 56), "float32"))
+    verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets,
+                          check_size=True, check_type_only=True)
 
-    n, c, h, w = tvm.var("n"), 24, 32, 32
-    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
-    z = relay.vision.multibox_prior(x)
-    zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.TensorType(
-        (1, h * w, 4), "float32")
+    dshape = (1, 24, 32, 32)
+    ref_res = get_ref_result(dshape, clip=False)
+    x = relay.var("x", relay.TensorType(dshape, "float32"))
+    verify_multibox_prior(x, dshape, ref_res, clip=False)
+    y = relay.var("y", relay.TensorType((tvm.var("n"), 24, 32, 32), "float32"))
+    verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True)
 
 
 def test_nms():
-    num_anchors = 60
-
-    overlap_threshold = 0.5
-    force_suppress = True
-    nms_topk = 10
-
-    n = tvm.var("n")
-    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
-    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))
+    def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count,
+                   overlap_threshold=0.5, force_suppress=False, topk=-1,
+                   check_type_only=False):
+        x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
+        x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int"))
+        z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk)
+        assert "overlap_threshold" in z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType(dshape, "float32")
 
-    z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, nms_topk)
+        if check_type_only:
+            return
 
-    assert "overlap_threshold" in z.astext()
-    zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(
-        (n, num_anchors, 6), "float32")
+        func = relay.Function([x0, x1], z)
+        func = relay.ir_pass.infer_type(func)
+        ctx_list = [("llvm", tvm.cpu(0))]
+        for target, ctx in ctx_list:
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(x0_data, x1_data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(x0_data, x1_data)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
 
-    n = tvm.var("n")
-    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
-    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))
+    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
+                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
+                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
+    np_valid_count = np.array([4]).astype("int32")
+    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
+                           [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79],
+                           [-1, -1, -1, -1, -1, -1]]])
+    num_anchors = 5
 
-    z = relay.vision.nms(x0, x1)
+    dshape = (tvm.var("n"), num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               force_suppress=True, topk=2, check_type_only=True)
+    dshape = (1, num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               force_suppress=True, topk=2, check_type_only=False)
 
-    zz = relay.ir_pass.infer_type(z)
-    assert zz.checked_type == relay.ty.TensorType(
-        (n, num_anchors, 6), "float32")
+    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
+                           [1, 0.7, 30, 60, 50, 80], [-1, 0.9, 35, 61, 52, 79],
+                           [-1, -1, -1, -1, -1, -1]]])
+    dshape = (tvm.var("n"), num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               check_type_only=True)
+    dshape = (1, num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               topk=3)
 
 
 def test_multibox_transform_loc():
     def test_default_value():
-        num_anchors = 5
-        num_classes = 5
+        num_anchors = 3
+        num_classes = 3
+
+        np_cls_prob = np.array(
+            [[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45],
+              [0.7, 0.1, 0.2]]]).astype("float32")
+        np_loc_preds = np.array(
+            [[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4,
+              -0.8]]).astype("float32")
+        np_anchors = np.array(
+            [[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2],
+              [1.2, 1.2, 1.5, 1.5]]]).astype("float32")
+
+        expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
+                                     [0, 0.44999999, 1, 1, 1, 1],
+                                     [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])
+
 
         cls_prob = relay.var(
             "cls_prob",
@@ -115,16 +216,31 @@ def test_default_value():
         anchors = relay.var(
             "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
 
-        ret = relay.vision.multibox_transform_loc(
+        mtl = relay.vision.multibox_transform_loc(
             cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors)
-        ret = relay.ir_pass.infer_type(ret)
+        ret = relay.ir_pass.infer_type(mtl.astuple())
         ref_type = relay.ty.TupleType(
             tvm.convert([
                 relay.ty.TensorType((1, num_anchors, 6), "float32"),
                 relay.ty.TensorType((1, ), "int")
             ]))
+
         assert ret.checked_type == ref_type
 
+        nms = relay.vision.nms(mtl[0], mtl[1])
+        func = relay.Function([cls_prob, loc_pred, anchors], nms)
+        func = relay.ir_pass.infer_type(func)
+        ctx_list = [("llvm", tvm.cpu(0))]
+        for target, ctx in ctx_list:
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds,
+                                            np_anchors)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), expected_np_out, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(np_cls_prob, np_loc_preds,
+                                            np_anchors)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), expected_np_out, rtol=1e-5)
+
     def test_threshold():
         num_anchors = 5
         num_classes = 5
@@ -137,15 +253,15 @@ def test_threshold():
         anchors = relay.var(
             "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
         threshold = 0.02
-        variance = (0.2, 0.2, 0.3, 0.3)
+        variances = (0.2, 0.2, 0.3, 0.3)
 
         ret = relay.vision.multibox_transform_loc(
             cls_prob=cls_prob,
             loc_pred=loc_pred,
             anchor=anchors,
             threshold=threshold,
-            variance=variance)
-        ret = relay.ir_pass.infer_type(ret)
+            variances=variances)
+        ret = relay.ir_pass.infer_type(ret.astuple())
         ref_type = relay.ty.TupleType(
             tvm.convert([
                 relay.ty.TensorType((n, num_anchors, 6), "float32"),
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index de9ff90ae26b..edfb0e467e1f 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -78,6 +78,28 @@ def get_const_int(expr):
     return int(expr.value)
 
 
+def get_const_float(expr):
+    """Verifies expr is a floating point and get the constant value.
+
+    Parameters
+    ----------
+    expr : tvm.Expr or float
+        The input expression.
+
+    Returns
+    -------
+    out_value : float
+        The output.
+    """
+    if isinstance(expr, float):
+        return float(expr)
+    if not isinstance(expr, tvm.expr.FloatImm):
+        expr = tvm.ir_pass.Simplify(expr)
+    if not isinstance(expr, tvm.expr.FloatImm):
+        raise ValueError("Expect value to be constant float")
+    return float(expr.value)
+
+
 def equal_const_int(expr, value):
     """Returns if expr equals value.
 
@@ -120,6 +142,26 @@ def get_const_tuple(in_tuple):
     return out_tuple
 
 
+def get_float_tuple(in_tuple):
+    """Verifies input tuple is FloatImm, returns tuple of float.
+
+    Parameters
+    ----------
+    in_tuple : tuple of Expr
+        The input.
+
+    Returns
+    -------
+    out_tuple : tuple of float
+        The output.
+    """
+    out_tuple = ()
+    for elem in in_tuple:
+        value = get_const_float(elem)
+        out_tuple = out_tuple + (value, )
+    return out_tuple
+
+
 def simplify(expr):
     """Simplify the expression if it is Expr, directly return if it is int.
 
diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py
index 3f5f89a632b6..9afa113959f0 100644
--- a/tutorials/nnvm/deploy_ssd.py
+++ b/tutorials/nnvm/deploy_ssd.py
@@ -5,7 +5,7 @@
 
 This article is an introductory tutorial to deploy SSD models with TVM.
 We will use mxnet pretrained SSD model with Resnet50 as body network and
-convert it to NNVM graph.
+convert it to NNVM graph;
 """
 import os
 import zipfile
@@ -16,6 +16,7 @@
 
 from nnvm import compiler
 from nnvm.frontend import from_mxnet
+from tvm import relay
 from tvm.contrib.download import download
 from tvm.contrib import graph_runtime
 from mxnet.model import load_checkpoint
@@ -58,7 +59,7 @@
 inference_symbol_folder = "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
 inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \
                        "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip"
-            
+
 dir = "ssd_model"
 if not os.path.exists(dir):
     os.makedirs(dir)
@@ -77,13 +78,31 @@
 zip_ref.close()
 
 ######################################################################
-# Convert and compile model with NNVM for CPU.
+# Convert and compile model with NNVM or Relay for CPU.
 
 sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder))
 _, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0)
-net, params = from_mxnet(sym, arg_params, aux_params)
-with compiler.build_config(opt_level=3):
-    graph, lib, params = compiler.build(net, target, {"data": dshape}, params=params)
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-f", "--frontend",
+    help="Frontend for compilation, nnvm or relay",
+    type=str,
+    default="nnvm")
+args = parser.parse_args()
+if args.frontend == "relay":
+    net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params, aux_params=aux_params)
+    with relay.build_config(opt_level=3):
+        graph, lib, params = relay.build(net, target, params=params)
+elif args.frontend == "nnvm":
+    net, params = from_mxnet(sym, arg_params, aux_params)
+    with compiler.build_config(opt_level=3):
+        graph, lib, params = compiler.build(
+            net, target, {"data": dshape}, params=params)
+else:
+    parser.print_help()
+    parser.exit()
 
 ######################################################################
 # Create TVM runtime and do inference
@@ -141,4 +160,3 @@ def display(img, out, thresh=0.5):
 
 image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 display(image, tvm_output.asnumpy()[0], thresh=0.45)
-

From 9aabf9643d3c7aaa422d9f61a61968fc7d859967 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Tue, 25 Dec 2018 10:32:51 -0800
Subject: [PATCH 524/529] Add a the ability to trigger debugging in the
 interpreter without recompiling (#2219)

---
 include/tvm/relay/attrs/debug.h   | 29 ++++++++++++++
 include/tvm/relay/op_attr_types.h |  5 +++
 python/tvm/relay/__init__.py      |  7 +---
 python/tvm/relay/debug.py         | 25 ++++++++++++
 python/tvm/relay/op/__init__.py   |  2 +
 python/tvm/relay/op/op.py         | 16 ++++++++
 src/relay/backend/interpreter.cc  | 63 ++++++++++++++++++++++++++++++-
 src/relay/op/debug.cc             | 54 ++++++++++++++++++++++++++
 tests/python/relay/test_debug.py  | 32 ++++++++++++++++
 9 files changed, 226 insertions(+), 7 deletions(-)
 create mode 100644 include/tvm/relay/attrs/debug.h
 create mode 100644 python/tvm/relay/debug.py
 create mode 100644 src/relay/op/debug.cc
 create mode 100644 tests/python/relay/test_debug.py

diff --git a/include/tvm/relay/attrs/debug.h b/include/tvm/relay/attrs/debug.h
new file mode 100644
index 000000000000..8243dc0a3b91
--- /dev/null
+++ b/include/tvm/relay/attrs/debug.h
@@ -0,0 +1,29 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/debug.h
+ * \brief Auxiliary attributes for debug operators.
+ */
+#ifndef TVM_RELAY_ATTRS_DEBUG_H_
+#define TVM_RELAY_ATTRS_DEBUG_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Options for the debug operators.
+ */
+struct DebugAttrs : public tvm::AttrsNode<DebugAttrs> {
+  EnvFunc debug_func;
+
+  TVM_DECLARE_ATTRS(DebugAttrs, "relay.attrs.DebugAttrs") {
+    TVM_ATTR_FIELD(debug_func)
+        .describe("The function to use when debugging.");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_DEBUG_H_
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index 1f37e9947bb8..c2839a471d20 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -48,6 +48,11 @@ using TOpPattern = int;
  */
 using TOpIsStateful = bool;
 
+/*!
+ * \brief Mark the operator as non-computational.
+ */
+using TNonComputational = bool;
+
 /*!
  * \brief Computation description interface.
  *
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 69180837b724..572589921dcf 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -10,6 +10,7 @@
 from . import ir_pass
 from .build_module import build, build_config, create_executor
 from . import parser
+from . import debug
 
 # Root operators
 from .op import Op
@@ -63,11 +64,5 @@
 const = expr.const
 bind = expr.bind
 
-# pylint: disable=unused-argument
-@register_func("relay.debug")
-def _debug(*args):
-    import pdb
-    pdb.set_trace()
-
 # Parser
 fromtext = parser.fromtext
diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py
new file mode 100644
index 000000000000..00ad7b4401b0
--- /dev/null
+++ b/python/tvm/relay/debug.py
@@ -0,0 +1,25 @@
+# pylint: disable=wildcard-import, redefined-builtin, invalid-name
+"""The Relay IR namespace containing the IR definition and compiler."""
+from __future__ import absolute_import
+from .base import NodeBase, register_relay_node
+from ..api import register_func
+
+@register_relay_node
+class InterpreterState(NodeBase):
+    pass
+
+# pylint: disable=unused-argument
+def _debugger_init(expr, stack):
+    import pdb
+    pdb.set_trace()
+
+# pylint: disable=unused-argument
+@register_func("relay.debug")
+def _debug(*args):
+    _, _, _, ist = args
+    print("Relay Debugger")
+    print("  You can manipulate the expression under evaluation with the name `expr`.")
+    print("  You can manipulate the call stack with the name `stack`.")
+    print("--------------")
+    print("--------------")
+    _debugger_init(ist.current_expr, ist.stack)
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 4a6dfd9f7335..63baa5128bb9 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -3,6 +3,7 @@
 # operator defs
 from .op import get, register, register_schedule, register_compute, register_alter_op_layout, \
     Op
+from .op import debug
 
 # Operators
 from .reduce import *
@@ -13,6 +14,7 @@
 from . import vision
 from . import op_attrs
 
+
 # operator registry
 from . import _tensor
 from . import _transform
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index dd3af9c44e42..b027211acf47 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -8,6 +8,7 @@
 from ..expr import Expr
 from ...api import register_func
 from ...build_module import lower, build
+from . import _make
 
 @register_relay_node
 class Op(Expr):
@@ -183,3 +184,18 @@ def schedule_injective(attrs, outputs, target):
     """Generic schedule for binary broadcast."""
     with target:
         return topi.generic.schedule_injective(outputs)
+
+__DEBUG_COUNTER__ = 0
+
+def debug(expr, debug_func=None):
+    """The main entry point to the debugger."""
+    global __DEBUG_COUNTER__
+
+    if debug_func:
+        name = "debugger_func{}".format(__DEBUG_COUNTER__)
+        register_func(name, debug_func)
+        __DEBUG_COUNTER__ += 1
+    else:
+        name = ''
+
+    return _make.debug(expr, name)
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 33d06e9c6c28..734180c53759 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -8,6 +8,7 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/interpreter.h>
 #include <tvm/relay/pass.h>
+#include <tvm/relay/attrs/debug.h>
 #include "compile_engine.h"
 
 namespace tvm {
@@ -124,13 +125,48 @@ struct Stack {
   };
 };
 
+/*! \brief A representation of the interpreter state which can be passed back to Python. */
+class InterpreterState;
+
+/*! \brief A container capturing the state of the interpreter. */
+class InterpreterStateNode : public Node {
+ public:
+  using Frame = tvm::Map<Var, Value>;
+  using Stack = tvm::Array<Frame>;
+
+  /*! \brief The current expression under evaluation. */
+  Expr current_expr;
+
+  /*! \brief The call stack of the interpreter. */
+  Stack stack;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("current_expr", &current_expr);
+    v->Visit("stack", &stack);
+  }
+
+  TVM_DLL static InterpreterState make(Expr current_expr, Stack stack);
+
+  static constexpr const char* _type_key = "relay.InterpreterState";
+  TVM_DECLARE_NODE_TYPE_INFO(InterpreterStateNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(InterpreterState, InterpreterStateNode, NodeRef);
+
+InterpreterState InterpreterStateNode::make(Expr current_expr, Stack stack) {
+  NodePtr<InterpreterStateNode> n = make_node<InterpreterStateNode>();
+  n->current_expr = std::move(current_expr);
+  n->stack = std::move(stack);
+  return InterpreterState(n);
+}
+
 // NOTE: the current interpreter assumes A-normal form.
 // which is better for execution.
 //
 // It will run duplicated computations when taking program that
 // contains DAG in dataflow-form.
-// Conversion to ANF is recommended before running the interpretation.
 //
+// Conversion to ANF is recommended before running the interpretation.
 class Interpreter :
       public ExprFunctor<Value(const Expr& n)> {
  public:
@@ -209,6 +245,21 @@ class Interpreter :
 
   Value InvokePrimitiveOp(Function func,
                           const Array<Value>& args) {
+    auto call_node = func->body.as<CallNode>();
+
+    if (call_node && call_node->op == Op::Get("debug")) {
+      auto dattrs = call_node->attrs.as<DebugAttrs>();
+      auto interp_state = this->get_state(call_node->args[0]);
+
+      if (dattrs->debug_func.defined()) {
+        dattrs->debug_func(interp_state);
+      } else {
+        RELAY_DEBUG(interp_state);
+      }
+
+      return args[0];
+    }
+
     // Marshal the arguments.
     // Handle tuple input/output by flattening them.
     size_t arg_len = 0;
@@ -381,6 +432,16 @@ class Interpreter :
     }
   }
 
+  InterpreterState get_state(Expr e = Expr()) const {
+    InterpreterStateNode::Stack stack;
+    for (auto fr : this->stack_.frames) {
+      InterpreterStateNode::Frame frame = fr.locals;
+      stack.push_back(frame);
+    }
+    auto state = InterpreterStateNode::make(e, stack);
+    return state;
+  }
+
  private:
   // module
   Module mod_;
diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc
new file mode 100644
index 000000000000..4c9b0a5ca83e
--- /dev/null
+++ b/src/relay/op/debug.cc
@@ -0,0 +1,54 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nn.cc
+ * \brief Property def of nn operators.
+ */
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/debug.h>
+#include <topi/elemwise.h>
+#include <vector>
+#include "./type_relations.h"
+#include "./op_common.h"
+#include "./layout.h"
+
+namespace tvm {
+namespace relay {
+
+Array<Tensor> DebugCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  return Array<Tensor>{ topi::identity(inputs[0]) };
+}
+
+RELAY_REGISTER_OP("debug")
+.describe(R"code(Enter the interpreter's debugger.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("program", "Tuple", "The program to execute before debugging.")
+.set_support_level(1)
+.add_type_rel("Debug", IdentityRel)
+.set_attr<TOpPattern>("TOpPattern", kOpaque)
+.set_attr<FTVMCompute>("FTVMCompute", DebugCompute);
+
+Expr MakeDebug(Expr expr, std::string name) {
+  auto dattrs = make_node<DebugAttrs>();
+  if (name.size() > 0) {
+    dattrs->debug_func = EnvFunc::Get(name);
+  } else {
+    dattrs->debug_func = EnvFunc();
+  }
+  static const Op& op = Op::Get("debug");
+  return CallNode::make(op, {expr}, Attrs(dattrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.debug")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeDebug, args, rv);
+  });
+
+}  // namespace relay
+}  // namespace tvm
+
diff --git a/tests/python/relay/test_debug.py b/tests/python/relay/test_debug.py
new file mode 100644
index 000000000000..3463e2916147
--- /dev/null
+++ b/tests/python/relay/test_debug.py
@@ -0,0 +1,32 @@
+from tvm.relay import var, const, create_executor
+from tvm.relay.op import debug
+
+
+_test_debug_hit = False
+
+def test_debug():
+    global _test_debug_hit
+    ex = create_executor()
+    x = var('x', shape=(), dtype='int32')
+    _test_debug_hit = False
+    def did_exec(x):
+        global _test_debug_hit
+        _test_debug_hit = True
+    prog = debug(x, debug_func=did_exec)
+    result = ex.evaluate(prog, { x: const(1) })
+    assert _test_debug_hit
+    assert result.asnumpy() == 1
+
+def test_debug_with_expr():
+    global _test_debug_hit
+    _test_debug_hit = False
+    ex = create_executor()
+    x = var('x', shape=(), dtype='int32')
+    _test_debug_hit = False
+    def did_exec(x):
+        global _test_debug_hit
+        _test_debug_hit = True
+    prog = debug(x + x * x, debug_func=did_exec)
+    result = ex.evaluate(prog, { x: const(2) })
+    assert _test_debug_hit
+    assert result.asnumpy() == 6

From 9c48af8fd7e27f9381f02926cd34377fd261e5ea Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 26 Dec 2018 06:10:06 +0800
Subject: [PATCH 525/529] [TOPI][CUDA] Add reorder option in int8 conv2d
 (#2327)

---
 topi/python/topi/cuda/conv2d_int8.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index ef2cb3706bf2..637c5de35513 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -233,11 +233,16 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output):
 
     s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
 
+    cfg.define_reorder("reorder_inner", [rco, ryo, rxo], policy="all")
+    cfg["reorder_inner"].apply(s, conv, [rco, ryo, rxo])
+    cfg["reorder_inner"].apply(s, conv, [rci, ryi, rxi])
+
     _, rc_block = s[conv].split(rc_block, factor=4)
     s[conv].tensorize(rc_block, _dp4a)
 
-    s[AA].compute_at(s[conv], rxo)
-    s[WW].compute_at(s[conv], rxo)
+    cache_loc = [rco, ryo, rxo][cfg["reorder_inner"].perm[-1]]
+    s[AA].compute_at(s[conv], cache_loc)
+    s[WW].compute_at(s[conv], cache_loc)
 
     # cooperative fetching
     for load in [AA, WW]:

From fcb0981929efc94ac06ee7dbabb999fbdde7f64f Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 26 Dec 2018 12:25:51 +0800
Subject: [PATCH 526/529] [RELAY] Inline scalar compute (#2335)

---
 src/relay/backend/compile_engine.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 42394955cc64..e36d916f5498 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -84,6 +84,9 @@ class ScheduleGetter :
     CHECK(master_op_.defined());
     Schedule schedule = fschedule[master_op_](
         master_attrs_, cache_node->outputs, target_);
+    for (const auto& scalar : scalars_) {
+      schedule[scalar].compute_inline();
+    }
     return std::make_pair(schedule, cfunc);
   }
 
@@ -123,6 +126,7 @@ class ScheduleGetter :
           return tvm::Expr();
         }
       });
+    scalars_.push_back(value->op);
     return {value};
   }
 
@@ -216,6 +220,7 @@ class ScheduleGetter :
   int master_op_pattern_{0};
   std::ostringstream readable_name_stream_;
   std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
+  Array<Operation> scalars_;
 };
 
 

From 97dd8301c1125e3b7e6d20adc60d24b0a8a1e39e Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Thu, 27 Dec 2018 01:49:17 +0800
Subject: [PATCH 527/529] [NNVM] Fix dtype of output of mean. (#2334)

dtype of count is the same as dtype of inputs[0] when created, but its type may
  change when multiplied by inputs[0]->shape[i]. Which causes dtype of
  output is not same as dtype of input.
---
 nnvm/src/top/tensor/reduce.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 007a3cc6e3fb..105765fccc61 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -352,7 +352,7 @@ Example::
 
     Expr count = make_const(inputs[0]->dtype, 1);
     for (auto& i : r_axes) {
-      count *= inputs[0]->shape[i];
+      count *= cast(inputs[0]->dtype, inputs[0]->shape[i]);
     }
 
     return Array<Tensor>{

From 7e9e45d55603e7d536069b09e0ad1995f3d6a87f Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Wed, 26 Dec 2018 15:57:07 -0800
Subject: [PATCH 528/529] [Relay][OP] Add cast op (#2319)

* Add cast op
* Rename dtype_cast to cast
* Add additional safety check for String2TVMType
* Add missing relay op docs
---
 docs/langref/relay_op.rst            |  7 +++++++
 include/tvm/runtime/packed_func.h    |  4 +++-
 python/tvm/relay/expr.py             |  2 +-
 python/tvm/relay/op/transform.py     | 20 ++++++++++++++++++++
 src/relay/op/tensor/transform.cc     |  2 +-
 tests/python/relay/test_op_level3.py |  5 +++++
 6 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 5c3ab8b1ffda..f053165470fe 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -133,6 +133,9 @@ This level enables additional math and transform operators.
    :nosignatures:
 
    tvm.relay.image.resize
+   tvm.relay.vision.multibox_prior
+   tvm.relay.vision.multibox_transform_loc
+   tvm.relay.vision.nms
 
 
 **Level 10: Temporary Operators**
@@ -160,6 +163,7 @@ Level 1 Definitions
 .. autofunction:: tvm.relay.mod
 .. autofunction:: tvm.relay.tanh
 .. autofunction:: tvm.relay.concatenate
+.. autofunction:: tvm.relay.expand_dims
 .. autofunction:: tvm.relay.nn.softmax
 .. autofunction:: tvm.relay.nn.log_softmax
 .. autofunction:: tvm.relay.nn.relu
@@ -236,6 +240,9 @@ Level 4 Definitions
 Level 5 Definitions
 -------------------
 .. autofunction:: tvm.relay.image.resize
+.. autofunction:: tvm.relay.vision.multibox_prior
+.. autofunction:: tvm.relay.vision.multibox_transform_loc
+.. autofunction:: tvm.relay.vision.nms
 
 
 Level 10 Definitions
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 0aeb7f2b1513..1e5265c07959 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -946,9 +946,11 @@ inline TVMType String2TVMType(std::string s) {
   char* xdelim;  // emulate sscanf("%ux%u", bits, lanes)
   uint8_t bits = static_cast<uint8_t>(strtoul(scan, &xdelim, 10));
   if (bits != 0) t.bits = bits;
+  char* endpt = xdelim;
   if (*xdelim == 'x') {
-    t.lanes = static_cast<uint16_t>(strtoul(xdelim + 1, nullptr, 10));
+    t.lanes = static_cast<uint16_t>(strtoul(xdelim + 1, &endpt, 10));
   }
+  CHECK(endpt == s.c_str() + s.length()) << "unknown type " << s;
   return t;
 }
 
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index e0c1f68ad431..b96111083bce 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -49,7 +49,7 @@ def astype(self, dtype):
         result : tvm.relay.Expr
             The result expression.
         """
-        return _make.dtype_cast(self, dtype)
+        return _make.cast(self, dtype)
 
     def __add__(self, other):
         if isinstance(other, Expr):
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 2791eaf7d9db..bc0a42d6ab30 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -4,6 +4,26 @@
 from ..expr import TupleWrapper
 
 
+def cast(data, dtype):
+    """Cast input tensor to data type.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    dtype: str
+        The target data type
+
+    Returns
+    -------
+    result : relay.Expr
+        The casted result.
+    """
+    from .. import _make as _relay_make
+    return _relay_make.cast(data, dtype)
+
+
 def expand_dims(data, axis, num_newaxis=1):
     """Insert `num_newaxis` axises at the position given by `axis`.
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index eb8b4f13fb3f..704324533185 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -61,7 +61,7 @@ Expr MakeCast(Expr data,
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_API("relay._make.dtype_cast")
+TVM_REGISTER_API("relay._make.cast")
 .set_body([](const TVMArgs& args, TVMRetValue* rv) {
     runtime::detail::unpack_call<Expr, 2>(MakeCast, args, rv);
 });
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 0731ecfef40a..31e87ef04856 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -46,6 +46,11 @@ def test_cast():
     assert "dtype=" in yy.astext()
     assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
 
+    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+    y = relay.cast(x, "int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert "dtype=" in yy.astext()
+    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
 
 def test_clip():
     a = relay.var("a", relay.TensorType((10, 4), "float32"))

From 8d9156907c1042433fb9374021f3534c71499947 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 26 Dec 2018 19:20:35 -0800
Subject: [PATCH 529/529] [COMMUNITY] @srkreddy1238 -> Committer (#2339)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 964b700392b0..23d22686705b 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ We do encourage everyone to work anything they are interested in.
 - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 - [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
+- [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
 - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
 - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
 - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi